# Data Preparation for classifying news categories with a Convolutional Neural Network (NEW)

In [1]:
import collections
import re
from argparse import Namespace
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Central configuration for the notebook: input/output locations,
# split ratios, and the random seed used further down.
args = Namespace(**{
    # Raw AG News CSV files
    "raw_dataset_csv_1": "../data/original/ag_news_train.csv",
    "raw_dataset_csv_2": "../data/original/ag_news_test.csv",
    # Share of each category assigned to train / val / test
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    # Destination of the processed dataset (one row per article, with its split)
    "output_munged_csv": "../data/processed/News_Category_Dataset_with_splits.csv",
    # Seed for the random number generator
    "seed": 1337,
})

In [3]:
# Read raw data
# Down-sample the first file to 5,000 rows. Passing the configured seed makes
# the sample (and everything derived from it) identical after a kernel restart.
news_1 = pd.read_csv(args.raw_dataset_csv_1, header=0).sample(
    5000, random_state=args.seed
)
news_2 = pd.read_csv(args.raw_dataset_csv_2, header=0)

# Stack both files row-wise: concat keeps every row from each input and
# renumbers the index 0..n-1. The remaining steps return new frames instead of
# mutating in place, so re-running this cell always starts from the raw reads.
news = (
    pd.concat([news_1, news_2], ignore_index=True)
    # Title and description are joined into the single text field used downstream
    .assign(text=lambda frame: frame["Title"] + ". " + frame["Description"])
    .rename(columns={"Class Index": "category"})
    .drop(columns=["Title", "Description"])
)
news

Unnamed: 0,category,text
0,3,Patience Helps Build Trust in Land Deals. Deve...
1,4,Alienware #39;s 4GHz Pentium. I love overclock...
2,4,HDS Launches TagmaStore. When it comes to stor...
3,4,Ion-Propulsion Craft Reaches The Moon. Rollie ...
4,4,Dumbing Down Smart Objects. Forget about cumbe...
...,...,...
12595,1,Around the world. Ukrainian presidential candi...
12596,2,Void is filled with Clement. With the supply o...
12597,2,Martinez leaves bitter. Like Roger Clemens did...
12598,3,5 of arthritis patients in Singapore take Bext...


In [4]:
# Group the articles by category name
# Numeric class index -> human-readable category label
label_dict = {1:"World", 2:"Sports", 3:"Business", 4:"Sci/Tech"}

# Fail loudly on a class index that has no label: Series.map would otherwise
# turn it into NaN without raising.
unknown_labels = set(news["category"].unique()) - set(label_dict)
if unknown_labels:
    raise KeyError(f"Unmapped category values: {sorted(unknown_labels, key=str)}")

# Translate the whole column at once on a new frame; `news` itself is left unchanged.
news_labeled = news.assign(category=news["category"].map(label_dict))

# category label -> list of row dicts (keys: the frame's columns), in original row order
by_label = collections.defaultdict(list)
for record in news_labeled.to_dict("records"):
    by_label[record["category"]].append(record)

# Character length of every text; the maximum is printed for reference.
length_list = news["text"].map(len).tolist()
print(max(length_list))

893


In [None]:
# Create split data
# The test split receives whatever remains after train and val, so the three
# proportions have to add up to 1 for test_proportion to be honoured.
assert np.isclose(
    args.train_proportion + args.val_proportion + args.test_proportion, 1.0
), "train/val/test proportions must sum to 1"

final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_label.items()):
    # Shuffle a copy: by_label keeps its order, so re-running this cell
    # reproduces exactly the same assignment instead of reshuffling an
    # already-shuffled list.
    shuffled_items = list(item_list)
    np.random.shuffle(shuffled_items)

    n = len(shuffled_items)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)

    # Give each data point a split attribute: the first n_train items are
    # 'train', the next n_val are 'val', and the remainder (including rows left
    # over from int() truncation) is 'test'.
    for position, item in enumerate(shuffled_items):
        if position < n_train:
            split_name = 'train'
        elif position < n_train + n_val:
            split_name = 'val'
        else:
            split_name = 'test'
        # Build a new dict rather than tagging the one stored in by_label
        final_list.append({**item, 'split': split_name})

# Every grouped record must appear exactly once in the output
assert len(final_list) == sum(len(items) for items in by_label.values())

In [None]:
# Assemble the list of per-article dicts (each tagged with its 'split')
# into a single DataFrame; nothing is written to disk in this cell.
final_news = pd.DataFrame(final_list)

In [None]:
# Number of rows assigned to each split
final_news["split"].value_counts()

split
train    8818
test     1894
val      1888
Name: count, dtype: int64

In [None]:
# Preview the first five rows of the split-annotated frame
final_news.head(n=5)

Unnamed: 0,category,text,split
0,Business,Kroger's Profit Up; Price Cuts Weigh. NEW YOR...,train
1,Business,Goldman Sachs Enters Fray for Takefuji. Goldma...,train
2,Business,Finance: Losing the Right to Sue. WASHINGTON ...,train
3,Business,"Aussies battle EU over cheese, champagne. AP -...",train
4,Business,EU foreign ministers hope to break deadlock ov...,train


In [None]:
# Write munged data to CSV
# Create the destination folder first: to_csv raises an OSError when the
# parent directory is missing (e.g. on a fresh checkout).
output_path = Path(args.output_munged_csv)
output_path.parent.mkdir(parents=True, exist_ok=True)
final_news.to_csv(output_path, index=False)