In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re, string, snowballstemmer
import contractions

In [3]:
ds = load_dataset("sh0416/ag_news")

In [4]:
ds_train = ds['train']
ds_test = ds['test']

In [5]:
type(ds_train), type(ds_test)

(datasets.arrow_dataset.Dataset, datasets.arrow_dataset.Dataset)

In [6]:
df_train = ds_train.to_pandas()
df_test = ds_test.to_pandas()

In [7]:
type(df_train), type(df_test)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [8]:
df_train.shape, df_test.shape

((120000, 3), (7600, 3))

In [9]:
df = pd.concat([df_train, df_test])

In [10]:
df.shape

(127600, 3)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 127600 entries, 0 to 7599
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   label        127600 non-null  int64 
 1   title        127600 non-null  object
 2   description  127600 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.9+ MB


In [12]:
df.head()

Unnamed: 0,label,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [13]:
df.isnull().sum()

label          0
title          0
description    0
dtype: int64

In [14]:
df.dropna(inplace=True)

In [15]:
df.duplicated().sum()

np.int64(0)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
df.sample(5)

Unnamed: 0,label,title,description
12772,2,"Mariners 7, Royals 5",Bret Boone hit a go-ahead two-run homer in the...
107729,3,8 Accused of Inflating Kmart Profit,The Securities and Exchange Commission yesterd...
11494,2,No Mistaking Identity,"LaVar Arrington, Washington D.C.'s most popula..."
34745,4,Sun: We've turned over a new leaf,The server giant failed to deliver what custom...
2997,1,"Pakistan Ups Security, Shi #39;ites Mourn Bomb...",Pakistan beefed up security Saturday as minori...


In [18]:
df = df.drop('title', axis = 1)

In [19]:
df.sample(5)

Unnamed: 0,label,description
68617,2,"quot;What a Choke, #39; #39; quot;Hell Freez..."
22546,4,IBM plans to announce today that it is getting...
105009,2,ON-LOAN Liverpool striker El Hadji Diouf is fa...
83399,2,Minnesota Vikings receiver Randy Moss will sit...
15546,4,"Reuters - Apple Computer unveiled, after a\two..."


In [20]:
X = df['description']
y = df['label']

In [21]:
X.shape, y.shape

((127600,), (127600,))

In [22]:
PUNCTUATION_TRANSLATOR = str.maketrans('','',string.punctuation)

In [23]:
def normalize_document(document):
    document = document.lower()
    document = contractions.fix(document)
    document = document.translate(PUNCTUATION_TRANSLATOR)
    document = " ".join(document.split())
    return document

In [24]:
X = X.apply(normalize_document)

In [25]:
X.head()

0    reuters shortsellers wall streets dwindlingban...
1    reuters private investment firm carlyle groupw...
2    reuters soaring crude prices plus worriesabout...
3    reuters authorities have halted oil exportflow...
4    afp tearaway world oil prices toppling records...
Name: description, dtype: object

In [26]:
y.head()

0    3
1    3
2    3
3    3
4    3
Name: label, dtype: int64

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
normalized_df = pd.DataFrame({'X':X, 'y': y})

In [30]:
normalized_df.head()

Unnamed: 0,X,y
0,reuters shortsellers wall streets dwindlingban...,3
1,reuters private investment firm carlyle groupw...,3
2,reuters soaring crude prices plus worriesabout...,3
3,reuters authorities have halted oil exportflow...,3
4,afp tearaway world oil prices toppling records...,3


In [39]:
# train, temp = train_test_split(normalized_df, test_size=0.3, random_state=42, stratify=df['y']) --------- Attempt 1 : to Split train and temp (To further split to val and test)
train, temp = train_test_split(normalized_df, test_size=0.3, random_state=42, stratify=normalized_df['y'])

In [40]:
train.shape, temp.shape

((89320, 2), (38280, 2))

In [41]:
# val, test = train_test_split(train, test_size=0.5, random_state=42, stratify=train['y']) -------- Attempt 1 : to split val and test -> wrong value passed to split
val, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['y'])

In [43]:
train.shape, val.shape, test.shape

((89320, 2), (19140, 2), (19140, 2))

In [45]:
train.to_csv('train_data.csv', index=False)
val.to_csv('val_data.csv', index=False)
test.to_csv('test_data.csv', index=False)