# Dataset Sampling (Different Sizes)

We refer to the different dataset sizes as **30K, 10K, 5K** and **1K**; these are also the names of the
respective folders inside the `dataset` folder.

_**Note:** The 30K dataset is sampled from the original dataset, 
keeping only content with a word count greater than 20 and less than 1000._

In [3]:
import os
import pandas as pd
from glob import glob
from tqdm.auto import tqdm

In [5]:
from preprocessor import *

In [6]:
# Get sample from 'df' of 'size' with 'seed'
def get_sample(df, size, seed, show_info=True):
    """Get a class-balanced sample from a Pandas dataframe.

    The sampled dataframe has exactly ``size`` rows for every distinct value
    of the ``category`` column. Classes appear in order of their first
    occurrence in ``df`` and the result carries a fresh 0..N-1 index.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to be sampled from. Must contain a ``category`` column.
    size : int
        Number of rows to sample from each class.
    seed : int
        Sampling randomness (seed) value, applied to every class.
    show_info : bool, optional
        Show debug info, by default True

    Returns
    -------
    pd.DataFrame
        Sampled dataframe. Empty (same columns as ``df``) when ``df`` has no
        categories.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a class holds fewer than
        ``size`` rows (sampling is done without replacement).
    """
    # Sample each class on its own, then concatenate once instead of growing
    # the frame inside the loop.
    samples = [
        df[df['category'] == category].sample(n=size, random_state=seed)
        for category in df['category'].unique()
    ]

    if samples:
        new_df = pd.concat(samples).reset_index(drop=True)
    else:
        # No categories at all (empty input): return an empty frame with the
        # same columns rather than failing on a None placeholder.
        new_df = df.iloc[0:0].reset_index(drop=True)

    if show_info:
        print(f'Total Rows: {len(new_df)}, for {size} rows per class.')

    return new_df

In [7]:
# Root folder of all dataset sizes, resolved against the current working directory.
current_dir = os.getcwd()
dataset_path = os.path.join(current_dir, 'dataset')

In [8]:
# Collect every CSV file that lives in the 30K folder.
pattern_30K = f'{dataset_path}/30K/*.csv'
paths = glob(pattern_30K)

In [9]:
# Display the CSV paths found in the 30K folder.
paths

['/home/suyogyat/research/dataset/30K/np20ng_30K_train_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_train.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K_test.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_test_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_valid.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_train_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K_test_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_valid_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K_valid_nosw.csv',
 '/home/suyogyat/research/dataset/30K/np16ng_30K_test.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K_valid.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K_train.csv',
 '/home/suyogyat/research/dataset/30K/np20ng_30K.csv']

### Reading the 30K datasets with 20 classes and 16 classes

In [10]:
# Load the full 30K datasets (20-class and 16-class variants).
# Paths are built from dataset_path so the notebook is not tied to one
# user's home directory.
df20 = pd.read_csv(os.path.join(dataset_path, '30K', 'np20ng_30K.csv'))
df16 = pd.read_csv(os.path.join(dataset_path, '30K', 'np16ng_30K.csv'))
len(df20), len(df16)

(30000, 24000)

### Sampling each dataset at 3 smaller sizes

In [11]:
# Rows drawn per class for the 10K, 5K and 1K variants (500, 250 and 50).
per_class_10K, per_class_5K, per_class_1K = 10000 // 20, 5000 // 20, 1000 // 20

# 20-class dataset
df20_10K = get_sample(df20, per_class_10K, 44)
df20_5K = get_sample(df20, per_class_5K, 44)
df20_1K = get_sample(df20, per_class_1K, 44)

# 16-class dataset: same per-class counts, so the totals are 8000, 4000 and 800 rows
df16_10K = get_sample(df16, per_class_10K, 44)
df16_5K = get_sample(df16, per_class_5K, 44)
df16_1K = get_sample(df16, per_class_1K, 44)

Total Rows: 10000, for 500 rows per class.
Total Rows: 5000, for 250 rows per class.
Total Rows: 1000, for 50 rows per class.
Total Rows: 8000, for 500 rows per class.
Total Rows: 4000, for 250 rows per class.
Total Rows: 800, for 50 rows per class.


### Exporting each dataset

In [12]:
# Sampled frames keyed by dataset name, then by size folder. Dict order is the
# export order: all np20ng sizes first, then all np16ng sizes.
sampled_frames = {
    'np20ng': {'10K': df20_10K, '5K': df20_5K, '1K': df20_1K},
    'np16ng': {'10K': df16_10K, '5K': df16_5K, '1K': df16_1K},
}

for dataset_name, frames_by_size in sampled_frames.items():
    for size_name, frame in frames_by_size.items():
        # Build the target from dataset_path instead of a hard-coded home directory.
        out_dir = os.path.join(dataset_path, size_name)
        # Make sure the size folder exists before writing; harmless if
        # export_df already takes care of it.
        os.makedirs(out_dir, exist_ok=True)
        export_df(frame, os.path.join(out_dir, f'{dataset_name}_{size_name}.csv'))

Exported 10000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K.csv.
Exported 5000 rows to path /home/suyogyat/research/dataset/5K/np20ng_5K.csv.
Exported 1000 rows to path /home/suyogyat/research/dataset/1K/np20ng_1K.csv.
Exported 8000 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K.csv.
Exported 4000 rows to path /home/suyogyat/research/dataset/5K/np16ng_5K.csv.
Exported 800 rows to path /home/suyogyat/research/dataset/1K/np16ng_1K.csv.


# Train, Validation and Test Splits

The following example splits the **10K** datasets (both 20 classes and 16 classes) in the ratio 80:10:10.

The suffixes **_train, _valid** and **_test** are appended to the names of the split files.

In [13]:
from sklearn.model_selection import train_test_split

In [16]:
def split_and_export(df, dataset, size, seed=44):
    """Split a dataframe 80:10:10 (stratified by class) and export each part.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to split; its ``category`` column drives the stratification.
    dataset : str
        Dataset name used as the file-name prefix (e.g. 'np20ng').
    size : str
        Size label, used both as the sub-folder of ``dataset_path`` and in the
        file name (e.g. '10K').
    seed : int, optional
        Random state passed to both splits, by default 44

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, valid, test)`` in that order.
    """
    # First cut: 80% train, 20% held out.
    train, holdout = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df.category
    )
    # Second cut: halve the held-out part into validation and test (10% each).
    valid, test = train_test_split(
        holdout, test_size=0.5, random_state=seed, stratify=holdout.category
    )

    # Exporting into respective directories
    out_dir = os.path.join(dataset_path, size)
    for suffix, part in (('train', train), ('valid', valid), ('test', test)):
        export_df(part, os.path.join(out_dir, f'{dataset}_{size}_{suffix}.csv'))

    return train, valid, test


size = '10K'

# Same procedure for the 20-class and the 16-class dataset. After the loop,
# dataset/train/valid/test refer to the last (np16ng) split.
for dataset, frame_10K in (('np20ng', df20_10K), ('np16ng', df16_10K)):
    train, valid, test = split_and_export(frame_10K, dataset, size)

Exported 8000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_train.csv.
Exported 1000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_valid.csv.
Exported 1000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_test.csv.
Exported 6400 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_train.csv.
Exported 800 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_valid.csv.
Exported 800 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_test.csv.


In [17]:
# Show the working directory: dataset_path and the relative stop-word file
# path used in this notebook are resolved against it.
os.getcwd()

'/home/suyogyat/research'

### Removing stop words

The suffix **_nosw** is appended to the names of files whose stop words have been removed (the text is also stemmed before the stop words are dropped).

In [18]:
from nepali_stemmer.stemmer import NepStemmer
stemmer = NepStemmer()

# Load Nepali stop words
# Decode explicitly as UTF-8: the file holds Nepali text, and the platform
# default encoding is not UTF-8 everywhere.
with open('utils/nepali_stopwords.txt', encoding='utf-8') as f:
    nepali_stopwords = [word.strip() for word in f]

# Set view of the same words so each token lookup is O(1) instead of a list scan.
_nepali_stopword_set = frozenset(nepali_stopwords)

def remove_stopwords(text):
    """Stem a text and drop every token that is a Nepali stop word.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed text with stop words removed, tokens joined by single
        spaces.
    """
    # Stemming happens first, so the stop-word check runs on the stemmer's
    # output tokens.
    stemmed = stemmer.stem(text)

    # split() already yields whitespace-free tokens, so no further stripping is needed.
    return ' '.join(word for word in stemmed.split() if word not in _nepali_stopword_set)

In [19]:
# Gather the CSV files of the three smaller sizes, in the order 10K, 5K, 1K.
paths = []
for size_pattern in (
    f'{dataset_path}/10K/*.csv',
    f'{dataset_path}/5K/*.csv',
    f'{dataset_path}/1K/*.csv',
):
    paths.extend(glob(size_pattern))
paths

['/home/suyogyat/research/dataset/10K/np20ng_10K_test.csv',
 '/home/suyogyat/research/dataset/10K/np20ng_10K_train.csv',
 '/home/suyogyat/research/dataset/10K/np16ng_10K.csv',
 '/home/suyogyat/research/dataset/10K/np20ng_10K.csv',
 '/home/suyogyat/research/dataset/10K/np16ng_10K_train.csv',
 '/home/suyogyat/research/dataset/10K/np16ng_10K_valid.csv',
 '/home/suyogyat/research/dataset/10K/np16ng_10K_test.csv',
 '/home/suyogyat/research/dataset/10K/np20ng_10K_valid.csv',
 '/home/suyogyat/research/dataset/5K/np20ng_5K_valid.csv',
 '/home/suyogyat/research/dataset/5K/np16ng_5K_test.csv',
 '/home/suyogyat/research/dataset/5K/np20ng_5K_test.csv',
 '/home/suyogyat/research/dataset/5K/np16ng_5K_valid.csv',
 '/home/suyogyat/research/dataset/5K/np20ng_5K.csv',
 '/home/suyogyat/research/dataset/5K/np16ng_5K_train.csv',
 '/home/suyogyat/research/dataset/5K/np16ng_5K.csv',
 '/home/suyogyat/research/dataset/5K/np20ng_5K_train.csv',
 '/home/suyogyat/research/dataset/1K/np20ng_1K_train.csv',
 '/home/s

In [20]:
for path in paths:
    # Skip files that already had their stop words removed, so re-running
    # this cell after an earlier run does not create '*_nosw_nosw.csv' files.
    if path.endswith('_nosw.csv'):
        continue

    df = pd.read_csv(path)
    # Bracket assignment always targets the column; attribute assignment would
    # silently set a plain attribute if the column did not exist.
    df['content'] = df['content'].apply(remove_stopwords)
    # Recompute the word count, since stop-word removal shortens the content.
    df['content_word_count'] = df['content'].str.split().str.len()

    # Insert the suffix before the extension only, rather than replacing
    # every '.csv' occurrence anywhere in the path.
    root, ext = os.path.splitext(path)
    export_df(df, f'{root}_nosw{ext}')

Exported 1000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_test_nosw.csv.
Exported 8000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_train_nosw.csv.
Exported 8000 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_nosw.csv.
Exported 10000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_nosw.csv.
Exported 6400 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_train_nosw.csv.
Exported 800 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_valid_nosw.csv.
Exported 800 rows to path /home/suyogyat/research/dataset/10K/np16ng_10K_test_nosw.csv.
Exported 1000 rows to path /home/suyogyat/research/dataset/10K/np20ng_10K_valid_nosw.csv.
Exported 500 rows to path /home/suyogyat/research/dataset/5K/np20ng_5K_valid_nosw.csv.
Exported 400 rows to path /home/suyogyat/research/dataset/5K/np16ng_5K_test_nosw.csv.
Exported 500 rows to path /home/suyogyat/research/dataset/5K/np20ng_5K_test_nosw.csv.
Exported 400 rows to path /home/suyo