# Create NEWSGROUPS Dataset

## Imports

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups

## Parameters

In [2]:
# Destination path of the training CSV written by this notebook.
TRAIN_CSV = '../../Data/NEWSGROUPS/train/newsgroups_train.csv'
# Destination path of the testing CSV written by this notebook.
TEST_CSV = '../../Data/NEWSGROUPS/test/newsgroups_test.csv'
# NOTE(review): not referenced anywhere in the visible notebook; presumably the
# train/validation split fraction consumed by downstream code — TODO confirm.
TRAIN_VAL_SPLIT_RATIO = 0.9

## Helper Functions

In [3]:
def clean_text(text, special_chars=("\n", "\t")):
    """Replace every occurrence of each special character with a space.

    Args:
        text: The string to clean.
        special_chars: Iterable of substrings to replace. Defaults to newline
            and tab. The default is an immutable tuple so it cannot be
            accidentally shared and mutated across calls.

    Returns:
        A copy of ``text`` in which each occurrence of every entry in
        ``special_chars`` has been replaced by a single ``" "``.
    """
    for char in special_chars:
        text = text.replace(char, " ")
    return text

In [7]:
def get_data_indices(subset):
    """Fetch a 20 Newsgroups subset and a shuffled index order over it.

    Args:
        subset: Forwarded as the ``subset`` argument of
            ``fetch_20newsgroups`` (this notebook passes 'train' and 'test').

    Returns:
        A ``(data_indices, newsgroups_data)`` tuple: a NumPy array containing
        a permutation of ``0 .. len(newsgroups_data.data) - 1`` and the
        fetched dataset object.
    """
    # Seed NumPy's global RNG before shuffling.
    np.random.seed(0)

    # Headers, footers and quoted replies are stripped from every post.
    newsgroups_data = fetch_20newsgroups(
        subset=subset,
        remove=("headers", "footers", "quotes"),
    )

    n_documents = len(newsgroups_data.data)
    data_indices = np.array(range(n_documents))
    np.random.shuffle(data_indices)  # shuffles in place

    return data_indices, newsgroups_data

In [5]:
def get_inputs(subset):
    """Build parallel lists of cleaned texts and top-level topic labels.

    Documents are visited in the shuffled order returned by
    ``get_data_indices``. Each text has newlines and tabs replaced by spaces,
    and each label is the part of the target name before its first ``"."``.
    Documents whose cleaned text or label is empty are dropped.

    Args:
        subset: Forwarded to ``get_data_indices``.

    Returns:
        A ``(strings, labels)`` tuple of equal-length lists.
    """
    np.random.seed(0)
    data_indices, newsgroups_data = get_data_indices(subset)

    strings = []
    labels = []
    for idx in data_indices:
        raw_text = newsgroups_data.data[idx]
        topic = newsgroups_data.target[idx]
        # Keep only the leading component of the dotted target name.
        label = newsgroups_data['target_names'][topic].split(".")[0]
        txt = clean_text(raw_text, special_chars=["\n", "\t"])

        # Skip documents that ended up with no text or no label.
        if not txt or not label:
            continue

        strings.append(txt)
        labels.append(label)

    assert len(strings) == len(labels)

    return strings, labels

## Create Training CSV

In [8]:
# Load the cleaned texts and top-level topic labels for the 'train' subset.
text, labels = get_inputs('train')

In [9]:
# One row per training document: its cleaned text and its top-level label.
train_df = pd.DataFrame(data={'text': text, 'labels': labels})

# Preview the first rows and report the overall shape as a sanity check.
print(train_df.head())
print("\nTraining Dataframe has shape: {}".format(train_df.shape))

                                                text labels
0  [Most info regarding dangers of reading from F...   comp
1  Attention hardware hackers and bargain seekers...    sci
2  A friend's daughter has been diagnosed with an...    sci
3  WHile we are on the subject of the shuttle sof...    sci
4    That is great to hear I just may have to tak...    rec

Training Dataframe has shape: (11096, 2)


In [20]:
# Ensure the destination directory exists before writing the training CSV.
# os.path.dirname handles the platform's path separators, and exist_ok=True
# avoids the check-then-create race of a separate os.path.exists test.
TRAIN_DIR = os.path.dirname(TRAIN_CSV)
if TRAIN_DIR:  # a bare filename has no directory component to create
    os.makedirs(TRAIN_DIR, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
train_df.to_csv(TRAIN_CSV, index=False)

## Create Testing CSV

In [22]:
# Load the cleaned texts and top-level topic labels for the 'test' subset.
# Note: this rebinds `text` and `labels`, replacing the training lists.
text, labels = get_inputs('test')

In [23]:
# One row per testing document: its cleaned text and its top-level label.
test_df = pd.DataFrame(data={'text': text, 'labels': labels})

# Preview the first rows and report the overall shape as a sanity check.
print(test_df.head())
print("\nTesting Dataframe has shape: {}".format(test_df.shape))

                                                text labels
0  David Hammerslag asked:   How do you (Mormons)...    soc
1  In the article "At last! Now you can talk to y...    sci
2  Sci med people:   Can I sell my TENS unit or d...    sci
3  [...]  OK Steve, here's a sketch of an alterna...    sci
4    Low oil pressure, usually.  Could be your oi...    rec

Testing Dataframe has shape: (7370, 2)


In [30]:
# Ensure the destination directory exists before writing the testing CSV.
# os.path.dirname handles the platform's path separators, and exist_ok=True
# avoids the check-then-create race of a separate os.path.exists test.
TEST_DIR = os.path.dirname(TEST_CSV)
if TEST_DIR:  # a bare filename has no directory component to create
    os.makedirs(TEST_DIR, exist_ok=True)

# Write the *testing* frame here; the earlier version wrote train_df to
# TEST_CSV, so the test file contained training data.
test_df.to_csv(TEST_CSV, index=False)