# Processing

### Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import pickle
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

### Reading Data

In [2]:
def compile_data(train_path):
    """Load the training CSV and integer-encode its category labels.

    Parameters
    ----------
    train_path : str or path-like
        Location of a CSV file containing a 'Text' and a 'Category' column.

    Returns
    -------
    train_x : list
        Values of the 'Text' column, in file order.
    train_y : pandas.Series
        Categorical codes of the 'Category' column, aligned with train_x.
    label_map : dict
        Category label -> code, with entries ordered by ascending code.
    """
    frame = pd.read_csv(train_path)
    texts = list(frame['Text'])
    labels = frame['Category']
    codes = labels.astype('category').cat.codes

    # Pair every row's label with its code; repeated labels collapse to one entry.
    label_to_code = dict(zip(labels, codes))
    # Rebuild the mapping so that iteration follows ascending code order.
    ordered_map = {
        label: label_to_code[label]
        for label in sorted(label_to_code, key=label_to_code.get)
    }

    return texts, codes, ordered_map

# Machine-specific dataset folder: point this at the local copy of the data before running.
# NOTE: `dir` shadows the built-in of the same name; the name is kept in case other cells use it.
dir = r'C:\Users\deguz\OneDrive\PET_PROJECTS\Multiclass_Classification-News_Articles'
# os.path.join supplies the separator, so the file name no longer carries the
# invalid '\B' escape sequence that the hand-concatenated path contained.
train_x, train_y, label_map = compile_data(os.path.join(dir, 'BBC News Train.csv'))

In [3]:
# Sample instance: first 200 characters of the first raw article
train_x[0][0:200]

'worldcom ex-boss launches defence lawyers defending former worldcom chief bernie ebbers against a battery of fraud charges have called a company whistleblower as their first witness.  cynthia cooper  '

In [4]:
# Sample label: integer category code stored under index label 0
train_y[0]

0

In [5]:
# Classes: mapping from category name to its integer code
label_map

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}

### Cleaning: Punctuation, Stopwords, Tokenization --> Train/Test

In [6]:
"""PROCESSING ====================================================="""
def clean_text(samples, n_words = 45):
    """Tokenize each sample, lowercase the tokens and drop English stopwords.

    Needs the NLTK 'stopwords' corpus to be available locally.

    Parameters
    ----------
    samples : iterable of str
        Raw documents to clean.
    n_words : int, optional
        Currently unused; kept so that existing calls remain valid.

    Returns
    -------
    list of list of str
        One list of lowercased, non-stopword tokens per sample, in input order.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Raw string: '\w' inside a normal literal is an invalid escape sequence.
    # Built once, because the tokenizer does not depend on the sample.
    tokenizer = nltk.RegexpTokenizer(r"['\w]+")

    cleaned = []
    for sample in samples:
        # Tokens are runs of word characters and apostrophes; other punctuation is dropped.
        lowered = (word.lower() for word in tokenizer.tokenize(sample))
        cleaned.append([word for word in lowered if word not in stopwords])

    return cleaned

# Rebinds train_x from raw strings to token lists; re-running this cell without
# reloading the data would feed token lists back into clean_text.
train_x = clean_text(train_x)
# Peek at the first five tokens of the first article
train_x[0][:5]

['worldcom', 'ex', 'boss', 'launches', 'defence']

In [7]:
"""TRAIN/TEST SPLIT - to be used across all models"""
# Fixed seed so that a fresh kernel reproduces the same 80/20 split.
SEED = 42
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size = 0.2, random_state = SEED)

# NOTE: `vars` shadows the built-in of the same name; kept in case other cells use it.
names = ['train_x', 'test_x', 'train_y', 'test_y', 'label_map']
vars = [train_x, test_x, train_y, test_y, label_map]

# Flip to True to (re)write the pickles under vars/; off by default so an
# already-saved split is not overwritten.
overwrite_train_test = False
if overwrite_train_test:
    os.makedirs('vars', exist_ok = True)  # open() below fails if the folder is missing
    for n, v in zip(names, vars):
        with open(f'vars/{n}.pkl', 'wb') as f:
            pickle.dump(v, f)

In [8]:
# Bootstrap the model-results table: write an empty CSV (header row only)
# unless a file with that name already exists.
file_name = 'results.csv'
if not os.path.exists(file_name):
    metric_columns = ['model', 'accuracy', 'weighted_f1']
    results = pd.DataFrame(columns = metric_columns)
    results.to_csv(file_name, index = False)

### Custom Word Vectors (Gensim)

In [5]:
import process_funcs as pf
# Replaces the in-memory split with whatever pf.get_train_test() returns
# (presumably the pickled split saved under vars/ — confirm in process_funcs).
train_x, test_x, train_y, test_y, label_map  = pf.get_train_test()

# Flip to True to retrain the vectors and overwrite the saved model.
overwrite_vecs = False
if overwrite_vecs:
    # Trained on the training tokens only; min_count = 2 ignores words seen fewer than 2 times.
    vec_model = Word2Vec(sentences = train_x, vector_size = 100, min_count = 2, workers = 4)
    os.makedirs("embeddings", exist_ok = True)  # save() needs the target folder to exist
    vec_model.save("embeddings/custom_embeddings.model")