In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split

* Running Dependencies

In [2]:
# Execute the companion utilities notebook. The names used further down
# (make_mode_file, Settings, DataPreprocessing) are not imported anywhere in
# this notebook, so they presumably come from this run -- verify if it is renamed.
%run partA_utilities.ipynb

* Defining Mode and creating directory

In [3]:
# Run mode; passed to make_mode_file, Settings and DataPreprocessing below.
mode = "prd"

# Per the recorded output, this reports when the directory for the mode
# already exists; presumably it creates the directory otherwise (helper is
# defined outside this notebook -- TODO confirm).
make_mode_file(mode)

Directory for mode: prd already exists


* Defining Settings

In [4]:
# Settings object for the chosen mode; later cells read train_data_path,
# test_data_path and sources_sinks_path from it.
settings = Settings(mode = mode)

* Main Preprocessing

In [5]:
# Load the raw train and test CSV files (train first, then test) from the
# locations provided by the settings object.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (settings.train_data_path, settings.test_data_path)
)

* Defining Preprocessing Object

In [6]:
# Single preprocessing object shared by every step below (title/theme
# creation, stopword removal, lemmatization, embeddings, label encoding)
# and by the final writes to the sinks.
preprocessor = DataPreprocessing(mode=mode, settings=settings)

* Creating Titles and Themes

In [7]:
# Run the title/theme creation step on both frames (train first, then test).
# NOTE(review): the results are bound back to the same names, so re-running
# this cell feeds already-processed frames into the step again; execute the
# notebook top to bottom from a fresh kernel.
train_data, test_data = map(
    preprocessor.create_site_titles_and_themes,
    (train_data, test_data),
)

* Removing stopwords and punctuation

In [8]:
# Run the stopword/punctuation removal step on both frames
# (train first, then test) and rebind the results to the same names.
train_data, test_data = map(
    preprocessor.remove_stopwords_and_punctuation,
    (train_data, test_data),
)

* Applying lemmatization

In [9]:
# Run the lemmatization step on both frames (train first, then test)
# and rebind the results to the same names.
train_data, test_data = map(
    preprocessor.lemmatization,
    (train_data, test_data),
)

* Constructing Embeddings

In [10]:
# Run the embedding construction step on both frames (train first, then test)
# and rebind the results to the same names.
train_data, test_data = map(
    preprocessor.construct_embeddings,
    (train_data, test_data),
)

* Encoding Target Labels

In [11]:
# Run the target-label encoding step on both frames (train first, then test)
# and rebind the results to the same names.
# NOTE(review): the two frames are encoded in separate calls; confirm that
# encode_target_labels uses one shared label mapping for both, and which
# mapping write_mapping_dataframe_to_sinks ends up persisting.
train_data, test_data = map(
    preprocessor.encode_target_labels,
    (train_data, test_data),
)

* Splitting the train set into Training and Validation Sets:

* The validation set serves as an intermediate evaluation step before the final model is applied to the test set

In [12]:
# Hold out 33% of the processed train set as a validation set.
# Features are every column except 'label'; the target is the 'label' column.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns='label'),
    train_data['label'],
    test_size=0.33,
    random_state=42,
)

* Writing all of the processed datasets to sinks

In [13]:
# Persist every processed dataset under the sinks directory, in the same
# order as before: full processed frames first, then the train/validation split.
sink_outputs = [
    ('/train_data_processed.csv', train_data),
    ('/test_data_processed.csv', test_data),
    ('/X_train.csv', X_train),
    ('/X_val.csv', X_val),
    ('/y_train.csv', y_train),
    ('/y_val.csv', y_val),
]

for sink_file, sink_dataset in sink_outputs:
    preprocessor.write_to_sinks(
        write_path=settings.sources_sinks_path + sink_file,
        dataset=sink_dataset,
    )

# Finally, write out the mapping dataframe held by the preprocessor.
preprocessor.write_mapping_dataframe_to_sinks()