# Data Preprocessing

## Goal:
* Load original data
* Split the data into train (64%), test (20%), and validation (16%) sets
* Write the data to new .csv files

In [1]:
import os

# Both locations are resolved against the directory the notebook runs from.
# DATA_PATH is the root data folder; RAW_DATA_PATH is its 'raw_data' sub-folder.
DATA_PATH = os.path.join(os.getcwd(), 'data')
RAW_DATA_PATH = os.path.join(os.getcwd(), 'data', 'raw_data')

In [2]:
# Locations of the two raw CSV files inside RAW_DATA_PATH.
raw_train_data_path, raw_test_data_path = (
    os.path.join(RAW_DATA_PATH, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
import pandas as pd

# Load both raw CSVs and keep only the 'text' and 'sentiment' columns.
# keep_default_na=False makes pandas read empty cells as empty strings
# instead of converting them to NaN.
df_raw_train = pd.read_csv(raw_train_data_path, keep_default_na=False)[['text', 'sentiment']]
df_raw_test = pd.read_csv(raw_test_data_path, keep_default_na=False)[['text', 'sentiment']]

# Pool the two files into one frame that is re-split further down.
# ignore_index=True renumbers the rows 0..N-1; without it each file keeps its
# own 0-based labels, so the combined index would contain duplicates and any
# later label-based lookup or alignment would be ambiguous.
df_raw = pd.concat([df_raw_train, df_raw_test], ignore_index=True)

In [5]:
# Encode the string sentiment labels as integer class ids.
SENTIMENT_TO_CLASS = {'negative': 0, 'neutral': 1, 'positive': 2}
sentiment_class = df_raw['sentiment'].map(SENTIMENT_TO_CLASS)

# Series.map produces NaN for every value that is not a key of the mapping
# (including empty strings), which would silently turn the column into floats
# and write missing labels to the output files. Fail loudly instead.
# The mask is passed as a plain array so no index alignment is involved.
unmapped_labels = df_raw.loc[sentiment_class.isna().to_numpy(), 'sentiment'].unique()
if unmapped_labels.size:
    raise ValueError(f"Unmapped sentiment labels: {unmapped_labels.tolist()}")

df_raw['sentiment_class'] = sentiment_class

In [6]:
# Report how many rows the combined raw frame holds.
total_entries_message = f"Total entries in the raw data: {df_raw.shape[0]}"
print(total_entries_message)

Total entries in the raw data: 31015


In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so both splits are reproducible across runs.
RANDOM_SEED = 42

# The same options are used for both stages of the split.
split_kwargs = {'test_size': 0.2, 'random_state': RANDOM_SEED}

# Stage 1: hold out 20% of all rows as the test set.
df_train, df_test = train_test_split(df_raw, **split_kwargs)
# Stage 2: hold out 20% of the remaining rows as the validation set, which
# leaves 64% of the data for training and 16% for validation.
df_train, df_val = train_test_split(df_train, **split_kwargs)

In [8]:
# Report the row count of each split, one line per split.
print(
    f"Entries in train data: {df_train.shape[0]}",
    f"Entries in test data: {df_test.shape[0]}",
    f"Entries in validation data: {df_val.shape[0]}",
    sep="\n",
)

Entries in train data: 19849
Entries in test data: 6203
Entries in validation data: 4963


In [9]:
# Destination files for the three splits, written directly under DATA_PATH.
train_data_path, test_data_path, validation_data_path = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'validation.csv')
)

In [10]:
# Write each split to its CSV file. index=False leaves the DataFrame index
# out of the file, so only the data columns are written.
for split_frame, csv_path in (
    (df_train, train_data_path),
    (df_test, test_data_path),
    (df_val, validation_data_path),
):
    split_frame.to_csv(csv_path, index=False)