# Preprocessor

In [11]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
import os
import shutil
import numpy as np
import pandas as pd

from sklearn.utils import shuffle

# from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [8]:
# Copy the raw data from GCS bucket, unzip it
! gsutil -m cp -r gs://text-analysis-323506/data /home/jupyter
! unzip /home/jupyter/data/data_archive.zip 
! bzip2 -d train.ft.txt.bz2 
! bzip2 -d test.ft.txt.bz2
! mv *.ft.txt /home/jupyter/data/

Copying gs://text-analysis-323506/data/data_archive.zip...
Archive:  /home/jupyter/data/data_archive.zip                                   
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [13]:
# Locations of the unpacked raw files and of the folder that receives the processed splits
train_file = "/home/jupyter/data/train.ft.txt"
test_file = "/home/jupyter/data/test.ft.txt"

dest_folder = "/home/jupyter/train_data"

# exist_ok=True already makes this a no-op when the folder exists, so no separate
# existence check is needed; a non-directory at this path now fails here, early.
os.makedirs(dest_folder, exist_ok=True)

In [14]:
def load_data(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: Path of the text file to read.

    Returns:
        list[str]: Every line of the file in order, each keeping its trailing
        newline character (if present).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the machine's
    # locale (assumes the raw files are UTF-8 encoded -- TODO confirm).
    # The `with` block closes the file, so no explicit close() is required.
    with open(file_name, 'r', encoding='utf-8') as fstream:
        return fstream.readlines()

In [15]:
# Read the raw train and test files into lists of lines
train_data, test_data = map(load_data, (train_file, test_file))

In [16]:
# Peek at the first raw line of the train file: a label token followed by the text
train_data[0]

'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [17]:
def get_data_labels(text_lines):
    """Split raw labelled lines into parallel lists of texts and binary labels.

    Each line is expected to look like ``"<label> <text>"``. The label token is
    mapped to 1 when it equals ``'__label__1'`` and to 0 otherwise.

    Args:
        text_lines: Iterable of raw lines.

    Returns:
        tuple[list[str], list[int]]: ``(data, labels)`` where ``data[i]`` is the
        text of the i-th usable line (trailing whitespace/newline kept as-is)
        and ``labels[i]`` is its 0/1 label.
    """
    data = []
    labels = []

    for raw_line in text_lines:
        fields = raw_line.split(maxsplit=1)
        # Blank lines (e.g. a stray newline at the end of the file) and lines that
        # carry a label but no text cannot form a sample; skip them instead of
        # failing with IndexError. Skipping both lists together keeps them aligned.
        if len(fields) < 2:
            continue
        label_token, text = fields
        data.append(text)
        labels.append(1 if label_token == '__label__1' else 0)
    return data, labels

In [18]:
# Separate every raw line of the train and test files into its text and its label
train_input, train_labels = get_data_labels(text_lines=train_data)
test_input, test_labels = get_data_labels(text_lines=test_data)

In [19]:
# Shuffle the train set, keeping texts and labels aligned.
# A fixed random_state makes the ordering reproducible across kernel restarts.
train_data_shuffled, train_labels_shuffled = shuffle(train_input, train_labels, random_state=42)

In [20]:
# Split the train set into train and validation sets (80% / 20%).
# A fixed random_state makes the split reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train_data_shuffled, train_labels_shuffled, test_size=0.2, random_state=42
)

In [21]:
# Wrap the train, validation and test sets in dataframes (columns: input, labels)
# so they are easy to access later during training
train_df = pd.DataFrame(dict(input=X_train, labels=y_train))
val_df = pd.DataFrame(dict(input=X_val, labels=y_val))
test_df = pd.DataFrame(dict(input=test_input, labels=test_labels))

In [24]:
# Shapes of each set (rows, columns)
print(f"Shape of train data: {train_df.shape}")
print(f"Shape of validation data: {val_df.shape}")
# This line reports the test set; its message previously said "train data".
print(f"Shape of test data: {test_df.shape}")

Shape of train data: (2880000, 2)
Shape of validation data: (720000, 2)
Shape of train data: (400000, 2)


In [25]:
# Persist each split as a gzip-compressed CSV (index column omitted) inside dest_folder.
# Gzip keeps the size of each csv file down.
for csv_name, split_df in (
    ("train_text.csv.gz", train_df),
    ("val_text.csv.gz", val_df),
    ("test_text.csv.gz", test_df),
):
    split_df.to_csv(os.path.join(dest_folder, csv_name), index=False, compression='gzip')

In [26]:
# If required, move the final train dataset back to GCS bucket. To train using AI Platform, train data needs to be in a GCS bucket.
! mv /home/jupyter/train_data ./
! zip -r train_data.zip ./train_data
! gsutil -m mv train_data.zip gs://text-analysis-323506/

  adding: home/jupyter/train_data/ (stored 0%)
Copying file://train_data.zip [Content-Type=application/zip]...
Removing file://train_data.zip...] 100% Done                                    

Operation completed over 1 objects/198.0 B.                                      
