# Tokenization of the Raw Text Data

In [1]:
import pandas as pd

# Path to the gzip-compressed JSON Lines file
data_path = 'Data/data.jsonl.gz'

# Load the compressed JSON Lines file into a Pandas DataFrame
data_df = pd.read_json(data_path, lines=True, compression='gzip')

# Assign the features and target variable
X = data_df['text']  
y = data_df['label'] 


## Tokenize the Text

In [2]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# Tokenize the text and print the instance to check the loading time
tokenized_texts = []
count = 0
for text in X:
    tokenized_texts.append(word_tokenize(text))
    if count % 100000 == 0:
        print(count)
    count += 1
    


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carlosrabat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carlosrabat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0
100000
200000
300000
400000


In [3]:
# Process the Data, get rid of stop words, lower case the words and stem the tokens
# Stemer is to reduce the words to their base form
processed_texts = []
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

for text in tokenized_texts:
    current_tokens = []
    for token in text:
        token.lower()
        if token not in stop_words:
            stemmer.stem(token)
            current_tokens.append(token)
    processed_texts.append(current_tokens)

## Example of Raw vs Processed

In [4]:
print(X[0])
print(processed_texts[0])

i feel awful about it too because it s my job to get him in a position to succeed and it just didn t happen here
['feel', 'awful', 'job', 'get', 'position', 'succeed', 'happen']


## Training Set and Test Set

In [5]:
from sklearn.model_selection import train_test_split


# Split into training and test
# Use random state 49 for reproducibility 
X_train, X_test, y_train, y_test = train_test_split(processed_texts, y, test_size=0.2, random_state=49)


# Training Data
df_train = pd.DataFrame({
    'X_train': X_train,
    'y_train': y_train
})

# Test Data
df_test = pd.DataFrame({
    'X_test': X_test,
    'y_test': y_test
})

# Write to a csv file
df_train.to_csv('train_data.csv', index=False)
df_test.to_csv('test_data.csv', index=False)


## Example on how to Load them 

In [6]:
# Load training data
train_data = pd.read_csv('train_data.csv')
X_train = train_data['X_train'].tolist()
y_train = train_data['y_train'].tolist()

# Load test data
test_data = pd.read_csv('test_data.csv')
X_test = test_data['X_test'].tolist()
y_test = test_data['y_test'].tolist()