## PREPROCESSING AND PREPARING DATA FOR MODELS

<!-- -- -->

In [19]:
import pandas as pd 
import numpy as np

import os 
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv("/Users/sarthaksharna/AutoNews/data/cleaned/cleaned_data.csv")
df.head()

Unnamed: 0,text,label,cleaned_text
0,Musicians to tackle US red tape\n\nMusicians' ...,entertainment,musicians to tackle us red tape musicians grou...
1,"U2's desire to be number one\n\nU2, who have w...",entertainment,u2s desire to be number one u2 who have won th...
2,Rocker Doherty in on-stage fight\n\nRock singe...,entertainment,rocker doherty in onstage fight rock singer pe...
3,Snicket tops US box office chart\n\nThe film a...,entertainment,snicket tops us box office chart the film adap...
4,Ocean's Twelve raids box office\n\nOcean's Twe...,entertainment,oceans twelve raids box office oceans twelve t...


<!-- --- -->

In [9]:
ARTIFACTS_PATH = "../artifacts"
os.makedirs(ARTIFACTS_PATH, exist_ok = True)

In [None]:
VOCAB_SIZE = 5000 
MAX_LEN = 200

<!-- -- -->

In [3]:
X = df["cleaned_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1701,), (426,), (1701,), (426,))

In [11]:
y_train.value_counts()

label
sport            404
business         402
politics         322
entertainment    295
tech             278
Name: count, dtype: int64

<!-- ---- -->

SAVING LABEL ENCODER

In [None]:
def label_encoder(y_train, y_test):
    """
    This function encodes the labels using LabelEncoder()
    """

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    return y_train, y_test, le

y_train, y_test, le = label_encoder(y_train, y_test)
y_train, y_test

with open(os.path.join(ARTIFACTS_PATH, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)


In [59]:
np.save(os.path.join(ARTIFACTS_PATH, "y_train.npy"), y_train)
np.save(os.path.join(ARTIFACTS_PATH, "y_test.npy"), y_test)

print("✓ Encoded labels saved!")

✓ Encoded labels saved!


In [22]:
print("class mapping: ")
label_map = {label: idx for idx, label in enumerate(le.classes_)}
label_map

class mapping: 


{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}

<!-- --- -->

SAVING TOKENIZER

In [None]:
def tokenization(X_train, X_test):
    """This function tokenizes the text data"""

    tokenizer = Tokenizer(num_words = VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    return X_train_seq, X_test_seq, tokenizer


X_train_seq, X_test_seq, tokenizer = tokenization(X_train, X_test)

with open(os.path.join(ARTIFACTS_PATH, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)


In [31]:
print(X_train.iloc[0])
print(X_train_seq[0])

whitehall cuts ahead of target thousands of civil service jobs have already been cut or moved out of london as part of a major costcutting drive chancellor gordon brown said 12500 jobs had gone while 7800 were being moved out of the south east he plans to axe 104000 jobs to free up money for education health defence housing and overseas aid unions oppose the plans but mr brown said 2bn savings had already been made and more jobs had been cut than had been expected at this stage a further 200 jobs at the department of the environment food and rural affairs have been earmarked to be cut at the department for work and pensions 30000 jobs are to go 560 will be lost by the end of the month at the department of trade and industry and 400 are to go at the inland revenue and customs in his budget statement the chancellor said the first 12500 civil service jobs had been cut on target about 4300 civil servants will leave london and the south east by the end of march 2005 and there are plans to r

In [43]:
print("Total unique words in corpus: ")
len(tokenizer.word_index)

Total unique words in corpus: 


30808

In [53]:
print("Index assigned to '<OOV>' token: ")
tokenizer.word_index["<OOV>"]

Index assigned to '<OOV>' token: 


1

In [49]:
print("Top 10 most frequent words: ")
for i in range(1, 11):
    print(f"{i}: '{tokenizer.index_word[i]}'")

Top 10 most frequent words: 
1: '<OOV>'
2: 'the'
3: 'to'
4: 'of'
5: 'and'
6: 'a'
7: 'in'
8: 'for'
9: 'is'
10: 'that'


In [None]:
total_words = len(tokenizer.word_index)

used_words = VOCAB_SIZE

oov_words = total_words - used_words

print(f"Total unique words: {total_words}")
print(f"Words used in vocab: {used_words}")
print(f"Words treated as OOV: {oov_words}")

Total unique words: 30808
Words used in vocab: 5000
Words treated as OOV: 25808


<!-- --- -->

SAVING PADDING

In [None]:
def pad_seq(X_train_seq, X_test_seq):
    """This function pads the sequences with max length of 200"""

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post", truncating="post")
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post", truncating="post")
    return X_train_pad, X_test_pad 


X_train_pad, X_test_pad = pad_seq(X_train_seq, X_test_seq)


In [57]:
np.save(os.path.join(ARTIFACTS_PATH, "X_train_pad.npy"), X_train_pad)
np.save(os.path.join(ARTIFACTS_PATH, "X_test_pad.npy"), X_test_pad)

print("✓ Padded sequences saved!")

✓ Padded sequences saved!


In [58]:
X_train_pad.shape, X_test_pad.shape

((1701, 200), (426, 200))

<!-- ----- -->