<a href="https://colab.research.google.com/github/ReynaldiJ/portfolio/blob/main/LSTM_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Preprocessing

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
df = pd.read_csv("train_data.csv")
df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [None]:
df['label'].value_counts()

label
2     3545
18    2118
14    1822
9     1557
5      987
16     985
1      837
19     823
7      624
6      524
15     501
17     495
12     487
13     471
4      359
3      321
0      255
8      166
10      69
11      44
Name: count, dtype: int64

In [None]:
# Download NLTK resources for tokenization and stopwords
nltk.download('punkt')
nltk.download('stopwords')

def cleansing(df):
    # Convert text to lowercase
    df_clean = df.str.lower()

    # Remove special characters, punctuation, and numbers
    df_clean = [re.sub(r'http\S+|www.\S+', '', text) for text in df_clean]
    df_clean = [re.sub(r"\d+", "", text) for text in df_clean]
    df_clean = [re.sub(r'[^\w\s]', ' ', text) for text in df_clean]

    # Tokenization
    df_tokens = [word_tokenize(text) for text in df_clean]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    df_filtered = [[word for word in tokens if word not in stop_words] for tokens in df_tokens]

    return df_filtered

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
processed_df=cleansing(df['text'])

In [None]:
print(df['text'][0])

Here are Thursday's biggest analyst calls: Apple, Amazon, Tesla, Palantir, DocuSign, Exxon &amp; more  https://t.co/QPN8Gwl7Uh


In [None]:
print(processed_df[0])

['thursday', 'biggest', 'analyst', 'calls', 'apple', 'amazon', 'tesla', 'palantir', 'docusign', 'exxon', 'amp']


In [None]:
df.insert(1, 'Clean Text', processed_df)

In [None]:
df.head()

Unnamed: 0,text,Clean Text,label
0,Here are Thursday's biggest analyst calls: App...,"[thursday, biggest, analyst, calls, apple, ama...",0
1,Buy Las Vegas Sands as travel to Singapore bui...,"[buy, las, vegas, sands, travel, singapore, bu...",0
2,"Piper Sandler downgrades DocuSign to sell, cit...","[piper, sandler, downgrades, docusign, sell, c...",0
3,"Analysts react to Tesla's latest earnings, bre...","[analysts, react, tesla, latest, earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,"[netflix, peers, set, return, growth, analysts...",0


In [None]:
max_tok = df['Clean Text'].str.len().max()

In [None]:
max_tok

48

#Vectorization

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Clean Text'], df['label'], test_size=0.3, random_state=44)

In [None]:
X_train.str.len().max()

48

In [None]:
from gensim.models import Word2Vec
#train
train_sg_model = Word2Vec(X_train, vector_size=100, window=5, sg=1, min_count=3)
train_sg_vocab = train_sg_model.wv.index_to_key

#test
test_sg_model = Word2Vec(X_test, vector_size=100, window=5, sg=1, min_count=3)
test_sg_vocab = test_sg_model.wv.index_to_key

In [None]:
train_vec_dict={}
for word in train_sg_vocab:
    train_vec_dict[word]=train_sg_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(train_vec_dict)) # should come equal to vocab size

The no of key-value pairs :  6992


In [None]:
train_vec_dict

{'new': array([-1.02866776e-01,  2.55120188e-01,  4.22799550e-02, -1.08810008e-01,
        -1.14281150e-02, -3.94387007e-01,  2.74960876e-01,  7.00043619e-01,
         6.18408173e-02, -6.29312098e-02,  6.10246435e-02,  4.53334628e-03,
        -1.59210071e-01,  1.80134848e-02, -7.19688013e-02, -1.15474992e-01,
        -1.02230266e-01, -2.99915552e-01, -2.21023053e-01, -4.04494643e-01,
        -1.76708978e-02,  1.10637933e-01,  1.24471828e-01, -2.17085076e-03,
         1.48150161e-01,  1.28946811e-01,  1.91226490e-02, -6.74674138e-02,
        -3.30378413e-01,  1.46606833e-01, -3.88233438e-02, -2.42288858e-02,
         4.00967747e-02, -2.20776767e-01, -1.66319355e-01, -9.27819535e-02,
        -5.53356186e-02, -3.44405055e-01, -9.85663831e-02, -4.77766037e-01,
        -2.61928767e-01,  2.24099219e-01, -1.87967658e-01, -4.21086103e-02,
        -4.56558503e-02, -1.17895678e-01, -2.15434074e-01,  1.95675790e-01,
         2.50205934e-01,  2.82882661e-01, -6.81125298e-02,  6.62406087e-02,
     

In [None]:
max_sen_len= max_tok # max lenght of word in a sentence
vocab_size =35000  #ideally it should be len(tok.word_index) + 1  or total no of words in data in this case = 4750, but to handle number of data which not appear in train, for example in test, make the size higher
embed_dim=100 # embedding dimension as choosen in word2vec constructor

In [None]:
from keras.preprocessing.text import one_hot,Tokenizer
tok = Tokenizer()
tok.fit_on_texts(X_train)
vocab_size = vocab_size
encd_rev = tok.texts_to_sequences(X_train)

In [None]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
    embed_vector=train_vec_dict.get(word) #mapping the vector to word in our skipgram dictionary
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.

In [None]:
tok.fit_on_texts(X_test)
encd_rev_test = tok.texts_to_sequences(X_test)

# Prepare Embedding Layer

In [None]:
encd_rev

[[901, 3089, 4579, 4580, 5548, 539, 812, 2796, 192],
 [700,
  747,
  1220,
  1041,
  3907,
  9840,
  521,
  3090,
  95,
  94,
  540,
  1330,
  930,
  3457,
  725,
  286,
  51],
 [200, 1709, 593, 902, 1909, 880, 3458],
 [222, 5549, 5550, 339, 207, 3091, 2797, 340, 6993, 457, 701, 2367, 6994],
 [6995, 61, 9841, 473, 6995, 574, 193, 239, 9842, 669, 9843, 9844, 3459, 427],
 [553, 1710, 726, 634, 411, 3460, 212, 362, 857, 340, 1711],
 [4581, 2798, 1618, 18, 253, 2562, 327, 3908, 111, 42, 6996, 12],
 [1807, 3909, 222, 2797, 340, 458, 121, 792, 1221, 129, 33, 858, 56],
 [652, 529, 51, 530, 3461, 772, 4582, 459, 1464, 3092, 1712, 73],
 [9845, 195, 92, 176, 522, 1545, 3093, 1546, 2799, 2800, 490, 4, 16],
 [4583, 653, 3910, 89, 412, 150, 1081, 3462, 554, 72, 19, 23, 3],
 [9846,
  9847,
  117,
  9848,
  460,
  31,
  1465,
  4584,
  5551,
  575,
  3094,
  6997,
  1004,
  1222,
  727,
  428,
  9849],
 [9850, 1466, 4585, 9851, 3463, 18, 9852],
 [3911, 2368, 9, 2, 474],
 [6998, 2193, 773, 461, 1547, 

In [None]:
encd_rev_test

[[3212, 2297, 12107, 12108, 66, 1566, 2944],
 [12109, 7322, 852, 85, 389, 280, 876, 5996, 507, 2598, 560, 3482, 509],
 [133,
  50,
  6681,
  5244,
  5245,
  5909,
  79,
  14,
  492,
  1430,
  3954,
  5227,
  8012,
  1891,
  824,
  12110,
  1297,
  7,
  97],
 [6935, 1670, 215, 834, 533, 1943, 1773],
 [12111, 761, 712, 7302, 2763, 12112, 265, 53, 34, 174, 4868, 358],
 [548,
  3357,
  451,
  7719,
  1688,
  80,
  7720,
  127,
  3357,
  5451,
  898,
  1920,
  2215,
  905,
  1199,
  1619,
  414,
  1242],
 [1636,
  8516,
  12113,
  12114,
  2159,
  12115,
  12116,
  3901,
  12117,
  6123,
  10378,
  6744,
  434,
  4786,
  12118,
  901,
  1367,
  6407,
  4509,
  5622,
  9860,
  3736,
  12119,
  4806,
  1624,
  5558,
  1131,
  6774,
  9861,
  5195,
  12120,
  12121,
  5997,
  2412,
  134,
  3805,
  12122,
  12123,
  9207],
 [12124, 12125, 3423, 329, 130, 73, 8013, 45, 8013, 49],
 [12126, 7644, 5082, 16, 395, 497, 2441, 9630],
 [7083, 7909, 1567, 25, 10380, 10381, 1433, 316, 1198, 343, 4857, 14

In [None]:
vocab_size

35000

In [None]:
from keras.preprocessing.sequence import pad_sequences
# now padding to have a maximum length of 48
pad_rev= pad_sequences(encd_rev, maxlen=max_sen_len, padding='post')
pad_rev.shape   # note that we had 11893 data and we have padded each review to have  a lenght of 48 words.

(11893, 48)

In [None]:
pad_rev_test= pad_sequences(encd_rev_test, maxlen=max_sen_len, padding='post')
pad_rev_test.shape

(5097, 48)

In [None]:
pad_rev

array([[ 901, 3089, 4579, ...,    0,    0,    0],
       [ 700,  747, 1220, ...,    0,    0,    0],
       [ 200, 1709,  593, ...,    0,    0,    0],
       ...,
       [  88,  521,  254, ...,    0,    0,    0],
       [ 192,    1, 1195, ...,    0,    0,    0],
       [2517,  413, 2162, ...,    0,    0,    0]], dtype=int32)

In [None]:
pad_rev_test

array([[ 3212,  2297, 12107, ...,     0,     0,     0],
       [12109,  7322,   852, ...,     0,     0,     0],
       [  133,    50,  6681, ...,     0,     0,     0],
       ...,
       [  522,   522,  2871, ...,     0,     0,     0],
       [ 3756,  2853,    33, ...,     0,     0,     0],
       [ 1200,    95,    96, ...,     0,     0,     0]], dtype=int32)

In [None]:
print(pad_rev.shape, y_train.shape)
print(pad_rev_test.shape, y_test.shape)

(11893, 48) (11893,)
(5097, 48) (5097,)


# LSTM Classification Modelling

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import Constant
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, Activation, Flatten
from keras.models import Sequential

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_sen_len))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 48, 100)           3500000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 3617377 (13.80 MB)
Trainable params: 3617377 (13.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
model.fit(pad_rev, y_train, batch_size=2, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7d28e01759c0>

# Evaluation

In [None]:
loss, accuracy = model.evaluate(pad_rev_test, y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.04787129536271095


In [None]:
test_predict=model.predict(pad_rev_test)
model_test=np.argmax(test_predict,axis=1)



In [None]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, model_test))


Classification Report

              precision    recall  f1-score   support

           0       0.01      1.00      0.03        73
           1       0.00      0.00      0.00       244
           2       0.00      0.00      0.00      1053
           3       0.00      0.00      0.00        98
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00       296
           6       0.00      0.00      0.00       153
           7       0.00      0.00      0.00       204
           8       0.00      0.00      0.00        48
           9       0.00      0.00      0.00       435
          10       0.00      0.00      0.00        19
          11       0.00      0.00      0.00        15
          12       0.00      0.00      0.00       157
          13       0.00      0.00      0.00       134
          14       0.00      0.00      0.00       560
          15       0.00      0.00      0.00       154
          16       0.00      0.00      0.00       310
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classification report indicates that the current model's performance is very poor across all metrics. The key reasons for this poor performance are an imbalanced dataset and data quality issues. The number of instances for each class is highly uneven, which leads to bias towards majority classes and results in poor performance on minority classes. Additionally, ineffective preprocessing and handling of text data can lead to suboptimal input features, while inconsistent or noisy data can further degrade the model's performance.