In [8]:
import numpy as np
import pandas as pd
import re
from string import digits

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Dropout, Input, Bidirectional
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical

In [9]:
# Fixed sequence length (in tokens): passed to tokenization() as the padding
# length and to the Embedding layer as its input length.
maxlen = 50

In [10]:
def read_data(path="/content/sample_data/Copy of Sentiment.csv", encoding='latin1'):
    """Load the sentiment dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the Colab sample-data path this
        notebook was written against, so ``read_data()`` keeps working as before.
    encoding : str, optional
        Text encoding used to decode the file (default ``'latin1'``).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(path, encoding=encoding)

In [11]:
df = read_data()
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [12]:
def preprocess(df):
    """Return a copy of ``df`` whose ``text`` column has been normalised.

    The cleaning steps, in order:

    1. Missing values become empty strings (instead of a literal word such as
       ``"nan"`` that would later be tokenised as real text).
    2. Text is lower-cased.
    3. The punctuation marks ``? . ! , ¿`` are surrounded by spaces so each one
       becomes its own token.
    4. Every other non-letter character (digits, quotes, back-ticks, symbols)
       is replaced by a space.
    5. Leading/trailing whitespace is stripped and runs of spaces are collapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the cleaned ``text`` column; other columns are
        left untouched.
    """
    # Work on a copy so re-running the cell never changes the caller's raw frame.
    df = df.copy()

    text = df["text"].fillna("").astype(str).str.lower()

    # creating a space between a word and the punctuation following it
    text = text.str.replace(r"([?.!,¿])", r" \1 ", regex=True)
    text = text.str.replace(r'[" "]+', " ", regex=True)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿");
    # this also removes digits, so no separate digit-stripping pass is needed
    text = text.str.replace(r"[^a-zA-Z?.!,¿]+", " ", regex=True)

    # Remove surrounding and repeated spaces
    text = text.str.strip()
    text = text.str.replace(" +", " ", regex=True)

    df["text"] = text
    return df

In [13]:
df = preprocess(df)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"i d have responded , if i were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,sooo sad i will miss you here in san diego ! ! !,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me . . .,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview ! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"sons of , why couldn t they put them on the re...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [14]:
# Encode the sentiment labels as a single 0/1 training target.
lb=LabelBinarizer()
# With more than two classes, LabelBinarizer returns a one-hot matrix of shape
# (n_samples, n_classes) rather than a single column. Assigning that 2-D array
# straight into df['sentiment'] either fails or silently keeps only one column,
# depending on the pandas version. Select the column explicitly so the target
# is always 1-D.
encoded = lb.fit_transform(df['sentiment'])
# Column 0 corresponds to lb.classes_[0], i.e. the target is "first class vs.
# rest". In the rows displayed by this notebook that is negative -> 1,
# neutral -> 0.
# NOTE(review): if a three-way classifier is intended, use integer class ids
# with a softmax output layer instead — confirm the intended target.
df['sentiment'] = encoded[:, 0]
print(df.shape)

(27481, 10)


In [15]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"i d have responded , if i were going","I`d have responded, if I were going",0,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,sooo sad i will miss you here in san diego ! ! !,Sooo SAD,1,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me . . .,bullying me,1,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview ! leave me alone,leave me alone,1,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"sons of , why couldn t they put them on the re...","Sons of ****,",1,noon,60-70,Angola,32866272,1246700.0,26


In [16]:
def tokenization(data, maxlength = 100):
    """Fit a word-level tokenizer on ``data`` and encode it as padded id sequences.

    Parameters
    ----------
    data : iterable of str
        Texts used both to build the vocabulary and to be encoded.
    maxlength : int, optional
        Length of every output sequence (default 100). Shorter sequences are
        padded at the end (``padding='post'``).

    Returns
    -------
    tuple
        ``(tokenizer, padded)`` — the fitted ``Tokenizer`` (unknown words map to
        the ``'oov'`` token) and the padded sequences with one row per text and
        ``maxlength`` columns.
    """
    tokenizer = Tokenizer(lower=True, oov_token='oov')
    tokenizer.fit_on_texts(data)

    padded = pad_sequences(
        tokenizer.texts_to_sequences(data),
        maxlen=maxlength,
        padding='post',
    )
    return tokenizer, padded

In [17]:
token, X = tokenization(df['text'], maxlength=maxlen)

In [18]:
print(X.shape, df['text'].shape)

(27481, 50) (27481,)


In [19]:
# Number of entries in the tokenizer's word -> index map. The Keras Tokenizer
# does not assign index 0 to any word, which is why later cells size the
# embedding matrix and Embedding layer as vocab_size+1.
vocab_size = len(token.word_index)
vocab_size

24621

In [20]:
# Inverse of token.word_index: maps each integer id back to its word, e.g. for
# decoding padded sequences into readable text.
reverse_word_index = {v: k for k, v in token.word_index.items()}

# GloVe

In [21]:
# Load pretrained GloVe vectors from a text file
def get_glove_vector(path="/content/sample_data/glove.6B.200d[1].txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the GloVe file. Defaults to the Colab path this notebook
        was written against, so ``get_glove_vector()`` keeps working as before.

    Returns
    -------
    dict[str, numpy.ndarray]
        Maps each word to a 1-D ``float32`` array of its components.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a number.
    """
    glove_vectors = {}

    with open(path, "r", encoding="UTF-8") as glove:
        for line in glove:
            values = line.split()
            # Skip blank lines and lines that carry a word but no components.
            if len(values) < 2:
                continue
            # Parse to floats here so callers get numeric vectors, not arrays of strings.
            glove_vectors[values[0]] = np.asarray(values[1:], dtype="float32")
    return glove_vectors

In [22]:
glove_vectors = get_glove_vector()
# NOTE(review): the count displayed for this cell (12,180) looks far smaller
# than a full GloVe 6B vocabulary, so the file may be truncated or a subset.
# That would also explain why most tokenizer words get no vector later on.
# Confirm the file is complete.
total_words = len(glove_vectors.keys())
total_words

12180

In [23]:
# Embedding width; must equal the length of each vector in the GloVe file
# (the file name indicates 200-dimensional vectors).
emb_dim = 200


# create word vector matrix with glove vectors
def create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim):
    """Build an embedding matrix whose rows are the GloVe vectors of the vocabulary.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``. Rows
    stay all-zero for index 0 and for words without a usable vector.

    Parameters
    ----------
    token : object with a ``word_index`` mapping
        Fitted tokenizer; ``word_index`` maps each word to its integer index.
    glove_vectors : dict
        Maps words to their vectors (array-likes of numbers or numeric strings).
    vocab_size : int
        Largest word index to include; the matrix has ``vocab_size + 1`` rows.
    emb_dim : int
        Expected length of every vector (number of matrix columns).

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(vocab_size + 1, emb_dim)``.

    Notes
    -----
    Prints how many in-range words had no usable vector. A vector whose length
    differs from ``emb_dim`` (e.g. from a truncated line in the GloVe file) is
    counted as missing instead of raising a broadcasting error.
    """
    word_vector_matrix = np.zeros((vocab_size+1, emb_dim))

    count = 0
    for word, index in token.word_index.items():
        if index > vocab_size:
            # Outside the matrix: the caller asked for a smaller vocabulary
            # than the tokenizer knows about.
            continue
        vector = glove_vectors.get(word)
        if vector is not None:
            # Explicit conversion: accepts vectors stored as numeric strings.
            vector = np.asarray(vector, dtype=float)
            if vector.shape != (emb_dim,):
                vector = None
        if vector is None:
            count += 1
        else:
            word_vector_matrix[index] = vector
    print(f"Vector not found for {count} words")
    return word_vector_matrix

In [24]:
emb_matrix = create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim)

Vector not found for 17537 words


In [25]:
emb_matrix.shape

(24622, 200)

In [26]:
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'].to_numpy(), test_size=0.33, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(18412, 50) (18412,) (9069, 50) (9069,)


In [27]:
# Binary classifier: GloVe-initialised embeddings -> BiLSTM -> dense head.
model = Sequential()
# Declaring the input shape up front builds the model immediately, so
# model.summary() can report output shapes and parameter counts. It replaces
# the Embedding `input_length` argument, which Keras 3 deprecates and ignores.
model.add(Input(shape=(maxlen,)))
model.add(Embedding(input_dim=vocab_size+1,  # +1 because index 0 is the padding id
                    output_dim=emb_dim,
                    # Start from the pretrained matrix; trainable=True lets
                    # training fine-tune it (including rows left at zero).
                    embeddings_initializer=keras.initializers.Constant(emb_matrix),
                    trainable=True))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))
# Single sigmoid unit: outputs the probability of the positive (1) label.
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()



In [28]:
# Train for a single epoch in batches of 64, holding out 20% of the training
# data for validation (reported as val_accuracy / val_loss).
model.fit(X_train, y_train, batch_size=64, epochs=1, validation_split=0.2)

[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 333ms/step - accuracy: 0.7412 - loss: 0.5391 - val_accuracy: 0.8146 - val_loss: 0.4250


<keras.src.callbacks.history.History at 0x7a717f3b7e10>

In [29]:
# Predicted probabilities from the model's sigmoid output: one row per test
# sample, one column. These are scores in [0, 1], not hard class labels.
ans = model.predict(X_test)
print(ans.shape)
ans

[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 53ms/step
(9069, 1)


array([[0.18046626],
       [0.1108913 ],
       [0.03900233],
       ...,
       [0.06419168],
       [0.01329409],
       [0.21084784]], dtype=float32)