In [17]:
import pandas as pd
import numpy as np
import pickle
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [2]:
# Load the processed sentiment dataset (first CSV column becomes the index)
# and discard every row that contains a missing value.
df = pd.read_csv(
    "../data/processed/sentiment_dataset.csv",
    index_col=0,
).dropna()
df.head()

Unnamed: 0,Text,Sentiment
0,the internet on my laptop works jeesh the webs...,1.0
1,I paid $130 for this game. I got all the colle...,-1.0
2,I bought this mic back in March 2015. I've bee...,-1.0
3,"more work to doooooo HAHA yeah, im happy",1.0
4,@gerard_k no sorry. long day today and tomorro...,-1.0


In [3]:
def cleanText(text):
    """Normalise one raw text sample into lowercase alphanumeric words.

    Steps, in order: lowercase; delete apostrophes (so "can't" becomes
    "cant" instead of being split in two); delete @mentions, #hashtags and
    http(s) links; replace bracketed spans and every remaining character
    outside a-z / 0-9 with a space. Runs of spaces are not collapsed.

    Parameters
    ----------
    text : object
        Raw sample. Non-string values are converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text, or "" for missing values (``None``, any float NaN)
        and for any ``np.float64`` value.
    """
    # np.float64 values are treated as "no text" (NaN read from a numeric column).
    if text is None or isinstance(text, np.float64):
        return ""
    # A plain Python float NaN (what pandas stores in object columns) is also
    # missing; NaN is the only float that is not equal to itself.
    if isinstance(text, float) and text != text:
        return ""

    text = str(text).lower()
    text = re.sub(r"'", "", text)  # drop apostrophes so contractions stay one word
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # @mentions
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # #hashtags
    text = re.sub(r"http\S+", "", text)  # links
    text = re.sub(r"[()!?]", " ", text)
    text = re.sub(r"\[.*?\]", " ", text)  # bracketed spans, non-greedy
    text = re.sub(r"[^a-z0-9]", " ", text)  # anything left that is not a-z / 0-9
    return text

In [4]:
# Run cleanText over every raw text and keep the result in its own column.
df['ProcessedText'] = df['Text'].map(cleanText)

In [5]:
VOCAB_SIZE = 10000  # Tokenizer num_words limit and Embedding input size
MAX_LEN = 500  # number of token ids per sample after padding; Embedding input_length
EMBEDDING_DIM = 64  # size of each learned word vector in the Embedding layer

In [6]:
# Pull the cleaned texts and their sentiment labels out of the frame as arrays.
texts = df['ProcessedText'].to_numpy()
sentiments = df['Sentiment'].to_numpy()

In [7]:
# Fit a word-index tokenizer on the cleaned texts (word limit: VOCAB_SIZE) and
# convert each text into a sequence of integer word ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Bring every sequence to MAX_LEN ids, padding at the end.
# NOTE(review): the pad id is VOCAB_SIZE-1 instead of the default 0. With
# num_words=VOCAB_SIZE the tokenizer presumably still emits id VOCAB_SIZE-1 for
# a real word, so padding and that word would share one embedding row — confirm.
# If this is changed, any inference code that pads inputs must change with it.
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, value=VOCAB_SIZE-1, padding='post')

In [8]:
# Persist the fitted tokenizer in the same directory the model is saved to.
# NOTE(review): unpickling executes code — only load this file from a trusted source.
with open('../src/models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
from tensorflow.keras.utils import to_categorical

# The labels appear to be -1 / 0 / 1 (the sample rows show -1.0 and 1.0, and the
# model has 3 outputs). Passing -1 straight to to_categorical only worked
# through negative array indexing, which selects the last column. Make that
# mapping explicit and reject anything outside the three known labels, which
# would otherwise silently collide with another class.
# Column order is unchanged: 0 -> column 0, 1 -> column 1, -1 -> column 2.
label_values = sentiments.astype(int)
unexpected_labels = np.setdiff1d(label_values, [-1, 0, 1])
if unexpected_labels.size:
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels.tolist()}")

encoded_sentiments = to_categorical(np.mod(label_values, 3), num_classes=3)

In [10]:
# Hold out 15% of the samples as a test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    encoded_sentiments,
    test_size=0.15,
    random_state=42,
)

In [18]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation (and the
# shuffling done while training) is repeatable on a fresh run.
tf.keras.utils.set_random_seed(42)

# Word embedding -> per-position Dense -> flatten -> 3-way softmax.
model = Sequential([
        Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
        Dense(32, activation='relu'),  # applied to each of the MAX_LEN positions
        Flatten(),
        Dense(3, activation='softmax')  # 3 classes; column order follows the one-hot label encoding
    ])

In [22]:
# Adam optimiser with categorical cross-entropy, matching the one-hot targets;
# categorical accuracy is tracked as the only metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [23]:
# Print each layer's output shape and parameter count as a sanity check.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 64)           640000    
                                                                 
 dense_3 (Dense)             (None, 500, 32)           2080      
                                                                 
 flatten (Flatten)           (None, 16000)             0         
                                                                 
 dense_4 (Dense)             (None, 3)                 48003     
                                                                 
Total params: 690083 (2.63 MB)
Trainable params: 690083 (2.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
# Train for 10 epochs in batches of 32, with 15% of the training data set
# aside for validation.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.15,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2021f38c150>

In [25]:
# Score the trained model on the held-out test split. With the single metric
# given to compile, evaluate yields the loss followed by categorical accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

Test Loss: 0.4621, Test Accuracy: 0.8095


In [26]:
# Write the trained model to a single .h5 file.
# NOTE(review): the recorded output shows a warning raised from
# saving_api.save_model — presumably the legacy-HDF5 format notice; confirm.
# Switching format would change this path, so any loader would need updating too.
model.save('../src/models/sentiment_analysis_model.h5')

  saving_api.save_model(
