In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk

In [3]:
# Fetch the NLTK data this notebook relies on: the stop-word lists (read when
# building stop_words) and the 'punkt' data for word_tokenize.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version before a fresh run.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramzy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramzy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
# Load the raw dataset. header=None means the file's first line is treated as
# data, so columns are addressed by integer position (0..5) further down.
df = pd.read_csv('sentiment.csv', encoding='latin-1', header=None)
df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [37]:
# Column 5 holds the tweet text that gets cleaned and tokenized below.
texts = df[5].to_numpy()
# Binary target from column 0: the code 4 becomes 1, every other value becomes 0.
labels = (df[0] == 4).astype('int64').to_numpy()

In [38]:
# English stop words as a set, used for membership checks inside clean_text.
stop_words = set(stopwords.words('english'))

In [39]:
def clean_text(text):
    """Normalise one raw tweet into a space-joined string of content words.

    Steps, in order: lowercase; strip URLs; strip @mentions and '#' signs;
    delete every character that is not an ASCII letter or whitespace;
    tokenize with word_tokenize; drop tokens found in ``stop_words``.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV field) is treated as
            an empty tweet instead of raising AttributeError.

    Returns:
        str: The surviving tokens joined by single spaces; '' when nothing
        is left.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # 'http\S+' already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    # NOTE(review): apostrophes are removed here, before the stop-word check,
    # so a contraction like "don't" reaches the filter as "dont" and is only
    # dropped if that exact spelling is in stop_words.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

In [40]:
# Run every raw tweet through clean_text, keeping the original order.
cleaned_texts = list(map(clean_text, texts))

In [58]:
# Spot-check: show the first cleaned tweet.
cleaned_texts[0]

'thats bummer shoulda got david carr third day'

In [42]:
# Build the word -> integer index from the cleaned corpus and encode every
# tweet as a list of indices. num_words=10000 is the same value passed as
# input_dim to the Embedding layer in the model definition, so keep them in sync.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(cleaned_texts)
sequences = tokenizer.texts_to_sequences(cleaned_texts)

In [45]:
import numpy as np
# maxlen is the fixed sequence length; it is reused for model.build and inside
# predict_sentiment, so the three must stay in agreement.
maxlen = 100
# Bring every encoded tweet to exactly maxlen entries.
X = pad_sequences(sequences, maxlen=maxlen)
# Targets as a NumPy array, aligned row-for-row with X.
y = np.array(labels)

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [63]:
# Binary sentiment classifier: embedding -> dropout -> LSTM -> sigmoid unit.
# The Embedding's `input_length` argument is deprecated in Keras 3 (it only
# triggers a warning there), so it is omitted; the sequence length is fixed by
# the explicit model.build(input_shape=(None, maxlen)) call that follows.
model = Sequential([
    # input_dim must match the num_words cap given to the Tokenizer.
    Embedding(input_dim=10000, output_dim=128),
    Dropout(0.5),
    # return_sequences=False: only the final LSTM state feeds the classifier.
    LSTM(64, return_sequences=False),
    # Single sigmoid output, paired with binary_crossentropy at compile time.
    Dense(1, activation="sigmoid"),
])

In [64]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for reporting.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
# Build with (batch, maxlen) integer inputs so model.summary() can be shown before training.
model.build(input_shape=(None, maxlen))

In [66]:
# Print the layer-by-layer summary of the built model.
model.summary()

In [67]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# Train for 3 epochs, scoring the held-out split after each one.
# The recorded run below took roughly 2000 s per epoch at batch_size=32.
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/3
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1947s[0m 49ms/step - accuracy: 0.7682 - loss: 0.4788 - val_accuracy: 0.7921 - val_loss: 0.4411
Epoch 2/3
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2066s[0m 52ms/step - accuracy: 0.7961 - loss: 0.4360 - val_accuracy: 0.7952 - val_loss: 0.4378
Epoch 3/3
[1m40000/40000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2016s[0m 50ms/step - accuracy: 0.8012 - loss: 0.4277 - val_accuracy: 0.7964 - val_loss: 0.4375


<keras.src.callbacks.history.History at 0x219194447a0>

In [69]:
# Score the trained model on the held-out split and report both metrics.
loss, accuracy = model.evaluate(X_val, y_val)
print(
    f'Validation Loss: {loss}',
    f'Validation Accuracy: {accuracy}',
    sep='\n',
)

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 14ms/step - accuracy: 0.7969 - loss: 0.4363
Validation Loss: 0.4374820291996002
Validation Accuracy: 0.7963906526565552


In [71]:
# Persist the trained model to disk.
# NOTE(review): only the model is saved here; the fitted tokenizer / word_index
# needed to encode new text is not written anywhere in this notebook.
model.save('sentiment_analysis.h5')



# Testing the model

In [83]:
# Word -> integer mapping from the fitted tokenizer, passed to predict_sentiment below.
word_index = tokenizer.word_index

In [85]:
from tensorflow import keras 
def predict_sentiment(text, word_index, model, num_words=10000):
    """Classify one piece of raw text as "Positive" or "Negative".

    The text is encoded the same way the training corpus was: it is passed
    through clean_text, split into words, and each word is looked up in
    ``word_index``. Words that are missing from ``word_index`` or whose index
    is ``>= num_words`` are dropped, which is what
    ``Tokenizer(num_words=...).texts_to_sequences`` does when no oov_token is
    set. Previously unknown words were mapped to index 2 (a real vocabulary
    entry) and indices beyond the Embedding's input_dim were passed through.

    Args:
        text: Raw input text.
        word_index: Mapping of word -> integer index from the fitted tokenizer.
        model: Trained model exposing ``predict``.
        num_words: Vocabulary cap used at training time; must equal the
            Tokenizer's ``num_words`` and the Embedding's ``input_dim``.

    Returns:
        str: "Positive" if the predicted probability is above 0.5,
        otherwise "Negative".
    """
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    words = text_to_word_sequence(clean_text(text))
    token_ids = [
        word_index[word]
        for word in words
        if word in word_index and word_index[word] < num_words
    ]
    # maxlen is the notebook-level sequence length used for the training data.
    sequence = pad_sequences([token_ids], maxlen=maxlen)
    prediction = model.predict(sequence)[0][0]

    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment


In [86]:
# Smoke check with an obviously positive sentence.
print(predict_sentiment("This  was fantastic!", word_index, model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Positive


In [89]:
# Smoke check with an obviously negative sentence.
print(predict_sentiment("This was terrible!", word_index, model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Negative
