In [145]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipelines
from textblob import TextBlob

In [147]:
# read in data
df = pd.read_csv(r"C:\Users\USER\OneDrive\Documents\MY WORKS\CV\New-years-resolutions-DFE.csv", encoding='latin1')
df.head()

Unnamed: 0,other_topic,resolution_topics,gender,name,Resolution_Category,retweet_count,text,tweet_coord,tweet_created,tweet_date,tweet_id,tweet_location,tweet_state,user_timezone,tweet_region
0,"Read moore books, read less facebook.",Eat healthier,female,Dena_Marina,Health & Fitness,0.0,"#NewYearsResolution :: Read more books, No scr...",,12/31/14 10:48,12/31/14,5.50363e+17,Southern California,CA,Pacific Time (US & Canada),West
1,,Humor about Personal Growth and Interests Reso...,female,ninjagirl325,Humor,1.0,#NewYearsResolution Finally master @ZJ10 's pa...,,12/31/14 10:47,12/31/14,5.50363e+17,New Jersey,NJ,Central Time (US & Canada),Northeast
2,,Be More Confident,male,RickyDelReyy,Personal Growth,0.0,#NewYearsResolution to stop being so damn perf...,,12/31/14 10:46,12/31/14,5.50362e+17,Hollywood,CA,Eastern Time (US & Canada),West
3,Help More\nspread pet therapy|helping other,Other,male,CalmareNJ,Philanthropic,0.0,My #NewYearsResolution is to help my disabled ...,,12/31/14 10:45,12/31/14,5.50362e+17,Metro NYC,NY,,Northeast
4,,Be more positive,female,welovatoyoudemi,Personal Growth,0.0,#NewYearsResolution #2015Goals #2015bucketlist...,,12/31/14 10:44,12/31/14,5.50362e+17,"Pittsburgh, Pennsylvania",PA,Eastern Time (US & Canada),Northeast


In [149]:
# for stopwords.......... import library

import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [151]:
def preprocess(text):
    text = text.lower()
    # remove urls, mentions, hashtags, and non-letters
    text = re.sub(r"http\S+|@\S+|#\S+|[^a-z\s]", '', text)
    # remove 'rt' at the start
    text = re.sub(r'^rt[\s:]+', '', text)
    # tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [153]:
#  create sentiment column
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "positive"
    elif polarity == 0:
        return "neutral"
    else:
        return "negative"

df['sentiment'] = df['text'].apply(get_sentiment)

In [154]:
# creating new dataframe
df_cleaned = df[['text', 'sentiment']].copy()
df_cleaned['clean_text'] = df_cleaned['text'].apply(preprocess)


In [157]:
df.head()

Unnamed: 0,other_topic,resolution_topics,gender,name,Resolution_Category,retweet_count,text,tweet_coord,tweet_created,tweet_date,tweet_id,tweet_location,tweet_state,user_timezone,tweet_region,sentiment
0,"Read moore books, read less facebook.",Eat healthier,female,Dena_Marina,Health & Fitness,0.0,"#NewYearsResolution :: Read more books, No scr...",,12/31/14 10:48,12/31/14,5.50363e+17,Southern California,CA,Pacific Time (US & Canada),West,positive
1,,Humor about Personal Growth and Interests Reso...,female,ninjagirl325,Humor,1.0,#NewYearsResolution Finally master @ZJ10 's pa...,,12/31/14 10:47,12/31/14,5.50363e+17,New Jersey,NJ,Central Time (US & Canada),Northeast,neutral
2,,Be More Confident,male,RickyDelReyy,Personal Growth,0.0,#NewYearsResolution to stop being so damn perf...,,12/31/14 10:46,12/31/14,5.50362e+17,Hollywood,CA,Eastern Time (US & Canada),West,neutral
3,Help More\nspread pet therapy|helping other,Other,male,CalmareNJ,Philanthropic,0.0,My #NewYearsResolution is to help my disabled ...,,12/31/14 10:45,12/31/14,5.50362e+17,Metro NYC,NY,,Northeast,positive
4,,Be more positive,female,welovatoyoudemi,Personal Growth,0.0,#NewYearsResolution #2015Goals #2015bucketlist...,,12/31/14 10:44,12/31/14,5.50362e+17,"Pittsburgh, Pennsylvania",PA,Eastern Time (US & Canada),Northeast,neutral


In [159]:
df.head()

Unnamed: 0,other_topic,resolution_topics,gender,name,Resolution_Category,retweet_count,text,tweet_coord,tweet_created,tweet_date,tweet_id,tweet_location,tweet_state,user_timezone,tweet_region,sentiment
0,"Read moore books, read less facebook.",Eat healthier,female,Dena_Marina,Health & Fitness,0.0,"#NewYearsResolution :: Read more books, No scr...",,12/31/14 10:48,12/31/14,5.50363e+17,Southern California,CA,Pacific Time (US & Canada),West,positive
1,,Humor about Personal Growth and Interests Reso...,female,ninjagirl325,Humor,1.0,#NewYearsResolution Finally master @ZJ10 's pa...,,12/31/14 10:47,12/31/14,5.50363e+17,New Jersey,NJ,Central Time (US & Canada),Northeast,neutral
2,,Be More Confident,male,RickyDelReyy,Personal Growth,0.0,#NewYearsResolution to stop being so damn perf...,,12/31/14 10:46,12/31/14,5.50362e+17,Hollywood,CA,Eastern Time (US & Canada),West,neutral
3,Help More\nspread pet therapy|helping other,Other,male,CalmareNJ,Philanthropic,0.0,My #NewYearsResolution is to help my disabled ...,,12/31/14 10:45,12/31/14,5.50362e+17,Metro NYC,NY,,Northeast,positive
4,,Be more positive,female,welovatoyoudemi,Personal Growth,0.0,#NewYearsResolution #2015Goals #2015bucketlist...,,12/31/14 10:44,12/31/14,5.50362e+17,"Pittsburgh, Pennsylvania",PA,Eastern Time (US & Canada),Northeast,neutral


In [161]:
# Create a new dataframe with only the 3 columns
df_cleaned = df[['text', 'sentiment']].copy()
df_cleaned['clean_text'] = df['text'].apply(preprocess)

# Preview
print(df_cleaned.head())


                                                text sentiment  \
0  #NewYearsResolution :: Read more books, No scr...  positive   
1  #NewYearsResolution Finally master @ZJ10 's pa...   neutral   
2  #NewYearsResolution to stop being so damn perf...   neutral   
3  My #NewYearsResolution is to help my disabled ...  positive   
4  #NewYearsResolution #2015Goals #2015bucketlist...   neutral   

                                          clean_text  
0  read books scrolling fbchecking email b breakf...  
1                   finally master part kitchen sink  
2                                     stop damn perf  
3  help disabled patients discover emotional phys...  
4                                          continued  


In [163]:
df['sentiment'].value_counts()

sentiment
positive    2936
neutral     1352
negative     723
Name: count, dtype: int64

In [165]:
df_cleaned.head()

Unnamed: 0,text,sentiment,clean_text
0,"#NewYearsResolution :: Read more books, No scr...",positive,read books scrolling fbchecking email b breakf...
1,#NewYearsResolution Finally master @ZJ10 's pa...,neutral,finally master part kitchen sink
2,#NewYearsResolution to stop being so damn perf...,neutral,stop damn perf
3,My #NewYearsResolution is to help my disabled ...,positive,help disabled patients discover emotional phys...
4,#NewYearsResolution #2015Goals #2015bucketlist...,neutral,continued


In [167]:
df_cleaned.replace({'sentiment': {'positive': 2, 'negative': 1, 'neutral': 0}}, inplace=True)

  df_cleaned.replace({'sentiment': {'positive': 2, 'negative': 1, 'neutral': 0}}, inplace=True)


In [169]:
df_cleaned.head()

Unnamed: 0,text,sentiment,clean_text
0,"#NewYearsResolution :: Read more books, No scr...",2,read books scrolling fbchecking email b breakf...
1,#NewYearsResolution Finally master @ZJ10 's pa...,0,finally master part kitchen sink
2,#NewYearsResolution to stop being so damn perf...,0,stop damn perf
3,My #NewYearsResolution is to help my disabled ...,2,help disabled patients discover emotional phys...
4,#NewYearsResolution #2015Goals #2015bucketlist...,0,continued


In [171]:
#split data into training data and test data
train_data, test_data = train_test_split(df_cleaned, test_size=0.2, random_state=42)

In [173]:
print(train_data.shape)
print(test_data.shape)

(4008, 3)
(1003, 3)


In [243]:
# Tokenize test data
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(train_data['text'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=50)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=50)

In [245]:
print(X_train)

[[   0    0    0 ...  130 2802  106]
 [   0    0    0 ...   17   19 2804]
 [   0    0    0 ...  405   28  208]
 ...
 [   0    0    0 ...  308   96   22]
 [   0    0    0 ...   26  118   91]
 [   0    0    0 ...   29 1073 8600]]


In [247]:
print(X_test)

[[   0    0    0 ...  300   24  617]
 [   0    0    0 ...   72  222   77]
 [   0    0    0 ...   40 2460    1]
 ...
 [   0    0    0 ...    3  330    1]
 [   0    0    0 ...   21   17   19]
 [   0    0    0 ...  123   99  182]]


In [249]:
Y_train = train_data['sentiment']
Y_test = test_data['sentiment']

In [251]:
print(Y_train)

1973    2
3666    0
831     1
239     1
4692    2
       ..
4426    2
466     0
3092    2
3772    2
860     0
Name: sentiment, Length: 4008, dtype: int64


In [269]:
# LSTM - Long Short-Term Memory

# build the model
model = Sequential()
model.add(Embedding(input_dim=50000, output_dim=128))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

In [271]:
# model summary
model.build(input_shape=(None, 50))
model.summary()

In [301]:
#compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [277]:
# training the model
# early stop to avoid over-fitting

early_stop = EarlyStopping(patience=3, restore_best_weights=True)

model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stop])

Epoch 1/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 162ms/step - accuracy: 0.8693 - loss: 0.3727 - val_accuracy: 0.7793 - val_loss: 0.6069
Epoch 2/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 152ms/step - accuracy: 0.9351 - loss: 0.2037 - val_accuracy: 0.7830 - val_loss: 0.6389
Epoch 3/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 174ms/step - accuracy: 0.9660 - loss: 0.1126 - val_accuracy: 0.7918 - val_loss: 0.6049
Epoch 4/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 188ms/step - accuracy: 0.9828 - loss: 0.0610 - val_accuracy: 0.8067 - val_loss: 0.6783
Epoch 5/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 187ms/step - accuracy: 0.9869 - loss: 0.0441 - val_accuracy: 0.8092 - val_loss: 0.7450


<keras.src.callbacks.history.History at 0x1623d064770>

In [307]:
# load tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [309]:
# model Evaluation
loss, accuracy = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.7996 - loss: 0.6076
Test Loss: 0.6075773239135742
Test Accuracy: 0.7996011972427368


In [315]:
# save the trained model

from tensorflow.keras.models import load_model

model = load_model('sentiment_model.h5')




In [321]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [317]:
df_cleaned.head()

Unnamed: 0,text,sentiment,clean_text
0,"#NewYearsResolution :: Read more books, No scr...",2,read books scrolling fbchecking email b breakf...
1,#NewYearsResolution Finally master @ZJ10 's pa...,0,finally master part kitchen sink
2,#NewYearsResolution to stop being so damn perf...,0,stop damn perf
3,My #NewYearsResolution is to help my disabled ...,2,help disabled patients discover emotional phys...
4,#NewYearsResolution #2015Goals #2015bucketlist...,0,continued


In [327]:
# predict and evaluate using confusion matrix
#Predict probabilities for each class
y_pred_probs = model.predict(X_test)

#Get predicted class indices (highest probability)
y_pred_classes = np.argmax(y_pred_probs, axis=1)

#Compute confusion matrix
cm = confusion_matrix(Y_test, y_pred_classes)
print("Confusion Matrix:\n", cm)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
Confusion Matrix:
 [[217  14  37]
 [ 41  76  41]
 [ 40  28 509]]


In [363]:
# Performance metrics
target_names = ['Negative', 'Neutral', 'Positive']
report = classification_report(Y_test, y_pred_classes, target_names=target_names)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    Negative       0.73      0.81      0.77       268
     Neutral       0.64      0.48      0.55       158
    Positive       0.87      0.88      0.87       577

    accuracy                           0.80      1003
   macro avg       0.75      0.72      0.73      1003
weighted avg       0.79      0.80      0.79      1003



In [331]:
# Building a Predictive System
def predict_sentiment(review):
    # tokenize and pad the review
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=50)  
    prediction = model.predict(padded_sequence)
    class_idx = prediction.argmax(axis=-1)[0]

    if class_idx == 0:
        sentiment = 'negative'
    elif class_idx == 1:
        sentiment = 'neutral'
    else:
        sentiment = 'positive'
    return sentiment

In [353]:
# example usage
tweet = 'What a new year i will work hard.'
sentiment = predict_sentiment(tweet)
print(f'The sentiment of the tweet is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
The sentiment of the tweet is: positive


In [347]:
tweet = 'I want to be lazy and spend money lavishly without working'
sentiment = predict_sentiment(tweet)
print(f'The sentiment of the review is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
The sentiment of the review is: negative


In [339]:
tweet = 'I want to study in the new year.'
sentiment = predict_sentiment(tweet)
print(f'The sentiment of the tweet is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
The sentiment of the review is: positive


In [351]:
tweet = 'No overspending this new year, I will do a lot of savings.'
sentiment = predict_sentiment(tweet)
print(f'The sentiment of the tweet is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
The sentiment of the review is: positive


In [361]:
tweet = 'I might set some goals for the new year and work on it.'
sentiment = predict_sentiment(tweet)
print(f'The sentiment of the tweet is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
The sentiment of the tweet is: positive
