In [2]:
# Sentiment Analysis using Twitter Dataset
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud,STOPWORDS

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SOBIKUL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SOBIKUL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
load = pd.read_csv('twitter_training.csv', header=None, encoding='utf-8').sample(70000)
load.columns = ['id', 'entity', 'sentiment', 'text']

load = load[load['sentiment'].isin(['Positive', 'Negative', 'Neutral'])]
df = load[['sentiment', 'text']]


In [4]:
print(df.isna().sum())
print(df.info())
print(f"Before duplicate value: {df.duplicated().sum()}")
df.head(6)

sentiment      0
text         530
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 57808 entries, 192 to 32288
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  57808 non-null  object
 1   text       57278 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB
None
Before duplicate value: 3794


Unnamed: 0,sentiment,text
192,Neutral,i enter that gunner seat and i fear for my life
18561,Neutral,"Everyone’s wondering bc it “looks for Xbox”, b..."
56666,Neutral,Next. There was an amazing rainbow... Miss all...
11226,Positive,@TheGreat_Keeno they’re here . I can’t wait
37019,Positive,Its game day for Microsoft and.. 7 years of wa...
37558,Neutral,Wow needs a huge spectate button for just like...


In [5]:
# Check the duplicate value & reomve the duplicate value 
before = load.shape[0]
load = df.drop_duplicates(subset=['text', 'sentiment'])
after = load.shape[0]

print(f"Before duplicate value: {before} rows")
print(f"After remove duplicate value: {after} rows")
print(f"total reomve duplicate value : {before - after} values")

Before duplicate value: 57808 rows
After remove duplicate value: 54014 rows
total reomve duplicate value : 3794 values


In [6]:
#fill the text NaN value in the dataset 
df['text'] =df['text'].fillna('unknown')


In [7]:
from wordcloud import STOPWORDS
import plotly.graph_objects as go
import numpy as np
from collections import Counter
positive_texts = list(df[df["sentiment"] == "Positive"]["text"])
def plot_3d_wordcloud(text_list, title="3D WordCloud", top_n=20, color='orange', seed=42):
    """
    text_list: list of text data (e.g., from df['text'])
    title: plot title
    top_n: number of top frequent words to show
    color: word color
    seed: random seed for reproducibility
    """
    all_words = " ".join(text_list).lower().split()

    # stopwords 
    filtered_words = [word for word in all_words if word not in STOPWORDS]

    word_counts = Counter(filtered_words).most_common(top_n)
    if not word_counts:
        print(" No valid words to display.")
        return

    words, freqs = zip(*word_counts)

    np.random.seed(seed)
    x, y, z = np.random.rand(3, len(words))

    max_freq = max(freqs)
    sizes = [20 + (f / max_freq) * 30 for f in freqs]

    # 3D Plot 
    fig = go.Figure()

    for i, word in enumerate(words):
        fig.add_trace(go.Scatter3d(
            x=[x[i]], y=[y[i]], z=[z[i]],
            mode='text',
            text=[word],
            textfont=dict(size=sizes[i], color=color)
        ))

    fig.update_layout(
        title=title,
        scene=dict(
            xaxis=dict(showbackground=False),
            yaxis=dict(showbackground=False),
            zaxis=dict(showbackground=False)
        ),
        margin=dict(l=0, r=0, t=40, b=0)
    )

    fig.show()
plot_3d_wordcloud(positive_texts, title="3D WordCloud for Positive Reviews")

In [8]:

Negative_texts = list(df[df["sentiment"] == "Negative"]["text"])
plot_3d_wordcloud(Negative_texts, title="3D WordCloud for Negative Reviews")

In [9]:
Neutral_texts = list(df[df["sentiment"] == "Neutral"]["text"])
plot_3d_wordcloud(Neutral_texts, title="3D WordCloud for Neutral Reviews")

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())  # Lemmatize
    return text

df['clean_text'] = df['text'].apply(clean_text)

In [11]:
df.head(10)

Unnamed: 0,sentiment,text,clean_text
192,Neutral,i enter that gunner seat and i fear for my life,enter gunner seat fear life
18561,Neutral,"Everyone’s wondering bc it “looks for Xbox”, b...",everyones wondering bc look xbox nobody mentio...
56666,Neutral,Next. There was an amazing rainbow... Miss all...,next amazing rainbow miss
11226,Positive,@TheGreat_Keeno they’re here . I can’t wait,theyre cant wait
37019,Positive,Its game day for Microsoft and.. 7 years of wa...,game day microsoft 7 year waiting entire gener...
37558,Neutral,Wow needs a huge spectate button for just like...,wow need huge spectate button like hearthstone...
18767,Neutral,I just earned an [Halls Against Devotion] Achi...,earned hall devotion achievement
11788,Negative,Bruh @Ronnie2K it @NBA2K who the... fuck car i...,bruh fuck car
13387,Negative,@ NBA2K y'all a corporate joke. Couldn't make ...,nba2k yall corporate joke couldnt make decent ...
47472,Positive,I love that my Home Depot leadership is being ...,love home depot leadership exemplar quote expr...


In [12]:
df["sentiment_encoded"] = df["sentiment"].replace({
    "Positive":0,
    "Negative":1,
    "Neutral":2
})

In [13]:
df.head()

Unnamed: 0,sentiment,text,clean_text,sentiment_encoded
192,Neutral,i enter that gunner seat and i fear for my life,enter gunner seat fear life,2
18561,Neutral,"Everyone’s wondering bc it “looks for Xbox”, b...",everyones wondering bc look xbox nobody mentio...,2
56666,Neutral,Next. There was an amazing rainbow... Miss all...,next amazing rainbow miss,2
11226,Positive,@TheGreat_Keeno they’re here . I can’t wait,theyre cant wait,0
37019,Positive,Its game day for Microsoft and.. 7 years of wa...,game day microsoft 7 year waiting entire gener...,0


In [14]:
x = df["clean_text"]
y = df["sentiment_encoded"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [15]:
# print("x_train shap",x_train.shape)
# print("y_train:",y_train.shape)
# print("y_test:",y_test.shape)
# print("x_test:",x_test.shape)

In [16]:
# print(df["sentiment_encoded"].value_counts(normalize=True) * 100)


In [17]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=40000,min_df=3,max_df=0.85,stop_words="english")
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [18]:
# print("x_train-tfidf",x_train_tfidf.shape)
# print("x_test_tfidf:",x_test_tfidf.shape)

In [19]:
# print("x_train+tf",x_train_tfidf)

In [20]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42,sampling_strategy="auto",k_neighbors=5,n_jobs=1)
x_train_sm,y_train_sm = smote.fit_resample(x_train_tfidf,y_train)

In [21]:

model = SVC()
para_distib = {
    "kernel": ['linear', 'rbf', 'poly', 'sigmoid'],
    "C": [0.01, 0.1, 1, 2, 5],
    "gamma": ['scale', 'auto', 0.01, 0.1, 1],
    "degree": [2, 3, 4],  # Only used if kernel='poly'
    "class_weight": ['balanced', None],
    "max_iter": [2000, 5000],
    "shrinking": [True, False],
    "probability": [True]  # Enables probability estimates
    }

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=para_distib,
    n_iter=10,              
    cv=3,                   
    scoring='accuracy',     
    verbose=1,
    n_jobs=-1,             
    random_state=42
)
random_search.fit(x_train_sm, y_train_sm)
y_pred = random_search.predict(x_test_tfidf)
print("\n Accuracy on test set:", accuracy_score(y_test, y_pred))
print("\n Classification Report:")
print(classification_report(y_test, y_pred))
print("confusion Matrix:")
print(confusion_matrix(y_test,y_pred))



Fitting 3 folds for each of 10 candidates, totalling 30 fits

 Accuracy on test set: 0.893011589690365

 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      3894
           1       0.97      0.86      0.91      4181
           2       0.78      0.97      0.86      3487

    accuracy                           0.89     11562
   macro avg       0.90      0.90      0.89     11562
weighted avg       0.91      0.89      0.89     11562

confusion Matrix:
[[3336   73  485]
 [  90 3615  476]
 [  73   40 3374]]


In [22]:
import pickle

with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(random_search.best_estimator_, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
