In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [22]:

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [23]:
#for DEP instructor place all the files in the same directory as the notebook

# Load and preprocess data
train_df = pd.read_csv('twitter_training.csv', header=None, names=['id', 'topic', 'sentiment', 'text'])
validation_df = pd.read_csv('twitter_validation.csv', header=None, names=['id', 'topic', 'sentiment', 'text'])

train_df.head(10)

Unnamed: 0,id,topic,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [24]:
validation_df.head(10)

Unnamed: 0,id,topic,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
5,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
6,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...
7,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: ..."
8,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...
9,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the wor...


In [25]:
#combining
df = pd.concat([train_df, validation_df], ignore_index=True)
df.head(11)

Unnamed: 0,id,topic,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [26]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
ps

<PorterStemmer>

In [27]:
# Data cleaning

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)
# using apply to call the function
df['processed_text'] = df['text'].apply(preprocess_text)
borderlands_df = df[df['topic'] == 'Borderlands']

In [28]:
# Prepare features and labels
X = borderlands_df['processed_text']
y = borderlands_df['sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:

# Vectorize the text
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
X_test_vectorized
X_train_vectorized

<1855x2988 sparse matrix of type '<class 'numpy.int64'>'
	with 21421 stored elements in Compressed Sparse Row format>

In [30]:

# Train and evaluate Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)
nb_pred = nb_model.predict(X_test_vectorized)
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_report = classification_report(y_test, nb_pred, output_dict=True)
nb_report

{'Irrelevant': {'precision': 1.0,
  'recall': 0.9534883720930233,
  'f1-score': 0.9761904761904762,
  'support': 43.0},
 'Negative': {'precision': 0.9743589743589743,
  'recall': 0.9156626506024096,
  'f1-score': 0.9440993788819876,
  'support': 83.0},
 'Neutral': {'precision': 0.9545454545454546,
  'recall': 0.8467741935483871,
  'f1-score': 0.8974358974358975,
  'support': 124.0},
 'Positive': {'precision': 0.8936170212765957,
  'recall': 0.9813084112149533,
  'f1-score': 0.9354120267260579,
  'support': 214.0},
 'accuracy': 0.9310344827586207,
 'macro avg': {'precision': 0.9556303625452562,
  'recall': 0.9243084068646934,
  'f1-score': 0.9382844448086047,
  'support': 464.0},
 'weighted avg': {'precision': 0.9342014521306524,
  'recall': 0.9310344827586207,
  'f1-score': 0.9305962584586706,
  'support': 464.0}}

In [31]:
# Train and evaluate Logistic Regression model
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_vectorized, y_train)
lr_pred = lr_model.predict(X_test_vectorized)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred, output_dict=True)
lr_report

{'Irrelevant': {'precision': 0.9761904761904762,
  'recall': 0.9534883720930233,
  'f1-score': 0.9647058823529412,
  'support': 43.0},
 'Negative': {'precision': 0.9868421052631579,
  'recall': 0.9036144578313253,
  'f1-score': 0.9433962264150944,
  'support': 83.0},
 'Neutral': {'precision': 0.9824561403508771,
  'recall': 0.9032258064516129,
  'f1-score': 0.9411764705882353,
  'support': 124.0},
 'Positive': {'precision': 0.9137931034482759,
  'recall': 0.9906542056074766,
  'f1-score': 0.9506726457399103,
  'support': 214.0},
 'accuracy': 0.9482758620689655,
 'macro avg': {'precision': 0.9648204563131968,
  'recall': 0.9377457104958595,
  'f1-score': 0.9499878062740452,
  'support': 464.0},
 'weighted avg': {'precision': 0.950992178350156,
  'recall': 0.9482758620689655,
  'f1-score': 0.9481337678338605,
  'support': 464.0}}

In [32]:

# Print model performance
print("\n--- Model Performance ---")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f} or {nb_accuracy*100:.2f}%")
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f} or {lr_accuracy*100:.2f}% \n\n")



--- Model Performance ---
Naive Bayes Accuracy: 0.9310 or 93.10%
Logistic Regression Accuracy: 0.9483 or 94.83% 




In [33]:
# Use Logistic Regression for sentiment prediction on all data
X_all_vectorized = vectorizer.transform(borderlands_df['processed_text'])
borderlands_df.loc[:, 'lr_sentiment'] = lr_model.predict(X_all_vectorized)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [34]:

# Visualize sentiment distribution
fig = make_subplots(rows=1, cols=2, subplot_titles=('Original Sentiment Distribution', 'Logistic Regression Predicted Sentiment'))

for i, col in enumerate(['sentiment', 'lr_sentiment'], start=1):
    counts = borderlands_df[col].value_counts().sort_index()
    fig.add_trace(
        go.Bar(x=counts.index, y=counts.values, name=col),
        row=1, col=i
    )
    fig.update_xaxes(title_text="Sentiment", row=1, col=i)
    fig.update_yaxes(title_text="Count", row=1, col=i)

fig.update_layout(height=500, width=1000, title_text=" Sentiment Distribution Comparison")
fig.show()

In [35]:

# Visualize model performance
metrics = ['precision', 'recall', 'f1-score']
sentiments = ['Positive', 'Negative', 'Neutral']

fig = make_subplots(rows=1, cols=3, subplot_titles=metrics)

for i, metric in enumerate(metrics, start=1):
    nb_values = [nb_report[sentiment][metric] for sentiment in sentiments]
    lr_values = [lr_report[sentiment][metric] for sentiment in sentiments]

    fig.add_trace(
        go.Bar(x=sentiments, y=nb_values, name='Naive Bayes'),
        row=1, col=i
    )
    fig.add_trace(
        go.Bar(x=sentiments, y=lr_values, name='Logistic Regression'),
        row=1, col=i
    )

    fig.update_xaxes(title_text="Sentiment", row=1, col=i)
    fig.update_yaxes(title_text=metric.capitalize(), row=1, col=i)

fig.update_layout(height=500, width=1200, title_text="Model Performance Comparison", barmode='group')
fig.show()