In [None]:

import kagglehub
# Fetch the Sentiment140 dataset through kagglehub; the call's return value is
# kept under a name that marks it as the dataset's local path.
kazanova_sentiment140_path = kagglehub.dataset_download('kazanova/sentiment140')

print('Data source import complete.')


In [None]:
import html
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import stats

In [None]:
# Build the CSV location from the directory kagglehub returned instead of a
# hard-coded /kaggle/input path, so the notebook also runs outside Kaggle.
# The file has no header row (header=None) and is read as latin1.
data = pd.read_csv(
    os.path.join(kazanova_sentiment140_path, "training.1600000.processed.noemoticon.csv"),
    encoding="latin1",
    header=None,
)

In [None]:
# (rows, columns) of the raw frame as loaded.
data.shape

In [None]:
# Preview the first rows; the CSV was read with header=None, so columns are not named yet.
data.head()

In [None]:
# Assign names to the six positional columns of the raw frame, then preview it.
columns = ["sentiment", "ids", "date", "flag", "user", "tweet"]
data.columns= columns
data.head()

In [None]:
# Summary of the raw frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Work on an independent copy holding only the text and its label,
# leaving the raw frame untouched.
df = data.loc[:, ["tweet", "sentiment"]].copy()
df.head()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import STOPWORDS

# Make sure the NLTK stop-word corpus is available before it is read.
nltk.download('stopwords')

# English stop words plus Twitter / HTML-entity residue that survives
# punctuation stripping: "&amp;" -> "amp", "&lt;" -> "lt", "&gt;" -> "gt",
# "&quot;" -> "quot", and the retweet marker "rt".
stop_words = set(stopwords.words('english'))
stop_words.update(["amp", "rt", "lt", "gt", "quot"])

stemmer = PorterStemmer()

# Patterns are compiled once because clean_tweet is applied to every row.
_URL_RE = re.compile(r"https?://\S+")
_MENTION_OR_HASH_RE = re.compile(r"@\w+|#")
# Anything that is not a letter: punctuation, digits and the underscore
# (which \w would otherwise let through).
_NON_LETTER_RE = re.compile(r"[^\w\s]|[\d_]")


def clean_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: decode HTML entities, lower-case, remove URLs,
    @mentions and '#' signs, drop stop words, strip punctuation, digits and
    underscores from each token, then stem what is left.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example NaN from a missing
        cell) is treated as an empty tweet.

    Returns
    -------
    str
        Cleaned text; an empty string when no token survives.
    """
    if not isinstance(tweet, str):
        return ""

    # Decode entities such as "&amp;" / "&quot;" so they become punctuation
    # that is stripped below instead of leaking in as pseudo-words.
    text = html.unescape(tweet).lower()
    text = _URL_RE.sub("", text)
    text = _MENTION_OR_HASH_RE.sub("", text)

    kept = []
    for token in text.split():
        # Test the stop list while inner apostrophes are still present:
        # entries like "don't" can never match once punctuation is removed.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = _NON_LETTER_RE.sub("", token)
        # A token made only of punctuation/digits collapses to "" here.
        if word and word not in stop_words:
            kept.append(stemmer.stem(word))
    return " ".join(kept)

# Run the cleaner element-wise over the raw text and keep the result
# alongside the original tweet.
df["clean_tweet"] = df["tweet"].map(clean_tweet)
df.head()

In [None]:
# Translate the numeric label codes into readable class names.
sentiment_names = {0: "Negative", 4: "Positive"}
df["sentiment"] = df["sentiment"].replace(sentiment_names)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Pin the class order so each colour always lands on the intended label.
# value_counts() orders by frequency, which does not guarantee that
# "Negative" comes first (and is arbitrary when the classes tie).
sentiment_counts = (
    df["sentiment"]
    .value_counts()
    .reindex(["Negative", "Positive"], fill_value=0)
)

sentiment_counts.plot(kind="pie",
                      autopct='%1.1f%%',
                      pctdistance=0.85,
                      startangle=90,
                      colors=["lightcoral", "lightgreen"],
                      wedgeprops={'edgecolor': 'black'})

plt.title('Sentiment Distribution')
plt.axis('equal')
# Reuse the same ordered index so legend entries match the wedges.
plt.legend(labels=sentiment_counts.index,
           loc='upper right', fontsize=9)
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate the cleaned tweets of each class into one long string;
# negative_words is consumed by the following cell.
is_positive = df["sentiment"] == "Positive"
is_negative = df["sentiment"] == "Negative"
positive_words = " ".join(df.loc[is_positive, "clean_tweet"])
negative_words = " ".join(df.loc[is_negative, "clean_tweet"])

# Word cloud for the positive class.
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(positive_words)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
cloud_ax.set_title("Most Frequent Words in Positive Tweets")
plt.show()

In [None]:
# Word cloud for the negative class, built from the text joined earlier.
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(negative_words)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
cloud_ax.set_title("Most Frequent Words in Negative Tweets")
plt.show()

In [None]:
# Number of whitespace-separated tokens left in each cleaned tweet.
df["tweet_length"] = df["clean_tweet"].str.split().str.len()
df.head()

In [None]:
# Tweets that were reduced to nothing by cleaning, shown next to their
# original text for inspection.
is_empty = df["tweet_length"] == 0
zero_len = df.loc[is_empty, ["tweet", "clean_tweet", "tweet_length"]]
display(zero_len.shape)
zero_len.head()

In [None]:
# Drop tweets that are empty after cleaning, then keep only the first
# occurrence of each cleaned text. The index is reset last so the final
# frame has a contiguous 0..n-1 index (resetting before the de-duplication
# would leave gaps where duplicates were removed).
df = (
    df[df["tweet_length"] != 0]
    .drop_duplicates(subset=['clean_tweet'], keep='first')
    .reset_index(drop=True)
)
display(df.shape)

In [None]:
# Mean token count per sentiment class, laid out as a single "mean" row
# with one column per class.
tweet_len = (
    df.groupby("sentiment")["tweet_length"]
    .mean()
    .to_frame(name="mean")
    .T
)
tweet_len

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top row: one token-count histogram per sentiment class.
histogram_panels = [
    ("Negative", "lightcoral", axes[0, 0], "Negative Tweet Length Distribution"),
    ("Positive", "lightgreen", axes[0, 1], "Positive Tweet Length Distribution"),
]
for label, colour, panel, title in histogram_panels:
    lengths = df.loc[df["sentiment"] == label, "tweet_length"]
    lengths.hist(color=colour, edgecolor="black", ax=panel, bins=30)
    panel.set_title(title)

# Bottom-left: the per-class mean lengths side by side.
tweet_len.plot(kind="bar", color = ("lightcoral", "lightgreen"), ax = axes[1, 0] )
axes[1, 0].set_title("Positive VS Negative")

# The fourth panel is unused, so remove it from the figure.
fig.delaxes(axes[1,1])

plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts, capped at the 100k most frequent terms.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
cv = CountVectorizer(max_features=100_000)
X = cv.fit_transform(df["clean_tweet"])

# Size of the learned vocabulary.
len(cv.vocabulary_)

In [None]:
# Encode the class names as integers for the classifier.
label_to_int = {"Negative": 0, "Positive": 1}
y = df["sentiment"].map(label_to_int)

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts; random_state=0 makes the
# shuffle repeatable. No test_size is passed, so scikit-learn's default applies.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state =0)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale each feature by its standard deviation only; with_mean=False skips
# centring. The scaler is fitted on the training part and merely applied to
# the test part.
scaler = StandardScaler(with_mean=False)
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# 'saga' shuffles the data while optimising, so seed it (same seed as the
# train/test split) to make the fitted model reproducible across runs.
model = LogisticRegression(max_iter=500, solver='saga', random_state=0)
model.fit(x_train_scaled, y_train)

In [None]:
# Predicted class for every row of the scaled test features; the trailing
# expression displays the resulting array.
y_pred = model.predict(x_test_scaled)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Share of test tweets classified correctly, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.4f}%')

# Counts of true vs. predicted classes; reused by the heatmap cell.
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

# Per-class precision / recall / F1 summary.
class_report = classification_report(y_test, y_pred)
print('Classification Report:', class_report, sep='\n')

In [None]:
# Label the axes with the actual classes. The target was encoded as
# Negative -> 0 and Positive -> 1, and confusion_matrix orders rows/columns
# by sorted label, so index 0 is Negative and index 1 is Positive.
class_names = ["Negative", "Positive"]

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix Visualization")
plt.show()

> **Now, let's proceed and predict the sentiment of some sentences from our own input.**

In [None]:
# Two hand-written sentences, passed through the same cleaning step that
# was used for the training text.
example_tweets = ["I am very happy today", "lol, i have depression"]
sent = pd.DataFrame({"tweet": example_tweets})
sent["clean_tweet"] = sent["tweet"].map(clean_tweet)
sent

In [None]:
# Vectorise and scale the examples with the already-fitted transformers,
# then classify them (0 = Negative, 1 = Positive per the label encoding).
pre = scaler.transform(cv.transform(sent["clean_tweet"]))
predict_sent = model.predict(pre)
predict_sent