In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
! pip install kaggle



In [None]:
# Copy the Kaggle API token into ~/.kaggle/.
# Assumes kaggle.json is present in the current working directory — TODO confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the token readable and writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
replace training.1600000.processed.noemoticon.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd

# The CSV is read as ISO-8859-1 and has no header row, so the six column
# names are supplied directly to the reader.
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
    header=None,
    names=["target", "id", "date", "flag", "user", "text"],
)
df.head()

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads.
nltk.download('stopwords')

In [None]:
# Show the English stop-word list that will be filtered out of the tweets.
print(stopwords.words('english'))

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Class distribution of the raw target labels.
df['target'].value_counts()

In [None]:
# Recode the label 4 as 1 in the target column; all other values are kept.
df['target'] = df['target'].replace({4: 1})

In [None]:
# Class distribution after the 4 -> 1 recoding.
df['target'].value_counts()

In [None]:
# Porter stemmer instance shared by the stemming() helper.
port_stem=PorterStemmer()

In [None]:
# Built once instead of per word: stopwords.words('english') returns a fresh
# list on every call, and membership tests on a frozenset are O(1).
_ENGLISH_STOPWORDS=frozenset(stopwords.words('english'))
_NON_LETTERS=re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise a tweet into a space-separated string of stemmed words.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stop words are
  dropped, and each remaining word is reduced with the Porter stemmer.

  Args:
    content: raw tweet text (str).

  Returns:
    The stemmed words joined by single spaces; an empty string when no
    word survives the filtering.
  """
  words=_NON_LETTERS.sub(' ',content).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS)

In [None]:
# Run stemming() on every tweet text and store the result in a new column.
# The helper is called once per row in Python, so this is the slow step.
df['stemmed_content']=df['text'].apply(stemming)

In [None]:
# Preview the frame, now including the stemmed_content column.
df.head()

In [None]:
# Inspect the stemmed tweet texts.
print(df['stemmed_content'])

In [None]:
# Inspect the target labels.
print(df['target'])

In [None]:
# Features are the stemmed tweet texts, labels are the recoded targets; both
# are pulled out of the frame as NumPy arrays.
X, Y = df['stemmed_content'].values, df['target'].values

In [None]:
# Inspect the feature array (stemmed texts).
print(X)

In [None]:
# Inspect the label array.
print(Y)

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and seeded so that it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Sizes of the full, training and test sets.
print(X.shape,X_train.shape,X_test.shape)

In [None]:
# Inspect the training texts (still strings at this point).
print(X_train)

In [None]:
# Inspect the test texts (still strings at this point).
print(X_test)

In [None]:
# Turn the stemmed texts into TF-IDF features. The vocabulary and IDF weights
# are learned from the training texts only; the test texts are transformed
# with that fitted vectorizer.
# NOTE(review): X_train / X_test are overwritten with the vectorized output,
# so this cell is not idempotent — re-running it feeds the already-vectorized
# data back into the vectorizer.
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [None]:
# Inspect the vectorized training features.
print(X_train)

In [None]:
# Inspect the vectorized test features.
print(X_test)

In [None]:
# Logistic-regression classifier with the iteration cap set to 1000
# (presumably so the solver can converge on this data — TODO confirm).
model=LogisticRegression(max_iter=1000)

In [None]:
# Train the classifier on the vectorized training set.
model.fit(X_train,Y_train)

In [None]:
# Accuracy on the data the model was trained on.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)

In [None]:
# Report the training-set accuracy.
# NOTE(review): the message misspells "training" as "trainning".
print('Accuracy score on the trainning data:', training_data_accuracy)

In [None]:
# Accuracy on the held-out test data.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)

In [None]:
# Report the held-out accuracy. The label previously said "training data"
# although the value printed is the test-set accuracy.
print('Accuracy score on the test data:',test_data_accuracy)

In [None]:
import pickle

# Persist the fitted classifier to the working directory. The context manager
# closes the file handle even if pickling fails.
# NOTE(review): only the classifier is saved; the fitted TfidfVectorizer is
# not, yet it is needed to turn new text into features for this model.
filename='trained_model.sav'
with open(filename,'wb') as model_file:
  pickle.dump(model,model_file)

In [None]:
# Reload the classifier from the same relative path it was written to, rather
# than a hard-coded '/content/...' path that only exists when the working
# directory is /content. The context manager closes the file handle.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('trained_model.sav','rb') as model_file:
  loaded_model=pickle.load(model_file)

In [None]:
# Spot-check the reloaded model on one held-out sample: print the true label,
# then the predicted label and its human-readable meaning.
X_new=X_test[200]
print(Y_test[200])
prediction=loaded_model.predict(X_new)
print(prediction)
print('Negative Tweet' if prediction[0]==0 else 'Positive Tweet')

In [None]:
# Parse the raw date strings into datetimes; values that cannot be parsed
# become NaT because of errors="coerce".
# NOTE(review): no explicit format is given, so parsing relies on pandas'
# inference — confirm the result is not mostly NaT.
df["date"]=pd.to_datetime(df["date"],errors="coerce")

In [None]:
# Daily tweet counts per sentiment class.
# `df` has no "sentiment" column, so the labels are derived from `target`:
# 0 is negative; positive is 4 in the raw file and 1 after the earlier recoding.
sentiment_labels = (
    df["target"]
    .map({0: "Negative", 1: "Positive", 4: "Positive"})
    .rename("sentiment")
)

# normalize() truncates each timestamp to midnight while keeping a datetime
# index, which time-based operations such as resample() require.
tweet_day = df["date"].dt.normalize()

sentiment_trend = (
    df.groupby([tweet_day, sentiment_labels])
    .size()
    .unstack(fill_value=0)
    # Guarantee both columns exist even if one class has no rows.
    .reindex(columns=["Negative", "Positive"], fill_value=0)
    .sort_index()
)

sentiment_trend.head()

In [None]:
# Daily number of tweets per sentiment class, one line per class.
plt.figure(figsize=(14, 6))
for series_name, line_color in (("Positive", "green"), ("Negative", "red")):
    plt.plot(
        sentiment_trend.index,
        sentiment_trend[series_name],
        label=series_name,
        color=line_color,
    )

plt.title("Sentiment Trends Over Time")
plt.xlabel("Date")
plt.ylabel("Number of Tweets")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Weekly totals of the daily sentiment counts.
# resample() only works on a datetime-like index, so the index is converted
# first (it holds plain date objects when it was built from `.dt.date`).
daily_counts = sentiment_trend.copy()
daily_counts.index = pd.to_datetime(daily_counts.index)
sentiment_weekly = daily_counts.resample("W").sum()

plt.figure(figsize=(14,6))
plt.plot(sentiment_weekly.index, sentiment_weekly["Positive"], label="Positive", color="green")
plt.plot(sentiment_weekly.index, sentiment_weekly["Negative"], label="Negative", color="red")
plt.title("Weekly Sentiment Trends")
plt.xlabel("Week")
plt.ylabel("Number of Tweets")
plt.legend()
plt.grid(True)
plt.show()
