In [48]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
     -------------------------------------- 181.2/181.2 kB 3.6 MB/s eta 0:00:00
Collecting python-slugify
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Collecting text-unidecode
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.2/78.2 kB 2.2 MB/s eta 0:00:00
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.7.4.5 python-slugify-8.0.4 text-unidecode-1.3



[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle, json

# ===============================
# 2. Load and Sample Dataset
# ===============================
df = pd.read_csv(
    r"C:\Users\rohan\OneDrive\Documents\training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None
) # header=none means dataset doesnt have a header row if it were 0 it wouldve meant 1st row is considered as header row
df.columns = ["target", "ids", "date", "flag", "user", "text"]

# Balancing the dataset for training(25k negative + 25k positive)
#groupby finds unique values in your dataset (here selected col target which has only 0 as neg and 4 as pos) and sorts those into groups 
#the apply function applies wtv method we input on those groups seperately, lambda is a short form of defining a func in python
#Pandas goes group by group (first all rows where target == 0, then all rows where target == 4). For each group, it passes that group’s dataframe into the function as x.
sample_df = df.groupby("target", group_keys=False).apply(
    lambda x: x.sample(25000, random_state=42)
)


# ===============================
# 3. Download NLTK Resources
# ===============================
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# ===============================
# 4. Text Preprocessing
# ===============================

# 4.1 Lowercase + Tokenization (here sample_df["text"] is a whole data series .lower cannot be applied to it so we need .str after to apply it to single string)
data = sample_df["text"].astype(str).str.lower().apply(word_tokenize)

# 4.2 Remove Stopwords + Punctuation
#“Get me all standard English stopwords from NLTK, and put them in a set for fast lookups.” set is unordered collection of unique objects
stop_words = set(stopwords.words("english")) 
data = data.apply(lambda tokens: [
    w for w in tokens if w not in stop_words and w not in string.punctuation
])

# 4.3 Lemmatization
lemmatizer = WordNetLemmatizer()
data = data.apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])


# ===============================
# 5. TF-IDF Vectorization
# ===============================
# Join tokens back into sentences
data_as_strings = data.apply(lambda tokens: " ".join(tokens))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_as_strings)
y = sample_df["target"]


# ===============================
# 6. Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ===============================
# 7. Train Logistic Regression
# ===============================
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("✅ Training complete.")
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# ===============================
# 8. Model Evaluation
# ===============================
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test set
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report (precision, recall, f1-score per class)
print("\nClassification Report:\n", cr = classification_report(y_test, y_pred, output_dict=True))

# Confusion Matrix
print("\nConfusion Matrix:\n", cm = confusion_matrix(y_test, y_pred).tolist())

import os

metrics = {"classification_report": cr, "confusion_matrix": cm}
with open(os.path.join("C:/Users/rohan/Sentiment_analysis_project", "metrics.json"), "w") as f:
    json.dump(metrics, f)

  sample_df = df.groupby("target", group_keys=False).apply(
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Training complete.
Train size: (40000, 56248)
Test size: (10000, 56248)
Accuracy: 0.7522

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.75      5000
           4       0.74      0.77      0.76      5000

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000


Confusion Matrix:
 [[3671 1329]
 [1149 3851]]


In [3]:
import joblib

# Save model
joblib.dump(model, "sentiment_model.pkl")

# Save vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [7]:
import streamlit as st
import pickle
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib


# Load trained model + vectorizer
model = joblib.load("sentiment_model.pkl")         # replace with your model path
vectorizer = joblib.load("tfidf_vectorizer.pkl")   # same vectorizer used during training

st.write("Twitter Sentiment Analysis with AI")
text = st.text_input("Enter your tweet: ")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text): 
    text = text.lower() # u wont need .astype(str).str here cuz its not a pandas series, st.text_input gives single string
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words and w not in string.punctuation]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]  # optional but good
    return " ".join(tokens)   # join back into a cleaned string for model input
if st.button("Analyse Sentiment"):
    if text != "":
        cleanedtext = preprocess_text(text)
        tfidf_cleanedtext=vectorizer.transform([cleanedtext]) # must be a list/array
        prediction = model.predict(tfidf_cleanedtext)[0]
        if prediction == 0:
            st.write("The sentiment is negative")
        else:
            st.write("The sentiment is positive")
        
    else: 
        st.write("Input your tweet again")


ModuleNotFoundError: No module named 'streamlit'

In [4]:
import os
print(os.getcwd())


C:\Users\rohan


In [5]:
import os
print(os.listdir())


['.cache', '.ccache', '.gitconfig', '.ipynb_checkpoints', '.ipython', '.jupyter', '.kaggle', '.keras', '.lesshst', '.local', '.matplotlib', '.sage', '.ssh', '.thumbnails', '.viminfo', '.vscode', '25.1.1', 'AIMLnotes.ipynb', 'AppData', 'Application Data', 'coco_labels.txt', 'code', 'Contacts', 'Cookies', 'Course_2.ipynb', 'CrossDevice', 'DocAuth', 'Documents', 'Downloads', 'Favorites', 'IntelGraphicsProfiles', 'Links', 'Local Settings', 'Music', 'My Documents', 'my-app', 'my-appnpm', 'myapprohan', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{21ccdea5-13bd-11f0-a9d2-de5d4462489a}.TM.blf', 'NTUSER.DAT{21ccdea5-13bd-11f0-a9d2-de5d4462489a}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{21ccdea5-13bd-11f0-a9d2-de5d4462489a}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'object_detection_model', 'OneDrive', 'pip', 'practiceapp', 'PrintHood', 'project1', 'Recent', 'rmdir', 'Saved Games', 'Searches', 'SendTo', 'sentimentanalysis.ipynb', 's