In [1]:
# 📌 1. Install required packages
!pip install pandas scikit-learn nltk joblib

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
      --------------------------------------- 0.0/1.5 MB 640.0 kB/s eta 0:00:03
      --------------------------------------- 0.0/1.5 MB 259.2 kB/s eta 0:00:06
      --------------------------------------- 0.0/1.5 MB 259.2 kB/s eta 0:00:06
      --------------------------------------- 0.0/1.5 MB 259.2 kB/s eta 0:00:06
      --------------------------------------- 0.0/1.5 MB 259.2 kB/s eta 0:00:06
      ------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# 📌 2. Import libraries
import os
import sys
import random
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib


In [3]:
# 📌 3. Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\SUSHIL
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SUSHIL
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 📌 4. Define Preprocessing Function (from your code)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)               # remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text)         # keep only alphabets
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)



In [5]:
# 📌 5. Load IMDb dataset
def load_reviews_from_dir(directory, label):
    data = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), encoding="utf-8") as f:
            text = f.read()
            data.append((text, label))
    return data

pos_dir = r"C:\Users\SUSHIL KUMAR\Desktop\dvp_final\datasets\imdb\train\pos"
neg_dir = r"C:\Users\SUSHIL KUMAR\Desktop\dvp_final\datasets\imdb\train\neg"

train_data = load_reviews_from_dir(pos_dir, 1) + load_reviews_from_dir(neg_dir, 0)
random.shuffle(train_data)

train_df = pd.DataFrame(train_data, columns=["review", "label"])

# Apply preprocessing
train_df["clean"] = train_df["review"].apply(preprocess_text)

print("Sample cleaned review:\n", train_df["clean"].iloc[0])


Sample cleaned review:
 second coming suzanne yet another one surrealistic film try come across extremely sophisticated yet put viewer sleep like movie type limited dialogue everyone much interested visual aesthetic shot however cinematography stink nothing keep attention video box state film one exciting visual adventure ever seen film yeah sure right watching stimulating event bucket ice melting turtle walking mile


In [6]:
# 📌 6. Vectorize (TF-IDF) & Train Logistic Regression
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(train_df["clean"])
y = train_df["label"]

model = LogisticRegression(max_iter=200)
model.fit(X, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [7]:
# 📌 8. Quick test
sample_text = "I really loved this movie, it was amazing and inspiring!"
cleaned = preprocess_text(sample_text)
vectorized = tfidf.transform([cleaned])
pred = model.predict(vectorized)[0]
print("Prediction:", "Positive" if pred == 1 else "Negative")


Prediction: Positive


In [8]:
# 📌 1. Install required packages (if not already installed)
!pip install scikit-learn nltk joblib

# 📌 2. Import libraries
import os
import random
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# 📌 3. Download NLTK resources (first run only)
nltk.download("stopwords")
nltk.download("wordnet")

# 📌 4. Define preprocessing function
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)          # remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text)     # keep only alphabets
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# 📌 5. Load IMDb dataset
def load_reviews_from_dir(directory, label):
    data = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), encoding="utf-8") as f:
            text = f.read()
            data.append((text, label))
    return data

pos_dir = r"C:\Users\SUSHIL KUMAR\Desktop\dvp_final\datasets\imdb\train\pos"
neg_dir = r"C:\Users\SUSHIL KUMAR\Desktop\dvp_final\datasets\imdb\train\neg"

train_data = load_reviews_from_dir(pos_dir, 1) + load_reviews_from_dir(neg_dir, 0)
random.shuffle(train_data)

train_df = pd.DataFrame(train_data, columns=["review", "label"])

# Apply preprocessing
train_df["clean"] = train_df["review"].apply(preprocess_text)

print("✅ Sample cleaned review:\n", train_df["clean"].iloc[0])

# 📌 6. Features and Labels
X = train_df["clean"]
y = train_df["label"]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 📌 7. TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 📌 8. Logistic Regression Model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# 📌 9. Predictions
y_pred = clf.predict(X_test_tfidf)

# 📌 10. Evaluation
print("\n📊 Sentiment Analysis Results:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




[nltk_data] Downloading package stopwords to C:\Users\SUSHIL
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SUSHIL
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Sample cleaned review:
 bobby goofy kid smile far much want sex buy van aid quest acting lame comedy pathetic script loosely strung chain clich cheap thrill maker film obviously wanted capture craziness film time fell long way short even resort bobby slipping banana skin supposedly add comedic value struggling find redeeming feature film like devito another classic devito kind role supporting actor clich value

📊 Sentiment Analysis Results:
Accuracy : 0.8766
Precision: 0.8611430763329497
Recall   : 0.898
F1-score : 0.8791854317603289

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.86      0.87      2500
           1       0.86      0.90      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [10]:
# 📌 11. Save Model & Vectorizer
import os, joblib

os.makedirs("models", exist_ok=True)

joblib.dump(clf, "models/sentiment_model.joblib")
joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")

print("✅ Sentiment model and TF-IDF vectorizer saved in models/ folder")


✅ Sentiment model and TF-IDF vectorizer saved in models/ folder
