# Task 2: Text Sentiment Analysis
This notebook demonstrates sentiment analysis using the IMDB (or similar) dataset with preprocessing, TF-IDF feature extraction, Logistic Regression model training, and evaluation.

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# NLTK resources download
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Dataset load
df = pd.read_csv('IMDB Dataset.csv', nrows=10000)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [7]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase conversion
    text = text.lower()
    # HTML tags removal
    text = re.sub(r'<.*?>', '', text)
    # Non-alphabetic characters removal
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenization
    words = text.split()
    # Stopwords removal and lemmatization
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing
df['clean_review'] = df['review'].apply(preprocess_text)
df


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
...,...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive,fun entertaining movie wwii german spy julie a...
9996,Give me a break. How can anyone say that this ...,negative,give break anyone say good hockey movie know m...
9997,This movie is a bad movie. But after watching ...,negative,movie bad movie watching endless series bad ho...
9998,This is a movie that was probably made to ente...,negative,movie probably made entertain middle school ea...


In [10]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_review'])

In [11]:
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])  # 'positive' -> 1, 'negative' -> 0

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [14]:
# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.84      0.86       996
           1       0.85      0.89      0.87      1004

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



In [22]:
def predict_sentiment(review):
    processed = preprocess_text(review)
    vectorized = tfidf.transform([processed])
    prediction = model.predict(vectorized)
    return le.inverse_transform(prediction)[0]

# Example usage
print(predict_sentiment("I absolutely hate this movie but om shanti om is better!"))


negative
