## Introduction

In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install kaggle



In [None]:
# Copy the Kaggle API token from the working directory into ~/.kaggle
# (presumably where the kaggle CLI looks for credentials — confirm).
# Expects kaggle.json to be present in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other


In [None]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


## Data Collection

In [None]:
# Load the extracted CSV into a DataFrame and preview the first ten rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## Data Analysis

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(418)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(50000, 2)

In [None]:
# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

In [None]:
# Re-check the duplicate count after dropping duplicates.
df.duplicated().sum()

np.int64(0)

In [None]:
# Separate the review text (input) from the sentiment label (target).
X, y = df['review'], df['sentiment']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the string sentiment labels into integer class ids.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Inspect the encoded label array.
y

array([1, 1, 1, ..., 0, 0, 0])

## Text Cleaning: Remove HTML Tags, URLs, and Punctuation; Convert to Lowercase

In [None]:
import pandas as pd
import re
import string

# Patterns and the punctuation table are built once here instead of on every
# call; clean_text runs once per review.
_HTML_TAG_RE = re.compile(r'<.*?>')
# \b anchors the match to the start of a token and the scheme/"www." prefix is
# required in full, so ordinary words that merely contain "www" or "http"
# (e.g. "awwww") are no longer truncated.
_URL_RE = re.compile(r'\b(?:https?://|www\.)\S+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one review string for vectorization.

    Steps, in order: replace HTML tags with a space, replace URLs with a
    space, delete ASCII punctuation, lowercase, then collapse runs of
    whitespace and trim the ends.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased text with single spaces between tokens.
    """
    text = _HTML_TAG_RE.sub(' ', text)   # tags become a space so neighbours stay separate
    text = _URL_RE.sub(' ', text)
    # Punctuation is deleted (not replaced), so "there's" becomes "theres".
    text = text.translate(_PUNCT_TABLE)
    text = text.lower()
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every review. Iterating the column directly avoids DataFrame.iterrows(),
# which builds a full Series object for each row and is slow at this size.
cleaned_reviews = [clean_text(review) for review in df['review']]

# Store the cleaned reviews back in the 'review' column; the list is in row
# order, so it lines up positionally with the DataFrame.
df['review'] = cleaned_reviews

# Display the first few rows
print(df[['review','sentiment']].head())

                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production the filming tech...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically theres a family where a little boy j...  negative
4  petter matteis love in the time of money is a ...  positive


## Tokenization

In [None]:
# Whitespace-split each cleaned review into a list of word tokens.
df['tokens'] = df['review'].apply(str.split)

In [None]:
# Preview the token lists for the first few reviews.
print(df['tokens'].head())

0    [one, of, the, other, reviewers, has, mentione...
1    [a, wonderful, little, production, the, filmin...
2    [i, thought, this, was, a, wonderful, way, to,...
3    [basically, theres, a, family, where, a, littl...
4    [petter, matteis, love, in, the, time, of, mon...
Name: tokens, dtype: object


In [None]:
pip install nltk



In [None]:
import nltk
from nltk.stem import PorterStemmer
# NOTE(review): word_tokenize is imported but no visible cell calls it; the
# tokens column is built with str.split().
from nltk.tokenize import word_tokenize
# Download the 'punkt' tokenizer data (presumably only needed by word_tokenize — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Stemming

In [None]:
ps = PorterStemmer()


def stem_tokens(tokens):
    """Return a new list with the Porter stem of each token, in the same order."""
    return list(map(ps.stem, tokens))


# Stem the token lists stored in the 'tokens' column.
df['stemmed_tokens'] = df['tokens'].apply(stem_tokens)


In [None]:
# Preview the stemmed token lists for the first few reviews.
print(df['stemmed_tokens'].head())

0    [one, of, the, other, review, ha, mention, tha...
1    [a, wonder, littl, product, the, film, techniq...
2    [i, thought, thi, wa, a, wonder, way, to, spen...
3    [basic, there, a, famili, where, a, littl, boy...
4    [petter, mattei, love, in, the, time, of, mone...
Name: stemmed_tokens, dtype: object


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the cleaned reviews and build the
# per-review term-count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])

# First 10 entries of the learned vocabulary.
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary[:10])

# (number of reviews, vocabulary size)
print(X.shape)


['00' '000' '0000000000001' '00000001' '000001' '0001' '00015' '001'
 '0010' '002']
(49582, 221252)


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: same vocabulary-building step as a count vectorizer, but each term
# count is reweighted by inverse document frequency.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['review'])

# (number of reviews, number of unique words)
print(X_tfidf.shape)

# First 10 vocabulary entries (vocabulary order, not a frequency ranking).
tfidf_vocabulary = tfidf.get_feature_names_out()
print(tfidf_vocabulary[:10])


(49582, 221252)
['00' '000' '0000000000001' '00000001' '000001' '0001' '00015' '001'
 '0010' '002']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score # Changed accuracy_scor to accuracy_score

In [None]:
# Features and target
X = df['review']                     # Cleaned review text (vectorized after the split)
y = df['sentiment']                  # Target column ('positive' or 'negative')

# Split into training and testing (80/20) BEFORE fitting TF-IDF. Fitting the
# vectorizer on the full corpus would let the held-out reviews influence the
# vocabulary and IDF weights (data leakage) and inflate the test score.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training reviews only, then apply
# that fitted transform unchanged to the test reviews. `tfidf` is rebound so
# the name refers to the vectorizer whose feature space the model is trained on.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(reviews_train)
X_test = tfidf.transform(reviews_test)


## Model Development

In [None]:
# Fit a logistic-regression classifier on the training features.
# max_iter is set to 1000 — presumably so the solver has room to converge on
# the high-dimensional text features (confirm).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Predict sentiment labels for the held-out test features.
y_pred = model.predict(X_test)


In [None]:
# Compute the metrics first, then print them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.8922053040233942

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4939
    positive       0.88      0.91      0.89      4978

    accuracy                           0.89      9917
   macro avg       0.89      0.89      0.89      9917
weighted avg       0.89      0.89      0.89      9917

