In [1]:
import pandas as pd
import nltk
import string
import re 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('IMDB Dataset.csv')
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Basic Information about the dataset

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     500 non-null    object
 1   sentiment  500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [5]:
data.shape

(500, 2)

# Text Preprocessing:

Text Preprocessing Steps
 1. Remove special characters and numbers
 2. Convert all text to lower case
 3. Remove stop words (common words like 'the', 'and', etc.)
 4. Lemmatize words (reduce words to their base form)


In [6]:
data['sentiment'].value_counts()

sentiment
negative    263
positive    237
Name: count, dtype: int64

In [7]:
data.dropna(inplace=True)

In [8]:
data['review'][433]

'In 1929, director Walt Disney and animator Ub Iwerks changed the face of animation with the release of the very first installment of their "Silly Symphonies" series, "The Skeleton Dance". Iwerks and Disney had been collaborating together since the early 20s, in Disney\'s "Laugh-O-Gram" cartoon series; however, their friendship suffered a tremendous blow when Iwerks accepted an offer by a competitor to leave Disney and start his own animation studio. That was the birth of Celebrity Productions, where Iwerks continued developing his style and technique (and where he created the character of Flip the Frog). While his work kept the same high quality, it wasn\'t really popular and by 1936 the studio was closed. Later that year, Iwerks was hired by Columbia Pictures, and Iwerks decided to return to his old skeletons for another dance, this time in color.1937\'s "Skeleton Frolics" is essentially, a remake of the 1929 classic "The Skeleton Dance", the movie that borough him fame and fortune. 

In [9]:
# Map sentiment to binary values directly
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Display the first few rows
print(data['sentiment'].head())


0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64


In [10]:
data.sample(5)

Unnamed: 0,review,sentiment
368,One of my favorite movies which has been overl...,1
258,Black comedy isn't always an easy sell. Every ...,1
384,Pathetic attempt to use science to justify new...,0
10,Phil the Alien is one of those quirky films wh...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [12]:
stopwords_set = set(stopwords.words('english'))
emoji_pattern = re.compile('(?::|;|=)(?:-)?(?:\)|\(|D|P)')

def preprocessing(text):
    text = re.sub('<[^>]*>', '', text)
    emojis = emoji_pattern.findall(text)
    text = re.sub('[\W+]', ' ', text.lower()) + ' '.join(emojis).replace('-', '')

    prter = PorterStemmer()
    text = [prter.stem(word) for word in text.split() if word not in stopwords_set]

    return " ".join(text)

  emoji_pattern = re.compile('(?::|;|=)(?:-)?(?:\)|\(|D|P)')
  text = re.sub('[\W+]', ' ', text.lower()) + ' '.join(emojis).replace('-', '')


In [13]:
data['review'] = data['review'].apply(lambda x: preprocessing(x))


In [14]:
data['review'][89]

'hollywood movi industri laziest one entir world need singl hit flood theater old crap invent take superhero exampl x man spiderman daredevil elektra ghost rider hulk japanes horror remak even worst took ring pitch perfect mostli mr gore verbinski bring ton look alik creepi woman base horror e g ring 2 eye dark water fine pointless grudg first grudg entir bad scari way one could expect plu plot brain mix narr grudg 2 exactli like previou could good thing hey boy men black ii nice thing xerox entir screenplay chang villain grudg 2 critic goe way tire scare bad act except amber tamblyn cliché place three stori take place differ place time aubrey tamblyn investig drove sister karen sarah michel gellar death allison ariel kebbel taken colleagu visit hous incid depict first movi took place final american famili wit strang stuff happen apart next door glad say mean everyth tie end one must reli end make good pictur everyth els simpli tiresom dull chill girl alon locker someon enter hous othe

# Feature Engineering

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,use_idf=True,norm='l2',smooth_idf=True)
y=data['sentiment']
x=tfidf.fit_transform(data['review'])

# Model Training

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.5,shuffle=False)

In [17]:
from sklearn.linear_model import LogisticRegressionCV
clf=LogisticRegressionCV(cv=6,scoring='accuracy',random_state=0,n_jobs=-1,verbose=3,max_iter=500).fit(X_train,y_train)
y_pred = clf.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    4.9s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    5.0s finished


In [18]:
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.724


In [19]:
import pickle
model = pickle.dump(clf,open('clf.pkl','wb'))
vectorizer= pickle.dump(tfidf,open('tfidf.pkl','wb'))

In [23]:
def prediction(comment):
    preprocessed_comment = preprocessing(comment)
    comment_list = [preprocessed_comment]  # Wrap the preprocessed comment in a list
    comment_vector = tfidf.transform(comment_list)
    prediction = clf.predict(comment_vector)[0]
    return prediction



prediction = prediction('it is excellent movie')

In [24]:
if prediction == 1:
    print("positive comment")
else:
    print("negative comment")
    

positive comment


# Model Evaluation

In [22]:
from sklearn.metrics import classification_report

# True labels , 1 = Positive, 0 = Negative
y_test = [1, 0, 1, 0, 1]  

# Model predictions
y_pred = [1, 0, 1, 1, 1]  

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.75      1.00      0.86         3

    accuracy                           0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5

