In [1]:
import pandas as pd
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Convert to Lowercase

In [2]:
df['lowercase'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment,lowercase
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"petter mattei's ""love in the time of money"" is..."


##Removal of Punctuations


In [3]:
import string
print(string.punctuation)
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [4]:
df['nopuncs'] = df['lowercase'].apply(lambda x:remove_punctuations(x))


In [5]:
df = df.drop(columns= ['review','lowercase'],axis= 1)

df.head()



Unnamed: 0,sentiment,nopuncs
0,positive,one of the other reviewers has mentioned that ...
1,positive,a wonderful little production br br the filmin...
2,positive,i thought this was a wonderful way to spend ti...
3,negative,basically theres a family where a little boy j...
4,positive,petter matteis love in the time of money is a ...


In [6]:
def remove_brbr(text):
    return text.replace("br br", "")

df['nopuncs'] = df['nopuncs'].apply(lambda x:remove_brbr(x))





In [7]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [8]:
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [9]:
df['nopuncs'] = df['nopuncs'].apply(lambda x:remove_stopwords(x))

REMOVAL OF FREQUENT WORDS

In [10]:
from collections import Counter
word_count = Counter()
for text in df['nopuncs']:
    for word in text.split():
        word_count[word] += 1

word_count.most_common(20)

[('movie', 85223),
 ('film', 76028),
 ('one', 51476),
 ('like', 39063),
 ('good', 28912),
 ('even', 24582),
 ('would', 24036),
 ('time', 23960),
 ('really', 23011),
 ('see', 22640),
 ('story', 22514),
 ('well', 19228),
 ('much', 19110),
 ('get', 18244),
 ('bad', 17989),
 ('great', 17934),
 ('also', 17857),
 ('people', 17710),
 ('first', 17206),
 ('dont', 16925)]

In [11]:
FREQUENT_WORDS = set(['movie','film','story'])

def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [12]:
df['nopuncs'] = df['nopuncs'].apply(lambda x:remove_freq_words(x))

REMOVAL OF RARE QWORDS

In [13]:
RARE_WORDS = set(word for (word,wc) in word_count.most_common()[:-10:-1])
print(RARE_WORDS)

{'studentsthe', 'wasamwill', 'jossi', 'ohsohard', 'ashknenazi', 'frenchonly', 'effortful', 'clatter', 'horriblecatwoman'}


In [14]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

df['nopuncs'] = df['nopuncs'].apply(lambda x:remove_rare_words(x))

REMOVAL OF SPECIAL CHARACTERS

In [15]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]',' ',text)
    text = re.sub('\s+',' ',text)
    return text

In [16]:
df['nopuncs'] = df['nopuncs'].apply(lambda x:remove_spl_chars(x))

LEMMATIZATION

In [17]:
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ,"R":wordnet.ADV} 





In [19]:
def lemmatize_words(text):
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word,wordnet_map.get(pos[0],wordnet.NOUN)) for word,pos in pos_text])

In [22]:
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rajga\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rajga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [23]:
df['nopuncs'] = df['nopuncs'].apply(lambda x: lemmatize_words(x))

df.head()

Unnamed: 0,sentiment,nopuncs
0,positive,one reviewer mention watch 1 oz episode youll ...
1,positive,wonderful little production film technique una...
2,positive,think wonderful way spend time hot summer week...
3,negative,basically there family little boy jake think t...
4,positive,petter matteis love time money visually stunni...


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['nopuncs'])


In [30]:
print("Shape of the TF-IDF matrix:", X.shape)
print("\nVocabulary (the learned features):")
print(tfidf.get_feature_names_out())

Shape of the TF-IDF matrix: (50000, 5000)

Vocabulary (the learned features):
['10' '100' '1000' ... 'zombie' 'zone' 'zoom']


In [31]:
from sklearn.model_selection import train_test_split
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)




In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
nb_model = MultinomialNB()

In [34]:
nb_model.fit(X_train, y_train)

In [35]:
y_pred = nb_model.predict(X_test)


In [36]:
from sklearn.metrics import classification_report, confusion_matrix

# --- Previous Step: The model is already trained ---
# nb_model.fit(X_train, y_train)

# --- This Step: Evaluation ---

# 1. Make predictions on the test set
y_pred = nb_model.predict(X_test)

# 2. Print the classification report
print("## Classification Report ##")
print(classification_report(y_test, y_pred))

# 3. Print the confusion matrix
print("\n## Confusion Matrix ##")
print(confusion_matrix(y_test, y_pred))

## Classification Report ##
              precision    recall  f1-score   support

    negative       0.86      0.85      0.85      4961
    positive       0.85      0.86      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


## Confusion Matrix ##
[[4203  758]
 [ 708 4331]]


In [37]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# --- Train an SVM Model ---
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
# Now evaluate svm_model just like you did before

# --- Train a Logistic Regression Model ---
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
# Now evaluate lr_model



In [38]:
y_pred_svm = svm_model.predict(X_test)

# 2. Print the classification report
print("## Classification Report ##")
print(classification_report(y_test, y_pred_svm))

# 3. Print the confusion matrix
print("\n## Confusion Matrix ##")
print(confusion_matrix(y_test, y_pred_svm))

## Classification Report ##
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.87      0.89      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000


## Confusion Matrix ##
[[4309  652]
 [ 541 4498]]


In [39]:
y_pred_lr = lr_model.predict(X_test)

# 2. Print the classification report
print("## Classification Report ##")
print(classification_report(y_test, y_pred_lr))

# 3. Print the confusion matrix
print("\n## Confusion Matrix ##")
print(confusion_matrix(y_test, y_pred_lr))

## Classification Report ##
              precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


## Confusion Matrix ##
[[4336  625]
 [ 500 4539]]


In [40]:
y_mine = lr_model.predict("good found very funny and cool")
print(y_mine)

ValueError: Expected 2D array, got scalar array instead:
array=good found very funny and cool.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.