# **Importing**

In [175]:
import os
import pandas as pd
import string
from textblob import TextBlob
import spacy
from collections import Counter
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re

In [176]:
train_dir = "/kaggle/input/aclimdb/aclImdb/train"
test_dir = "/kaggle/input/aclimdb/aclImdb/test"

In [177]:
def load_data(data_dir):
    reviews = []
    sentiments = []
    for sentiment in ['pos', 'neg']:
        sentiment_dir = os.path.join(data_dir, sentiment)
        for filename in os.listdir(sentiment_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(sentiment_dir, filename), 'r', encoding='utf-8') as file:
                    reviews.append(file.read())
                    sentiments.append(1 if sentiment == 'pos' else 0)
    return pd.DataFrame({'review': reviews, 'sentiment': sentiments})

In [178]:
train_data = load_data(train_dir)
test_data = load_data(test_dir)

In [179]:
print("Training Data:")
print(train_data.head())

print("\nTesting Data:")
print(test_data.head())

Training Data:
                                              review  sentiment
0  This was one of those wonderful rare moments i...          1
1  Have you seen The Graduate? It was hailed as t...          1
2  I don't watch a lot of TV, except for The Offi...          1
3  Kubrick again puts on display his stunning abi...          1
4  First of all, I liked very much the central id...          1

Testing Data:
                                              review  sentiment
0  I've Seen The Beginning Of The Muppet Movie, B...          1
1  If it had been made 2 years later it would hav...          1
2  Very good "Precoder" starring Dick Barthelmess...          1
3  A young man discovers that life is precious af...          1
4  I'm always surprised, given that the famous ti...          1


In [180]:
print("Data type of train_df:", type(train_data))
print("Data type of test_df:", type(test_data))


Data type of train_df: <class 'pandas.core.frame.DataFrame'>
Data type of test_df: <class 'pandas.core.frame.DataFrame'>


In [181]:
print("Columns in train_data:", train_data.columns)
print("Columns in test_data:", test_data.columns)

Columns in train_data: Index(['review', 'sentiment'], dtype='object')
Columns in test_data: Index(['review', 'sentiment'], dtype='object')


# **Check for Missing values**

In [182]:
print("Missing values in training data:")
print(train_data.isnull().sum())

print("Missing values in testing data:")
print(test_data.isnull().sum())

Missing values in training data:
review       0
sentiment    0
dtype: int64
Missing values in testing data:
review       0
sentiment    0
dtype: int64


# **Conversion to Lower Case**

In [183]:
train_data['review'][5]

"It was a bit bizarre and evil and i enjoyed it a lot, the characters in the show were great as well, and complimented one another well. I was sorry to see it cut off.. I would have loved to see where it could have went.You found yourself leaning toward Lucas Buck the sheriff who had more secrets than anyone. Lucas was frightening and alluring. And I would have liked to have seen more of him and how his character became. I will however buy the show just to enjoy, it was great to something different on TV. And Paige Turrco who was Caleb's cousin, she was a big mystery as to where and what she meant to Lucas. Its a shame it isn't around still.. or was never finished, i would have loved to see what would have happened."

In [184]:
train_data['review'] = train_data['review'].str.lower()
test_data['review'] = test_data['review'].str.lower()

print("Training data after convering to lower case")
train_data.head()

print("Test data after converting to lower case")
test_data.head()

Training data after convering to lower case
Test data after converting to lower case


Unnamed: 0,review,sentiment
0,"i've seen the beginning of the muppet movie, b...",1
1,if it had been made 2 years later it would hav...,1
2,"very good ""precoder"" starring dick barthelmess...",1
3,a young man discovers that life is precious af...,1
4,"i'm always surprised, given that the famous ti...",1


In [185]:
train_data['review'][5]

"it was a bit bizarre and evil and i enjoyed it a lot, the characters in the show were great as well, and complimented one another well. i was sorry to see it cut off.. i would have loved to see where it could have went.you found yourself leaning toward lucas buck the sheriff who had more secrets than anyone. lucas was frightening and alluring. and i would have liked to have seen more of him and how his character became. i will however buy the show just to enjoy, it was great to something different on tv. and paige turrco who was caleb's cousin, she was a big mystery as to where and what she meant to lucas. its a shame it isn't around still.. or was never finished, i would have loved to see what would have happened."

# **Removing HTML Tags**

In [186]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [187]:
train_data['review'] = train_data['review'].apply(remove_html_tags)
test_data['review'] = test_data['review'].apply(remove_html_tags)

In [188]:
train_data.head()

Unnamed: 0,review,sentiment
0,this was one of those wonderful rare moments i...,1
1,have you seen the graduate? it was hailed as t...,1
2,"i don't watch a lot of tv, except for the offi...",1
3,kubrick again puts on display his stunning abi...,1
4,"first of all, i liked very much the central id...",1


# **Removing Punctuation Marks**

In [189]:
exclude = string.punctuation
print(exclude)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [190]:
def removePunc(text):
    return text.translate(str.maketrans('','',exclude))

In [191]:
removePunc(train_data['review'][5])

'it was a bit bizarre and evil and i enjoyed it a lot the characters in the show were great as well and complimented one another well i was sorry to see it cut off i would have loved to see where it could have wentyou found yourself leaning toward lucas buck the sheriff who had more secrets than anyone lucas was frightening and alluring and i would have liked to have seen more of him and how his character became i will however buy the show just to enjoy it was great to something different on tv and paige turrco who was calebs cousin she was a big mystery as to where and what she meant to lucas its a shame it isnt around still or was never finished i would have loved to see what would have happened'

In [192]:
train_data['review']=train_data['review'].apply(removePunc)
test_data['review']=test_data['review'].apply(removePunc)

In [193]:
train_data.head()

Unnamed: 0,review,sentiment
0,this was one of those wonderful rare moments i...,1
1,have you seen the graduate it was hailed as th...,1
2,i dont watch a lot of tv except for the office...,1
3,kubrick again puts on display his stunning abi...,1
4,first of all i liked very much the central ide...,1


In [194]:
test_data.head()

Unnamed: 0,review,sentiment
0,ive seen the beginning of the muppet movie but...,1
1,if it had been made 2 years later it would hav...,1
2,very good precoder starring dick barthelmess w...,1
3,a young man discovers that life is precious af...,1
4,im always surprised given that the famous titl...,1


# **Removing Extra whitespace from reviews**

In [195]:
train_data['review'] = train_data['review'].str.strip()
test_data['review'] = test_data['review'].str.strip()

In [196]:
train_data.head()

Unnamed: 0,review,sentiment
0,this was one of those wonderful rare moments i...,1
1,have you seen the graduate it was hailed as th...,1
2,i dont watch a lot of tv except for the office...,1
3,kubrick again puts on display his stunning abi...,1
4,first of all i liked very much the central ide...,1


In [197]:
test_data.head()

Unnamed: 0,review,sentiment
0,ive seen the beginning of the muppet movie but...,1
1,if it had been made 2 years later it would hav...,1
2,very good precoder starring dick barthelmess w...,1
3,a young man discovers that life is precious af...,1
4,im always surprised given that the famous titl...,1


# **Tokenizing, Removing stop words and Lemmatization**

In [198]:
spacy.require_gpu()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [199]:
def tokenize_reviews(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_space]

In [200]:
train_data['review_tokens'] = train_data['review'].apply(tokenize_reviews)
test_data['review_tokens'] = test_data['review'].apply(tokenize_reviews)

In [201]:
train_data['review_tokens'][5]

['it',
 'was',
 'a',
 'bit',
 'bizarre',
 'and',
 'evil',
 'and',
 'i',
 'enjoyed',
 'it',
 'a',
 'lot',
 'the',
 'characters',
 'in',
 'the',
 'show',
 'were',
 'great',
 'as',
 'well',
 'and',
 'complimented',
 'one',
 'another',
 'well',
 'i',
 'was',
 'sorry',
 'to',
 'see',
 'it',
 'cut',
 'off',
 'i',
 'would',
 'have',
 'loved',
 'to',
 'see',
 'where',
 'it',
 'could',
 'have',
 'wentyou',
 'found',
 'yourself',
 'leaning',
 'toward',
 'lucas',
 'buck',
 'the',
 'sheriff',
 'who',
 'had',
 'more',
 'secrets',
 'than',
 'anyone',
 'lucas',
 'was',
 'frightening',
 'and',
 'alluring',
 'and',
 'i',
 'would',
 'have',
 'liked',
 'to',
 'have',
 'seen',
 'more',
 'of',
 'him',
 'and',
 'how',
 'his',
 'character',
 'became',
 'i',
 'will',
 'however',
 'buy',
 'the',
 'show',
 'just',
 'to',
 'enjoy',
 'it',
 'was',
 'great',
 'to',
 'something',
 'different',
 'on',
 'tv',
 'and',
 'paige',
 'turrco',
 'who',
 'was',
 'calebs',
 'cousin',
 'she',
 'was',
 'a',
 'big',
 'mystery',


# **Removing Stop words from tokenized review**

In [202]:
def remove_stopwords(tokens):
    doc = nlp(' '.join(tokens))
    tokens_without_stopwords = [token.text for token in doc if not token.is_stop and token.is_alpha]
    return tokens_without_stopwords

In [203]:
train_data['review_tokens'] = train_data['review_tokens'].apply(remove_stopwords)
test_data['review_tokens'] = test_data['review_tokens'].apply(remove_stopwords)

In [204]:
train_data['review_tokens'][5]

['bit',
 'bizarre',
 'evil',
 'enjoyed',
 'lot',
 'characters',
 'great',
 'complimented',
 'sorry',
 'cut',
 'loved',
 'wentyou',
 'found',
 'leaning',
 'lucas',
 'buck',
 'sheriff',
 'secrets',
 'lucas',
 'frightening',
 'alluring',
 'liked',
 'seen',
 'character',
 'buy',
 'enjoy',
 'great',
 'different',
 'tv',
 'paige',
 'turrco',
 'calebs',
 'cousin',
 'big',
 'mystery',
 'meant',
 'lucas',
 'shame',
 'nt',
 'finished',
 'loved',
 'happened']

# **Lemmatization**

In [205]:
def lemmatize(tokens):
    doc = nlp(' '.join(tokens))
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    return lemmas

In [206]:
train_data['lemmatize_tokens'] = train_data['review_tokens'].apply(lemmatize)
test_data['lemmatize_tokens'] = test_data['review_tokens'].apply(lemmatize)

In [207]:
train_data['lemmatize_tokens'][5]

['bit',
 'bizarre',
 'evil',
 'enjoy',
 'lot',
 'character',
 'great',
 'compliment',
 'sorry',
 'cut',
 'love',
 'wentyou',
 'find',
 'lean',
 'lucas',
 'buck',
 'sheriff',
 'secret',
 'lucas',
 'frightening',
 'alluring',
 'like',
 'see',
 'character',
 'buy',
 'enjoy',
 'great',
 'different',
 'tv',
 'paige',
 'turrco',
 'calebs',
 'cousin',
 'big',
 'mystery',
 'mean',
 'lucas',
 'shame',
 'not',
 'finish',
 'love',
 'happen']

# **Reconstruction and storing**

In [208]:
train_data['cleaned_review'] = train_data['lemmatize_tokens'].apply(lambda tokens: " ".join(tokens))
test_data['cleaned_review'] = test_data['lemmatize_tokens'].apply(lambda tokens: " ".join(tokens))

In [209]:
train_data['cleaned_review'][5]

'bit bizarre evil enjoy lot character great compliment sorry cut love wentyou find lean lucas buck sheriff secret lucas frightening alluring like see character buy enjoy great different tv paige turrco calebs cousin big mystery mean lucas shame not finish love happen'

# **Saving the processed data**

In [229]:
train_data.to_csv('processed_train_data2.csv', index=False)
test_data.to_csv('processed_test_data2.csv', index=False)

# **Word Frequency**

In [211]:
def tokenize_text(text):
    return text.split()

In [212]:
word_counts = Counter()
train_data['cleaned_review'].apply(lambda x: word_counts.update(tokenize_text(x)))

0        None
1        None
2        None
3        None
4        None
         ... 
24995    None
24996    None
24997    None
24998    None
24999    None
Name: cleaned_review, Length: 25000, dtype: object

In [213]:
print("Most common words:")
for word, count in word_counts.most_common(10):
    print(f"{word}: {count}")

Most common words:
movie: 49541
film: 46155
not: 29044
like: 21618
good: 19556
time: 14324
character: 13778
watch: 13453
s: 12736
story: 12606


# **Bigrams**

In [214]:
def generate_bigrams(tokens):
    return list(ngrams(tokens, 2))

In [215]:
bigram_counts_train = Counter()
train_data['cleaned_review'].apply(lambda x: bigram_counts_train.update(generate_bigrams(tokenize_text(x))))

0        None
1        None
2        None
3        None
4        None
         ... 
24995    None
24996    None
24997    None
24998    None
24999    None
Name: cleaned_review, Length: 25000, dtype: object

In [216]:
print("Most common bigrams in training data:")
for bigram, count in bigram_counts_train.most_common(10):
    print(f"{bigram}: {count}")

Most common bigrams in training data:
('ve', 'see'): 2033
('look', 'like'): 1846
('not', 'know'): 1605
('watch', 'movie'): 1511
('movie', 'not'): 1358
('bad', 'movie'): 1357
('will', 'not'): 1295
('good', 'movie'): 1093
('special', 'effect'): 1086
('movie', 'like'): 1085


In [217]:
bigram_counts_test = Counter()
test_data['cleaned_review'].apply(lambda x: bigram_counts_test.update(generate_bigrams(tokenize_text(x))))

0        None
1        None
2        None
3        None
4        None
         ... 
24995    None
24996    None
24997    None
24998    None
24999    None
Name: cleaned_review, Length: 25000, dtype: object

In [218]:
print("Most common bigrams in testing data:")
for bigram, count in bigram_counts_test.most_common(10):
    print(f"{bigram}: {count}")

Most common bigrams in testing data:
('ve', 'see'): 1983
('look', 'like'): 1959
('not', 'know'): 1662
('watch', 'movie'): 1541
('movie', 'not'): 1345
('will', 'not'): 1336
('bad', 'movie'): 1304
('special', 'effect'): 1113
('see', 'movie'): 1083
('good', 'movie'): 1071


# **Converting the cleaned text data into a document-term matrix**

In [219]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000,
                                   token_pattern=r'\b[a-zA-Z]{3,}\b',
                                   )

In [220]:
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_data['cleaned_review'])
tfidf_matrix_train_df = pd.DataFrame(tfidf_matrix_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [221]:
tfidf_matrix_test = tfidf_vectorizer.fit_transform(test_data['cleaned_review'])
tfidf_matrix_test_df = pd.DataFrame(tfidf_matrix_test.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [222]:
tfidf_matrix_train_df.head()

Unnamed: 0,ability,able,absolutely,accent,accept,act,acting,action,actor,actress,...,write,writer,writing,wrong,yeah,year,yes,york,young,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033378,0.0,0.0,0.038893,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.150781,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.207042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111276,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [223]:
tfidf_matrix_test_df.head()

Unnamed: 0,ability,able,absolutely,accent,accept,act,acting,action,actor,actress,...,write,writer,writing,wrong,yeah,year,yes,york,young,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.105603,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.170936,0.0,0.0,0.0,0.140498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127305,0.0
4,0.0,0.094177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Naive Bayes classifier**

In [224]:
X_train = tfidf_matrix_train
y_train = train_data['sentiment']
X_test = tfidf_matrix_test
y_test = test_data['sentiment']

In [225]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

# **Accuracy, Precision, Recall, and F1 score**

In [226]:
y_pred = clf.predict(X_test)

In [227]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.72232


In [228]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73     12500
           1       0.73      0.70      0.72     12500

    accuracy                           0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000

