Use the following dataset - https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
import pandas as pd


In [None]:

# Load the IMDB reviews CSV from the Colab working directory.
# on_bad_lines='skip' makes pandas drop malformed rows instead of raising,
# so the loaded row count can be lower than the number of lines in the file.
df = pd.read_csv("/content/IMDB Dataset.csv", on_bad_lines='skip')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Problem 1

# Apply all the preprocessing techniques that you think are necessary

In [None]:
df['review'] = df['review'].str.lower()

In [None]:
import re
def remove_html_tags(text):
  """Strip HTML tags such as ``<br />`` from ``text``.

  Each tag is replaced by a single space instead of an empty string.
  Tags frequently sit directly between two words (``"good.<br />bad"``);
  deleting them outright would fuse those words into one token.

  Args:
    text: Raw review string.

  Returns:
    The string with every ``<...>`` span replaced by one space.
  """
  # Non-greedy match so that each tag is matched on its own.
  pattern = re.compile(r'<.*?>')
  return pattern.sub(' ', text)

df['review'] = df['review'].apply(remove_html_tags)

In [None]:
# remove urls
def remove_urls(text):
  """Remove URLs from ``text``.

  A URL is anything starting with ``http://``, ``https://`` or ``www.``
  and running up to the next whitespace character.

  Args:
    text: Input string.

  Returns:
    The string with every URL removed.
  """
  # \S+ (non-whitespace) consumes the rest of the URL; a lowercase \s+
  # would only match whitespace and therefore never remove a real link.
  pattern = re.compile(r'https?://\S+|www\.\S+')
  return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_urls)

In [None]:
# remove punctuations

import string
punct = string.punctuation

def remove_punctuations(text):
  """Return ``text`` with every character found in ``punct`` dropped."""
  # Single pass over the text, keeping only non-punctuation characters.
  kept = [ch for ch in text if ch not in punct]
  return ''.join(kept)

df['review'] = df['review'].apply(remove_punctuations)

In [None]:
from os import remove
# another method to remove punctuations

def remove_punct2(text):
  """Delete every character listed in ``punct`` using a translation table."""
  # Mapping a character to None tells str.translate to drop it.
  deletion_table = str.maketrans(dict.fromkeys(punct))
  return text.translate(deletion_table)

# When the cells are run in order, remove_punctuations has already stripped
# the same character set, so this second pass changes nothing; it only
# demonstrates the translate-based variant.
df['review'] = df['review'].apply(remove_punct2)

In [None]:
# chat word treatment
# Lookup table: upper-cased chat abbreviation -> expanded phrase.
# Note that the "IDK" expansion contains a capital letter and an apostrophe.
chat_words = {
    "U": "you",
    "BRB": "be right back",
    "THX": "thanks",
    "TMRW": "tomorrow",
    "GUD": "good",
    "WANNA": "want to",
    "GONNA": "going to",
    "LOL": "laughing out loud",
    "BTW": "by the way",
    "CU": "see you",
    "OMG": "oh my god",
    "IDK": "I don't know",
    "PLS": "please"
}

def conversion_chat(text):
  """Expand chat abbreviations in ``text``.

  The text is split on whitespace; each token whose upper-cased form is a
  key of ``chat_words`` is replaced by its expansion, all other tokens are
  kept unchanged. Tokens are re-joined with single spaces.
  """
  expanded = (chat_words.get(token.upper(), token) for token in text.split())
  return " ".join(expanded)

df['review'] = df['review'].apply(lambda x: conversion_chat(x))

In [None]:
# This process takes more time
# spelling corrections

# from textblob import TextBlob

# df['review'] = df['review'].apply(lambda x: str(TextBlob(x).correct()))


In [None]:
# removing stopwords

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stopword list from NLTK.
# NOTE(review): none of the cells shown in this notebook use `stop`, so
# stopwords are loaded but never removed from df['review'] - confirm
# whether a removal step is missing.
stop = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Tokenization using spacy

# import spacy
# nlp = spacy.load('en_core_web_sm')

# def tokenization_spacy(text):
#   return [token.text for token in nlp(text)]

# df['review'] = df['review'].apply(tokenization_spacy)


In [None]:
# Stemming

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stemming(text):
  """Stem every word of a review with the module-level Porter stemmer ``ps``.

  Args:
    text: Either a review string (split on whitespace into words) or an
      already tokenised iterable of words.

  Returns:
    A list with the stemmed form of each word, in order.
  """
  # Iterating a str yields single characters, which would "stem" letters
  # instead of words, so strings are tokenised first.
  words = text.split() if isinstance(text, str) else text
  return [ps.stem(word) for word in words]

# Preview only: the result is displayed but not assigned back, so
# df['review'] still holds the unstemmed strings after this cell.
df['review'].apply(stemming)


Unnamed: 0,review
0,"[o, n, e, , o, f, , t, h, e, , o, t, h, e, ..."
1,"[a, , w, o, n, d, e, r, f, u, l, , l, i, t, ..."
2,"[i, , t, h, o, u, g, h, t, , t, h, i, s, , ..."
3,"[b, a, s, i, c, a, l, l, y, , t, h, e, r, e, ..."
4,"[p, e, t, t, e, r, , m, a, t, t, e, i, s, , ..."
...,...
49995,"[i, , t, h, o, u, g, h, t, , t, h, i, s, , ..."
49996,"[b, a, d, , p, l, o, t, , b, a, d, , d, i, ..."
49997,"[i, , a, m, , a, , c, a, t, h, o, l, i, c, ..."
49998,"[i, m, , g, o, i, n, g, , t, o, , h, a, v, ..."


In [None]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive


In [None]:
# Problem 2

# Find out the number of words in the entire corpus and also the total number of unique words (vocabulary) using just Python

def word_count(text):
  """Count the words in a single review.

  Args:
    text: Either a review string (split on whitespace into words) or an
      already tokenised iterable of words.

  Returns:
    A ``(total_words, unique_words)`` tuple for this review.
  """
  # len() of a str counts characters, so strings are tokenised first.
  words = text.split() if isinstance(text, str) else list(text)
  total_words = len(words)
  unique_words = len(set(words))
  return total_words, unique_words

# Get word counts
# Walk the corpus once. The vocabulary has to be the union of the words of
# all reviews: adding up per-review unique counts would count a word once
# for every review it appears in.
total_words = 0
corpus_vocabulary = set()
for review in df['review']:
  # Reviews are normally strings; an already tokenised review is used as is.
  tokens = review.split() if isinstance(review, str) else list(review)
  total_words += len(tokens)
  corpus_vocabulary.update(tokens)
unique_words = len(corpus_vocabulary)


print("Total words in corpus:", total_words)
print("Unique words (vocabulary):", unique_words)


Total words in corpus: 62197845
Unique words (vocabulary): 1364082


In [None]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive


In [None]:
# Problem 3

# Apply One Hot Encoding

from sklearn.preprocessing import OneHotEncoder
# sparse_output=False makes the encoder return a dense NumPy array;
# handle_unknown='ignore' encodes categories not seen during fit as all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


# Intended to join tokens back into one string per review.
# NOTE(review): df['review'] appears to still hold plain strings here (the
# stemming result above was displayed, not stored). ' '.join on a str puts a
# space between every character - confirm whether a token list was expected.
df['review1'] = df['review'].apply(lambda x: ' '.join(x))

# One input column is encoded, so every distinct review1 value becomes its own
# category/column: whole reviews are one-hot encoded, not individual words.
ohe.fit_transform(df[['review1']])


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Problem 4

# Apply bag of words and find the vocabulary; also find the number of times each word has occurred

from collections import Counter

def bag_of_words(text):
  """Build a bag-of-words representation of one review.

  Args:
    text: Either a review string (split on whitespace into words) or an
      already tokenised iterable of words.

  Returns:
    A ``(word_counts, vocabulary)`` tuple: a ``Counter`` mapping each word
    to its number of occurrences, and the list of distinct words in order
    of first appearance.
  """
  # Counter(str) would tally individual characters, so tokenise strings.
  words = text.split() if isinstance(text, str) else text
  word_counts = Counter(words)
  vocabulary = list(word_counts)
  return word_counts, vocabulary

df['review'].apply(bag_of_words)


Unnamed: 0,review
0,"({'o': 102, 'n': 80, 'e': 153, ' ': 300, 'f': ..."
1,"({'a': 57, ' ': 155, 'w': 16, 'o': 50, 'n': 50..."
2,"({'i': 62, ' ': 161, 't': 71, 'h': 40, 'o': 50..."
3,"({'b': 8, 'a': 55, 's': 39, 'i': 50, 'c': 12, ..."
4,"({'p': 18, 'e': 141, 't': 113, 'r': 53, ' ': 2..."
...,...
49995,"({'i': 77, ' ': 189, 't': 82, 'h': 33, 'o': 70..."
49996,"({'b': 12, 'a': 44, 'd': 20, ' ': 107, 'p': 12..."
49997,"({'i': 77, ' ': 224, 'a': 80, 'm': 25, 'c': 30..."
49998,"({'i': 74, 'm': 30, ' ': 211, 'g': 20, 'o': 70..."


In [None]:
# Problem 5

# Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

from collections import Counter
from nltk import ngrams

def bag_of_ngrams(text, n):
  """Build a bag of n-grams from a token sequence.

  Args:
    text: Either a string (split on whitespace into words) or an iterable
      of tokens, which is used as given.
    n: Number of consecutive tokens per n-gram.

  Returns:
    A ``(ngram_counts, vocabulary)`` tuple: a ``Counter`` keyed by n-gram
    tuples, and the list of distinct n-grams in order of first appearance.
    Both are empty when there are fewer than ``n`` tokens.
  """
  # A raw str would otherwise be n-grammed character by character.
  tokens = text.split() if isinstance(text, str) else list(text)
  # Zipping n shifted views of the tokens yields every run of n consecutive
  # tokens as a tuple.
  shifted_views = (tokens[offset:] for offset in range(n))
  ngram_counts = Counter(zip(*shifted_views))
  vocabulary = list(ngram_counts)
  return ngram_counts, vocabulary


# Apply Bag of Bigrams (2-grams) and Trigrams (3-grams)
# Each result is a (counts, vocabulary) tuple.
# NOTE(review): the whole df['review'] Series is passed as `text`, so the
# sequence being n-grammed is the sequence of reviews: every "gram" is a tuple
# of n consecutive whole reviews, not n consecutive words. Confirm whether
# per-review word n-grams were intended.
bigram_counts = bag_of_ngrams(df['review'], 2)
trigram_counts =bag_of_ngrams(df['review'], 3)



In [None]:

# Report the vocabulary size (its dimensionality) and a short sample rather
# than dumping the complete vocabulary list, which is far too long to read.
print("Bag of Bigrams (2-grams) Vocabulary size:", len(bigram_counts[1]))
print("Bag of Bigrams (2-grams) Vocabulary sample:", bigram_counts[1][:5])
print("Bag of Trigrams (3-grams) Vocabulary size:", len(trigram_counts[1]))
print("Bag of Trigrams (3-grams) Vocabulary sample:", trigram_counts[1][:5])

In [None]:
# Problem 6

# Apply tf-idf and find out the idf scores of words, also find out the vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer

# Default vectorizer settings; stop_words defaults to None, so stop words stay
# in the vocabulary.
tfidf = TfidfVectorizer()  # No stop words removal
# fit_transform learns the vocabulary and IDF weights from the reviews and
# returns the document-term TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(df['review'])


# idf_ holds one weight per vocabulary term, in the same order as the feature
# names printed on the line above it.
print("Vocabulary:", tfidf.get_feature_names_out())
print("IDF Scores:", tfidf.idf_)


Vocabulary: ['00' '000' '0000000000001' ... 'þór' 'יגאל' 'כרמון']
IDF Scores: [ 9.87388814 10.721186   11.1266511  ... 11.1266511  11.1266511
 11.1266511 ]
