# **Sentiment Analysis (Text Classification)**
*   **Downloading Datset from Kaggle to Google Colab**
*   **Text Cleaning**
*   **Text Preprocessing**
*   **Feature Engineering**
*   **ML Model**

In [17]:
#!/bin/bash
!pip install kaggle

import os
import json

# Set up Kaggle API credentials
#os.environ['KAGGLE_CONFIG_DIR'] = "/content"
#/content/kaggle.json
# Make the Kaggle API key available to the environment
with open('/content/kaggle.json') as f:
    kaggle_json = json.load(f)
    os.environ['KAGGLE_USERNAME'] = kaggle_json['username']
    os.environ['KAGGLE_KEY'] = kaggle_json['key']

#!/bin/bash
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

!unzip imdb-dataset-of-50k-movie-reviews.zip

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  imdb-dataset-of-50k-movie-reviews.zip
replace IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


# **Importing Preprocessing Libraries**

In [18]:
import pandas as pd
import string


import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')


stopwords.words('english')
exclude = string.punctuation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **Reading Data**

In [19]:
temp_df = pd.read_csv('/content/IMDB Dataset.csv')
df = temp_df.iloc[:30000]

# **Text Cleaning & Preprocessing**




In [20]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

#exclude = "!.,?"
def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [21]:
df['review'] = df['review'].str.lower()

df['review'] = df['review'].apply(remove_html_tags)

df['review'] = df['review'].apply(remove_url)

df['review'] = df['review'].apply(remove_punc)

#df['review'] = df['review'].apply(word_tokenize)

#df['review'] = df['review'].apply(remove_stopwords)

#df['review'] = df['review'].apply(lemmatize_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_html_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_url)
A value is trying to be set on a copy of a slice from a DataFrame.


# **Feature Engineering**

**Target Column Encoding**

In [22]:
from sklearn.preprocessing import LabelEncoder

#X = df.drop('sentiment', axis=1)
X = df['review']
Y = df['sentiment']

print(X)
print(Y)

encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

print(Y)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
29995    new york i love you finally makes it to our sh...
29996    this movie makes you wish imdb would let you v...
29997    space camp which had the unfortunate luck to b...
29998    octavio paz mexican poet writer and diplomat w...
29999    having watched 10 minutes of this movie i was ...
Name: review, Length: 30000, dtype: object
0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
29995    positive
29996    negative
29997    negative
29998    positive
29999    negative
Name: sentiment, Length: 30000, dtype: object
[1 1 1 ... 0 1 0]


**Bag of Words**

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

print(X_train.shape)
#print(X_train.head)

#print(X_train)
#print(X_test)

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_bow.shape}")
print(f"Shape of X_test_bow: {X_test_bow.shape}")

# Applying Random Forest Classifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
#accuracy_score(y_test,y_pred)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

(24000,)
Shape of X_train_bow: (24000, 139736)
Shape of X_test_bow: (6000, 139736)
0.8456666666666667
[[2549  480]
 [ 446 2525]]


**n-gram (2-gram)**

In [24]:
cv = CountVectorizer(ngram_range=(2,2))

X_train_n_gram = cv.fit_transform(X_train)
X_test_n_gram = cv.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_n_gram.shape}")
print(f"Shape of X_test_bow: {X_test_n_gram.shape}")

rf = RandomForestClassifier()

rf.fit(X_train_n_gram,y_train)
y_pred = rf.predict(X_test_n_gram)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

Shape of X_train_bow: (24000, 1500023)
Shape of X_test_bow: (6000, 1500023)
0.8305
[[2508  521]
 [ 496 2475]]


**TF/IDF**

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_tfidf.shape}")
print(f"Shape of X_test_bow: {X_test_tfidf.shape}")

rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

Shape of X_train_bow: (24000, 139736)
Shape of X_test_bow: (6000, 139736)
0.8356666666666667
[[2539  490]
 [ 496 2475]]


# **Task:**
*   **Add a Python Function for Word-based Tokenization for each of the IMDB reviews data.**
*   **After tokenization, add a Python Function to remove Stop Words from the IMDB reviews data.**
*   **After Stopword Removal, add a Python Function to perform Lemmitization over IMDB Reviews data.**

**After applying the above mentioned data preprocessing steps, again run this code and analyse the performance of the ML models for text classification of IMDB Reviews.**




# **Reading Data**

In [38]:
temp_df = pd.read_csv('/content/IMDB Dataset.csv')
df = temp_df.iloc[:30000]

# **Text Cleaning & Preprocessing**




In [39]:
# Function to remove HTML tags
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

# Function to remove URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

# Function to remove punctuation
def remove_punc(text, exclude="!.,?"):
    return text.translate(str.maketrans('', '', exclude))

# Function for Word-based Tokenization
def tokenize_words(text):
    return word_tokenize(text)

# Function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word not in stop_words]

# Function for Lemmatization
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]



In [40]:

# Apply transformations to the review column
df['review'] = df['review'].str.lower()  # Convert to lowercase

df['review'] = df['review'].apply(remove_html_tags)  # Remove HTML tags
df['review'] = df['review'].apply(remove_url)  # Remove URLs
df['review'] = df['review'].apply(remove_punc)  # Remove punctuation

df['review'] = df['review'].apply(tokenize_words)  # Tokenize words
df['review'] = df['review'].apply(remove_stopwords)  # Remove stopwords
df['review'] = df['review'].apply(lemmatize_words)  # Lemmatize words



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].str.lower()  # Convert to lowercase
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_html_tags)  # Remove HTML tags
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_url)  # Remove URLs
A value 

# **Feature Engineering**

**Target Column Encoding**

In [41]:
from sklearn.preprocessing import LabelEncoder

#X = df.drop('sentiment', axis=1)
X = df['review']
Y = df['sentiment']

print(X)
print(Y)

encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

print(Y)

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, 's, family, little, boy, (, jake, ...
4        [petter, mattei, 's, ``, love, time, money, ''...
                               ...                        
29995    [new, york, love, finally, make, shore, 10, sh...
29996    [movie, make, wish, imdb, would, let, vote, ze...
29997    [space, camp, unfortunate, luck, planned, arou...
29998    [octavio, paz, mexican, poet, writer, diplomat...
29999    [watched, 10, minute, movie, bewildered, watch...
Name: review, Length: 30000, dtype: object
0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
29995    positive
29996    negative
29997    negative
29998    positive
29999    negative
Name: sentiment, Length: 30000, dtype: object
[1 1 1 ... 0 1 0]


**Bag of Words**

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

print(X_train.shape)
#print(X_train.head)

#print(X_train)
#print(X_test)

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_bow.shape}")
print(f"Shape of X_test_bow: {X_test_bow.shape}")

# Applying Random Forest Classifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
#accuracy_score(y_test,y_pred)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

(24000,)


AttributeError: 'list' object has no attribute 'lower'

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Ensure X and Y are properly defined before this
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Convert lists of tokens back to strings if necessary
X_train = [' '.join(text) if isinstance(text, list) else str(text) for text in X_train]
X_test = [' '.join(text) if isinstance(text, list) else str(text) for text in X_test]

print(X_train[:5])  # Preview the cleaned training data

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_bow.shape}")
print(f"Shape of X_test_bow: {X_test_bow.shape}")

# Apply Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict on the test set
y_pred = rf.predict(X_test_bow)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


["ever sit movie 's 's like one big `` wtf '' welcome decoy another straight video action fodder flick immediately forget watched better yet n't watch peter weller robert patrick star quickly wasted going nowhere fast mercenaries-for-hire action dud story pretty darn bad action suck 's point watching action flick action blow robert patrick particular hit new low action sequence firing machine gun standing hood moving school bus co-star ambient charlotte lewis canada 's scott hylands ( tv 's night heat fame )", "ok bought film woolworth friend joke present birthday front cover sexual innuendo itbut decided watch anyway hilarity purposesand 'm sorry got one worst film historyit began alright thought `` ok might actually ok '' 10 minute sadly mistakenit began `` mysterious paint baller '' turned obvious character scouser/australian ( say accent could n't identified ) 's acting might say abysmalthen got end time lost live paint ball finalsthe thing like plot n't actually win annoyingly eno

**n-gram (2-gram)**

In [44]:
cv = CountVectorizer(ngram_range=(2,2))

X_train_n_gram = cv.fit_transform(X_train)
X_test_n_gram = cv.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_n_gram.shape}")
print(f"Shape of X_test_bow: {X_test_n_gram.shape}")

rf = RandomForestClassifier()

rf.fit(X_train_n_gram,y_train)
y_pred = rf.predict(X_test_n_gram)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

Shape of X_train_bow: (24000, 1676542)
Shape of X_test_bow: (6000, 1676542)
0.7893333333333333
[[2151  878]
 [ 386 2585]]


**TF/IDF**

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Output the shapes of the resulting Bag of Words matrices
print(f"Shape of X_train_bow: {X_train_tfidf.shape}")
print(f"Shape of X_test_bow: {X_test_tfidf.shape}")

rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

Shape of X_train_bow: (24000, 105046)
Shape of X_test_bow: (6000, 105046)
0.8508333333333333
[[2589  440]
 [ 455 2516]]
