In [38]:
!pip install beautifulsoup4 nltk scikit-learn gensim



In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
import os
os.chdir("/content/drive/MyDrive/AI_Codding_exam/NLP_AI_Exam")

In [41]:
!dir

IMDB\ Dataset.csv  Untitled0.ipynb


#1)Preprocess the text (i.e., the steps required prior to converting the sentence into a vector) using any library of your choice.

## Step 1: Preprocess the text

In [42]:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import numpy as np

In [43]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

##Function for cleaning the text

In [45]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Join the tokens back into a sentence
    clean_text = ' '.join(tokens)

    return clean_text

In [46]:
import pandas as pd

##Reading the data

In [47]:
# Assuming your dataset is in a CSV file named 'your_dataset.csv'
file_path = '/content/drive/MyDrive/AI_Codding_exam/NLP_AI_Exam/IMDB Dataset.csv'

In [48]:
# Load the entire dataset
data = pd.read_csv(file_path)

# Sample 10,000 records randomly
num_records_to_sample = 10000
df = data.sample(n=num_records_to_sample, random_state=42)

In [49]:
df.head()

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,positive
9427,Not many television shows appeal to quite as m...,positive
199,The film quickly gets to a major chase scene w...,negative
12447,Jane Austen would definitely approve of this o...,positive
39489,Expectations were somewhat high for me when I ...,negative


In [50]:
df["sentiment"].value_counts()

positive    5039
negative    4961
Name: sentiment, dtype: int64

In [51]:
df.shape

(10000, 2)

In [52]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
# Apply preprocessing to the 'review' column and create a new 'clean_review' column
df['clean_review'] = df['review'].apply(preprocess_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [54]:
df.head()

Unnamed: 0,review,sentiment,clean_review
33553,I really liked this Summerslam due to the look...,positive,really liked summerslam due look arena curtain...
9427,Not many television shows appeal to quite as m...,positive,many television shows appeal quite many differ...
199,The film quickly gets to a major chase scene w...,negative,film quickly gets major chase scene ever incre...
12447,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve one gwyne...
39489,Expectations were somewhat high for me when I ...,negative,expectations somewhat high went see movie thou...


#2)Given any sentence, perform vector semantics i.e., convert the given dataset into vectors using Bag of Words approach.

##Step 2: Vector semantics using Bag of Words (CountVectorizer or TF-IDF)

In [55]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_review']).toarray()
y = df['sentiment']

#3)Train a simple classifier (using Scikit-learn e.g.: SVM) to perform sentiment analysis on the generated dataset.

## Step 3: Train a simple classifier (SVM)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [21]:
# Optimize SVM hyperparameters using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_classifier = SVC()
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, n_jobs=-1)


In [22]:
grid_search.fit(X_train, y_train)

In [23]:
# Get the best hyperparameters
best_svm_classifier = grid_search.best_estimator_

In [24]:
# Make predictions using the best model
predictions = best_svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

In [25]:
print(f"Best SVM hyperparameters: {grid_search.best_params_}")
print(f"Accuracy using Bag of Words approach: {accuracy}")

Best SVM hyperparameters: {'C': 1, 'kernel': 'rbf'}
Accuracy using Bag of Words approach: 0.8725


Best SVM hyperparameters: {'C': 1, 'kernel': 'rbf'}
Accuracy using Bag of Words approach: 0.8725

#4)Repeat the above process again but now using word2vector. (for any sentence, take average of the vector representation of all the tokens to get the vector representation)

##Step 4: Vector semantics using Word2Vec

In [57]:
!pip install gensim



In [58]:
import gensim

In [59]:
from gensim.models import Word2Vec, KeyedVectors

In [65]:
#'clean_review' is the column containing preprocessed reviews
df['tokenized_sentences'] = df['clean_review'].apply(lambda x: word_tokenize(x))

In [66]:
# Filter out empty tokenized sentences
tokenized_sentences = df['tokenized_sentences'][df['tokenized_sentences'].apply(len) > 0]

In [67]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 5: Convert paragraphs to vectors using Word2Vec (average of token vectors)

In [68]:
def get_vector(sentence):
    vectors = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if not vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

In [70]:
# Assuming tokenized_sentences is a column in your DataFrame
df['word2vec_vector'] = df['tokenized_sentences'].apply(get_vector)

In [71]:
df.head()

Unnamed: 0,review,sentiment,clean_review,tokenized_sentences,word2vec_vector
33553,I really liked this Summerslam due to the look...,positive,really liked summerslam due look arena curtain...,"[really, liked, summerslam, due, look, arena, ...","[-0.26983044, 0.45476717, 0.123613775, 0.22531..."
9427,Not many television shows appeal to quite as m...,positive,many television shows appeal quite many differ...,"[many, television, shows, appeal, quite, many,...","[-0.18477127, 0.5940456, -0.008540287, 0.43033..."
199,The film quickly gets to a major chase scene w...,negative,film quickly gets major chase scene ever incre...,"[film, quickly, gets, major, chase, scene, eve...","[-0.39081207, 0.6416117, 0.006897853, 0.471406..."
12447,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve one gwyne...,"[jane, austen, would, definitely, approve, one...","[-0.35766566, 0.547833, 0.14666249, 0.23903182..."
39489,Expectations were somewhat high for me when I ...,negative,expectations somewhat high went see movie thou...,"[expectations, somewhat, high, went, see, movi...","[-0.31515294, 0.5276524, 0.07961337, 0.434966,..."
