In [1]:
!pip install beautifulsoup4 nltk scikit-learn gensim



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.chdir("/content/drive/MyDrive/AI_Codding_exam/NLP_AI_Exam")

In [4]:
!dir

IMDB\ Dataset.csv  Prathamesh_Ambre_NLP_AI_Test.ipynb


#1)Preprocess the text (i.e., the steps required prior to converting the sentence into a vector) using any library of your choice.

## Step 1: Preprocess the text

In [5]:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import numpy as np

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

##Function for cleaning the text

In [8]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Join the tokens back into a sentence
    clean_text = ' '.join(tokens)

    return clean_text

In [9]:
import pandas as pd

##Reading the data

In [10]:
# My dataset is in a CSV file named 'your_dataset.csv'
file_path = '/content/drive/MyDrive/AI_Codding_exam/NLP_AI_Exam/IMDB Dataset.csv'

In [11]:
# Load the entire dataset
data = pd.read_csv(file_path)

# Sample 10,000 records randomly
num_records_to_sample = 10000
df = data.sample(n=num_records_to_sample, random_state=42)

In [12]:
df.head()

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,positive
9427,Not many television shows appeal to quite as m...,positive
199,The film quickly gets to a major chase scene w...,negative
12447,Jane Austen would definitely approve of this o...,positive
39489,Expectations were somewhat high for me when I ...,negative


In [13]:
df["sentiment"].value_counts()

positive    5039
negative    4961
Name: sentiment, dtype: int64

In [14]:
df.shape

(10000, 2)

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Apply preprocessing to the 'review' column and create a new 'clean_review' column
df['clean_review'] = df['review'].apply(preprocess_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [17]:
df.head()

Unnamed: 0,review,sentiment,clean_review
33553,I really liked this Summerslam due to the look...,positive,really liked summerslam due look arena curtain...
9427,Not many television shows appeal to quite as m...,positive,many television shows appeal quite many differ...
199,The film quickly gets to a major chase scene w...,negative,film quickly gets major chase scene ever incre...
12447,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve one gwyne...
39489,Expectations were somewhat high for me when I ...,negative,expectations somewhat high went see movie thou...


#2)Given any sentence, perform vector semantics i.e., convert the given dataset into vectors using Bag of Words approach.

##Step 2: Vector semantics using Bag of Words (CountVectorizer or TF-IDF)

In [18]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_review']).toarray()
y = df['sentiment']

#3)Train a simple classifier (using Scikit-learn e.g.: SVM) to perform sentiment analysis on the generated dataset.

## Step 3: Train a simple classifier (SVM)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

###Finding Best hyperparameter using GridSearch

In [21]:
# # Optimize SVM hyperparameters using GridSearchCV
# param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# svm_classifier = SVC()
# grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, n_jobs=-1)


# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_svm_classifier = grid_search.best_estimator_


# # Make predictions using the best model
# predictions = best_svm_classifier.predict(X_test)
# accuracy = accuracy_score(y_test, predictions)

# print(f"Best SVM hyperparameters: {grid_search.best_params_}")
# print(f"Accuracy using Bag of Words approach: {accuracy}")

In [22]:
# Set the best hyperparameters manually
best_svm_classifier = SVC(C=1, kernel='rbf')

In [23]:
# Fit the model with your training data
best_svm_classifier.fit(X_train, y_train)

In [24]:
# Make predictions using the best model
predictions = best_svm_classifier.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, predictions)

In [26]:
print(f"Accuracy using Bag of Words approach: {accuracy}")

Accuracy using Bag of Words approach: 0.872


Best SVM hyperparameters: {'C': 1, 'kernel': 'rbf'}
Accuracy using Bag of Words approach: 0.8725

#4)Repeat the above process again but now using word2vector. (for any sentence, take average of the vector representation of all the tokens to get the vector representation)

##Step 4: Vector semantics using Word2Vec

In [27]:
!pip install gensim



In [28]:
import gensim

In [29]:
from gensim.models import Word2Vec, KeyedVectors

In [30]:
#'clean_review' is the column containing preprocessed reviews
df['tokenized_sentences'] = df['clean_review'].apply(lambda x: word_tokenize(x))

In [31]:
# Filter out empty tokenized sentences
tokenized_sentences = df['tokenized_sentences'][df['tokenized_sentences'].apply(len) > 0]

In [32]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 5: Convert paragraphs to vectors using Word2Vec (average of token vectors)

In [33]:
def get_vector(sentence):
    vectors = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if not vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

In [34]:
# Assuming tokenized_sentences is a column in your DataFrame
df['word2vec_vector'] = df['tokenized_sentences'].apply(get_vector)

In [35]:
df.head()

Unnamed: 0,review,sentiment,clean_review,tokenized_sentences,word2vec_vector
33553,I really liked this Summerslam due to the look...,positive,really liked summerslam due look arena curtain...,"[really, liked, summerslam, due, look, arena, ...","[-0.26590616, 0.4218388, 0.23897374, 0.0712185..."
9427,Not many television shows appeal to quite as m...,positive,many television shows appeal quite many differ...,"[many, television, shows, appeal, quite, many,...","[-0.065519266, 0.6058577, 0.1863164, 0.2087049..."
199,The film quickly gets to a major chase scene w...,negative,film quickly gets major chase scene ever incre...,"[film, quickly, gets, major, chase, scene, eve...","[-0.36330926, 0.64236075, 0.22573566, 0.247581..."
12447,Jane Austen would definitely approve of this o...,positive,jane austen would definitely approve one gwyne...,"[jane, austen, would, definitely, approve, one...","[-0.28966907, 0.57714725, 0.26444474, 0.054151..."
39489,Expectations were somewhat high for me when I ...,negative,expectations somewhat high went see movie thou...,"[expectations, somewhat, high, went, see, movi...","[-0.24685362, 0.5009458, 0.2357432, 0.20990542..."


In [36]:
# Split data into train and test sets
X = np.vstack(df['word2vec_vector'].to_numpy())
y = df['sentiment']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Initialize SVM classifier
svm_classifier_word2vec = SVC(C=1, kernel='rbf')

In [39]:
# Fit the model with your training data
svm_classifier_word2vec.fit(X_train, y_train)

In [40]:
# Make predictions using the trained model
predictions_word2vec = svm_classifier_word2vec.predict(X_test)

In [41]:
# Calculate accuracy
accuracy_word2vec = accuracy_score(y_test, predictions_word2vec)
print(f"Accuracy using Word2Vec vectors: {accuracy_word2vec}")

Accuracy using Word2Vec vectors: 0.7995


##The effectiveness of Word2Vec embeddings often depends on the size and complexity of the dataset. In some cases, Word2Vec may outperform BoW on larger and more complex datasets where capturing semantic relationships is crucial.

##The SVM classifier might not be optimized for Word2Vec vectors. Hyperparameter tuning specific to Word2Vec embeddings or using other classification algorithms might yield better results.