In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec,KeyedVectors
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from bs4 import BeautifulSoup
import gensim
import warnings
warnings.filterwarnings('ignore')

# **Exploring the data**

In [2]:
df = pd.read_csv(r'Data\all_kindle_review .csv')

In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'asin', 'helpful', 'rating', 'reviewText',
       'reviewTime', 'reviewerID', 'reviewerName', 'summary',
       'unixReviewTime'],
      dtype='object')

In [5]:
df = df[['reviewText','rating']]

In [6]:
df.shape

(12000, 2)

In [7]:
df.isna().sum()

reviewText    0
rating        0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  12000 non-null  object
 1   rating      12000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 187.6+ KB


In [10]:
# formating out in Postive or negative review
df['rating'] = df['rating'].apply(lambda x: 0 if x<3 else 1)

# **Cleaning and Pre-processing of the Data**

In [11]:
# Cleaning using Regular expression
def clean_text(text):
    # Removing Special Characters
    text = re.sub('[^a-zA-z0-9]+',' ',text)
    # Removing URL
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)
    # Removing html tags
    text = BeautifulSoup(text,'lxml').get_text()
    return text

In [12]:
df['reviewText'] = df['reviewText'].apply(clean_text)

**Removing the stopwords**

In [13]:
def pre_processing_text(text):
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)

    return text


In [14]:
df['reviewText'] = df['reviewText'].apply(pre_processing_text)

In [17]:
df.reviewText[0]

'Jace Rankin may short nothing mess man hauled saloon undertaker knows He famous bounty hunter Oregon 1890s shot man saloon finished years long quest avenge sister murder trying figure next When snotty nosed farm boy rescued gang bullies offers money kill man forced ranch reluctantly agrees bring man justice kill outright But first needs tell sister widower news Kyla Kyle Springer Bailey riding trails sleeping ground past month trying find Jace She wants revenge man killed husband took ranch amongst crimes keen detour Jace wants take But realizes options hides behind boy persona best tries keep pace When confrontation along way gets shot Jace discovers Kyle Kyla come clean whole reason needs scoundrel dead hope still help The book share touching moments slow blooming romance Kyla find good reason fear men hide behind boy persona Watching Jace slowly pull shell help conquer fears endearing Her pain real deeply rooted disappear face sexiness Neither understandable aversion marriage magic

**Converting Sentences to Tokens**

In [18]:
tokens = []

for sent in df['reviewText']:
    sent_token = sent_tokenize(sent)
    for word in sent_token:
        tokens.append(simple_preprocess(word))

In [19]:
tokens[0]

['jace',
 'rankin',
 'may',
 'short',
 'nothing',
 'mess',
 'man',
 'hauled',
 'saloon',
 'undertaker',
 'knows',
 'he',
 'famous',
 'bounty',
 'hunter',
 'oregon',
 'shot',
 'man',
 'saloon',
 'finished',
 'years',
 'long',
 'quest',
 'avenge',
 'sister',
 'murder',
 'trying',
 'figure',
 'next',
 'when',
 'snotty',
 'nosed',
 'farm',
 'boy',
 'rescued',
 'gang',
 'bullies',
 'offers',
 'money',
 'kill',
 'man',
 'forced',
 'ranch',
 'reluctantly',
 'agrees',
 'bring',
 'man',
 'justice',
 'kill',
 'outright',
 'but',
 'first',
 'needs',
 'tell',
 'sister',
 'widower',
 'news',
 'kyla',
 'kyle',
 'springer',
 'bailey',
 'riding',
 'trails',
 'sleeping',
 'ground',
 'past',
 'month',
 'trying',
 'find',
 'jace',
 'she',
 'wants',
 'revenge',
 'man',
 'killed',
 'husband',
 'took',
 'ranch',
 'amongst',
 'crimes',
 'keen',
 'detour',
 'jace',
 'wants',
 'take',
 'but',
 'realizes',
 'options',
 'hides',
 'behind',
 'boy',
 'persona',
 'best',
 'tries',
 'keep',
 'pace',
 'when',
 'confr

**Loading the Google Word2Vec Model**

In [20]:
# Loading the Google Word2Vec Model
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [21]:
len(wv.index_to_key)

3000000

In [22]:
# Function to compute the average Word2Vec for a sentence
def avg_word2vec(sentence, model, vector_size=300):
    word_vectors = []
    
    for word in sentence:
        if word in model:  # Only include words that are in the Word2Vec model
            word_vectors.append(model[word])
    
    if len(word_vectors) > 0:
        # Return the average of the word vectors for the sentence
        return np.mean(word_vectors, axis=0)
    else:
        # If the sentence has no words in the Word2Vec model, return a zero vector
        return np.zeros(vector_size)

In [23]:
len(tokens)

12000

In [24]:
# Initialize an empty list to hold the average vectors for each sentence
X = []

# Loop through each tokenized sentence
for sentence in tokens:
    X.append(avg_word2vec(sentence, wv))

In [25]:
X

[array([ 8.00226554e-02,  6.52641580e-02, -1.53203250e-03,  4.67779897e-02,
        -4.27937061e-02,  1.51257338e-02,  4.29232791e-02, -6.61246553e-02,
         8.29041302e-02,  8.52462351e-02,  2.36139651e-02, -1.16603628e-01,
        -2.78051477e-02,  3.48806456e-02, -1.02703698e-01,  6.61900640e-02,
         3.46204676e-02,  1.13173716e-01,  2.51467209e-02, -6.93853870e-02,
         3.22188088e-03,  4.82136384e-02,  3.41417082e-02, -9.56520066e-03,
         3.62674594e-02, -6.92734271e-02, -6.89603984e-02,  6.78701177e-02,
         5.61628751e-02, -2.57657468e-02, -1.93648189e-02,  1.04102530e-02,
        -2.47662682e-02,  3.23101915e-02,  1.32709593e-02, -4.25659306e-03,
         7.94162303e-02, -1.87949296e-02,  2.04083379e-02,  8.69636014e-02,
         9.01506022e-02, -6.53589591e-02,  1.14224374e-01, -1.68848522e-02,
        -1.45413755e-02, -3.27903368e-02, -4.88894247e-02,  1.07868295e-02,
         3.03750709e-02,  1.18999062e-02, -4.98480648e-02,  2.79133860e-02,
        -1.1

In [26]:
X_new = np.array(X)

In [27]:
X_new.shape

(12000, 300)

In [28]:
new_df = pd.DataFrame(X_new)

In [29]:
new_df['output'] = df['rating']

In [30]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,output
0,0.080023,0.065264,-0.001532,0.046778,-0.042794,0.015126,0.042923,-0.066125,0.082904,0.085246,...,0.029027,-0.104945,-0.002838,-0.051196,-0.023896,0.023903,-0.058383,0.041375,0.011121,1
1,0.057465,0.051263,-0.026347,0.089272,-0.034534,-0.058502,0.063285,-0.026261,0.062996,0.104598,...,0.061467,-0.090006,0.01824,-0.079585,-0.035683,0.034177,-0.042923,-0.000915,-0.038909,1
2,0.030613,0.025742,0.004779,0.093128,-0.043249,-0.00676,-0.0041,-0.054144,0.081229,0.048086,...,0.050598,-0.10178,-0.026189,0.001709,-0.037625,0.002282,-0.012232,0.011513,-0.033229,1
3,0.091623,0.034305,0.037029,0.071674,-0.055564,0.044461,0.083785,0.009285,0.061512,0.08741,...,-0.016787,-0.066729,0.024738,-0.052554,-0.075336,0.020471,-0.048448,0.043557,0.04167,1
4,0.042953,0.053492,-0.037432,0.141052,-0.00116,-0.030052,0.14502,0.000656,0.084297,0.040771,...,0.038158,-0.070282,0.000774,0.10849,-0.028992,0.062061,0.017662,0.054092,-0.04858,1


**Exporting data**

In [32]:
new_df.to_csv('pre_processed_data.csv',index=False)

# **Thank you**