# import Libraris and NLTK Packages

In [3]:
# import libraries
import pandas as pd

import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer


In [9]:
# download nltk corpus (first time only)
import nltk

nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data] 

True

In [10]:
# Load the amazon review dataset
# reviewText,Positive
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')
df

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1
...,...,...
19995,this app is fricken stupid.it froze on the kin...,0
19996,Please add me!!!!! I need neighbors! Ginger101...,1
19997,love it! this game. is awesome. wish it had m...,1
19998,I love love love this app on my side of fashio...,1


# Data Preprocessing

## Case Folding

In [11]:
#case Folding
df['reviewText'] = df['reviewText'].str.lower()

## Punctuation Removal

In [12]:
#Punctuation Removal
import string
df['reviewText'] = df['reviewText'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))


## Tokenization

In [13]:
df['reviewText'] = df['reviewText'].apply(lambda x: word_tokenize(x))

## Stop word Removal

In [14]:
#nltk.download('stopwords')
#from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Lematization

In [None]:
lemmatizer = WordNetLemmatizer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Or Apply this function that include all previous steps

In [None]:
# # create preprocess_text function
# def preprocess_text(text):

#     # Tokenize the text

#     tokens = word_tokenize(text.lower())




#     # Remove stop words

#     filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]




#     # Lemmatize the tokens

#     lemmatizer = WordNetLemmatizer()

#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]




#     # Join the tokens back into a string

#     processed_text = ' '.join(lemmatized_tokens)

#     return processed_text

# apply the function df

# df['reviewText'] = df['reviewText'].apply(preprocess_text)
# df

# Seniment Analysis

In [None]:
# initialize NLTK sentiment analyzer

analyzer = SentimentIntensityAnalyzer()

In [None]:
# create get_sentiment function

def get_sentiment(text):

    scores = analyzer.polarity_scores(text)

    sentiment = 1 if scores['pos'] > 0 else 0

    return sentiment

# apply get_sentiment function
df['scores']= df['reviewText'].apply(analyzer.polarity_scores)

df['sentiment'] = df['reviewText'].apply(get_sentiment)

df

Unnamed: 0,reviewText,Positive,scores,sentiment
0,one best apps acording bunch people agree bomb...,1,"{'neg': 0.146, 'neu': 0.548, 'pos': 0.306, 'co...",1
1,pretty good version game free lot different le...,1,"{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'comp...",1
2,really cool game bunch level find golden egg s...,1,"{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'comp...",1
3,silly game frustrating lot fun definitely reco...,1,"{'neg': 0.154, 'neu': 0.16, 'pos': 0.686, 'com...",1
4,terrific game pad hr fun grandkids love great ...,1,"{'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'comp...",1
...,...,...,...,...
19995,app fricken stupidit froze kindle wont allow p...,0,"{'neg': 0.353, 'neu': 0.647, 'pos': 0.0, 'comp...",0
19996,please add need neighbor ginger1016 thanks bun...,1,"{'neg': 0.0, 'neu': 0.398, 'pos': 0.602, 'comp...",1
19997,love game awesome wish free stuff house didnt ...,1,"{'neg': 0.21, 'neu': 0.233, 'pos': 0.556, 'com...",1
19998,love love love app side fashion story fight wo...,1,"{'neg': 0.103, 'neu': 0.322, 'pos': 0.575, 'co...",1


In [None]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(df['Positive'], df['sentiment']))

[[ 1151  3616]
 [  605 14628]]


In [None]:
from sklearn.metrics import classification_report

print(classification_report(df['Positive'], df['sentiment']))

              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

