In [2]:
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
path = "C:/Users/Ahmad/Desktop/Python/data-analysis-projects/amazon_for_sentiment analysis.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [4]:
df.shape

(20000, 2)

In [5]:
# define a function to pre-process the review texts
def nlp_preprocess_text(text):
    # tokenize the text
    tokens = word_tokenize(text.lower())
    # remove stop tokens
    clean_texts = [token for token in tokens if token not in stopwords.words()]
    #lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in clean_texts]
    #join the tokens back together to create processed texts
    processed_text = ' '.join(lemmatized_tokens)
    #return the processed text
    return processed_text

In [7]:
new = df.copy()[:5000]
# apply the function on the texts column
new['reviewText'] = new['reviewText'].apply(nlp_preprocess_text)
new.to_csv('5000ReviewTokens.csv')

In [69]:
path2 = "C:/Users/Ahmad/Desktop/Python/data-analysis-projects/5000ReviewTokens.csv"
token_df = pd.read_csv(path2, index_col=0)
token_df.reset_index()
token_df.dropna(inplace=True, axis=0)
token_df.head()

Unnamed: 0,reviewText,Positive
0,apps acording bunch agree bomb egg pig tnt kin...,1
1,pretty version game free . lot different level...,1
2,cool game . bunch level find golden egg . supe...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . hr fun . grandkids love . ...,1


In [70]:
#create function to apply sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment

In [79]:
# Apply the function
token_df['sentiment'] = token_df['reviewText'].apply(get_sentiment)
token_df.head()

Unnamed: 0,reviewText,Positive,sentiment
0,apps acording bunch agree bomb egg pig tnt kin...,1,1
1,pretty version game free . lot different level...,1,1
2,cool game . bunch level find golden egg . supe...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . hr fun . grandkids love . ...,1,1


In [87]:
print(confusion_matrix(token_df.Positive, token_df.sentiment))

[[ 333  736]
 [ 345 3584]]


In [88]:
print(classification_report(token_df.Positive, token_df.sentiment))

              precision    recall  f1-score   support

           0       0.49      0.31      0.38      1069
           1       0.83      0.91      0.87      3929

    accuracy                           0.78      4998
   macro avg       0.66      0.61      0.63      4998
weighted avg       0.76      0.78      0.76      4998



In [105]:
from sklearn.feature_extraction.text import CountVectorizer

feature = token_df.reviewText
label = token_df.Positive

tfidf = CountVectorizer()
tfidf.fit(feature)
vectors = tfidf.transform(feature)
bow = pd.DataFrame(vectors.toarray(), columns=tfidf.get_feature_names_out())
bow.shape

(4998, 7165)

In [106]:
print(token_df.shape, bow.shape, label.shape)

(4998, 3) (4998, 7165) (4998,)


In [110]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


x = bow
y = label.values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=123)
nb_model = MultinomialNB()

nb_model.fit(x_train, y_train)
predictions = nb_model.predict(x_test)

In [111]:
print(confusion_matrix(y_test, predictions))

[[ 177  118]
 [  93 1112]]


In [112]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.66      0.60      0.63       295
           1       0.90      0.92      0.91      1205

    accuracy                           0.86      1500
   macro avg       0.78      0.76      0.77      1500
weighted avg       0.86      0.86      0.86      1500

