In [None]:
import numpy as np
import pandas as pd
# Load the reviews CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="../input/amazon-fine-food-reviews/Reviews.csv",
)
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Tally the missing entries in every column.
df.isna().sum()

In [None]:
# Drop only the rows whose Summary is missing; nulls in any other column are
# left in place (the per-column count below shows what remains).
# Reassigning the result instead of using inplace=True keeps the step
# chainable and avoids mutating the frame in place.
df = df.dropna(subset=['Summary'])
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of review counts per Score value (one bar per discrete score).
sns.set_style('whitegrid')
sns.histplot(x='Score', data=df, discrete=True).set_title('Product Score')
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Default stopwords plus "br" and "href" (presumably leftover HTML markup in
# the review bodies -- confirm against the data).
stopwords = set(STOPWORDS) | {"br", "href"}

# Concatenate every review body into one space-separated string and build
# the cloud from it.
text = " ".join(df['Text'])
wordcloud = WordCloud(stopwords=stopwords).generate(text)

# Render the cloud as an image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Derive a binary sentiment label from the star rating:
#   Score > 3   ->  1  (positive)
#   Score < 3   -> -1  (negative)
#   Score == 3  -> row removed
#
# .copy() turns the filtered result into an independent frame, so adding the
# Sentiment column below does not trigger SettingWithCopyWarning.
df = df[df['Score'] != 3].copy()

# Vectorized labelling; every remaining row has Score != 3, so "not > 3"
# means "< 3" here.
df['Sentiment'] = np.where(df['Score'] > 3, 1, -1)
df.head()

In [None]:
# Partition the labelled reviews by sentiment class.
positive = df.loc[df['Sentiment'].eq(1)]
negative = df.loc[df['Sentiment'].eq(-1)]

In [None]:
# Rebuild the stopword set for the per-sentiment clouds. 'good' and 'great'
# are excluded as well, since they also appear in negative reviews.
stopwords = set(STOPWORDS) | {"br", "href", "good", "great"}

# Word cloud built from the summaries of positive reviews.
positive_summary = " ".join(positive['Summary'])
positive_wordcloud = WordCloud(stopwords=stopwords).generate(positive_summary)

plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the summaries of negative reviews, using the
# `stopwords` set that is already defined in the notebook namespace.
negative_summary = " ".join(negative['Summary'])
negative_wordcloud = WordCloud(stopwords=stopwords).generate(negative_summary)

plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Class balance: number of reviews per Sentiment label, with ticks only at
# the two label values.
sns.histplot(x='Sentiment', data=df, discrete=True).set_xticks([-1, 1])
plt.show()

In [None]:
# remove punctuation from Summary and Text
def remove_punctuation(text):
    """Return ``text`` with the characters ``? . ; : ! "`` stripped out.

    Only these six marks are removed; commas, apostrophes and every other
    character are kept unchanged and in their original order.
    """
    unwanted = {"?", ".", ";", ":", "!", '"'}
    kept_chars = [ch for ch in text if ch not in unwanted]
    return "".join(kept_chars)

# Strip the selected punctuation marks from both free-text columns,
# element by element.
df['Text'] = df['Text'].map(remove_punctuation)
df['Summary'] = df['Summary'].map(remove_punctuation)

In [None]:
# split the train and test data (~80% train / ~20% test)
#
# Each row gets one uniform draw from [0, 1): rows at or below 0.8 go to the
# training set, the rest to the test set. The draw has to be uniform -- with
# standard-normal samples (randn) 0.8 is not the 80th percentile, so the
# split would not be 80/20.
index = df.index
rng = np.random.default_rng(42)  # fixed seed so the split is reproducible
df['random_number'] = rng.random(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [None]:
# convert the text into a bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

# Token pattern matches any run of one or more word characters, so
# single-character tokens are counted too.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# Learn the vocabulary from the training summaries only, then encode the
# test summaries with that same vocabulary.
train_matrix = vectorizer.fit_transform(train.Summary)
test_matrix = vectorizer.transform(test.Summary)

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the liblinear solver.
model = LogisticRegression(solver='liblinear')

# Features are the bag-of-words matrices; targets are the Sentiment labels.
X_train, X_test = train_matrix, test_matrix
y_train, y_test = train['Sentiment'], test['Sentiment']

In [None]:
# Train on the training split, then predict labels for the test split.
# fit() returns the fitted estimator, so the two calls can be chained.
predictions = model.fit(X_train, y_train).predict(X_test)

In [None]:
# testing
from sklearn.metrics import confusion_matrix, classification_report
# scikit-learn's signature is (y_true, y_pred): ground truth goes first so
# that rows are the true labels and columns are the predicted labels.
confusion_matrix(y_test, predictions)

In [None]:
# Ground truth first, predictions second (y_true, y_pred); passing them the
# other way round swaps precision and recall in the report.
print(classification_report(y_test, predictions))