In [1]:
import pandas as pd

df = pd.read_csv("Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
#only preserve the first 10000 rows
df = df[:10000]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Id                      10000 non-null  int64 
 1   ProductId               10000 non-null  object
 2   UserId                  10000 non-null  object
 3   ProfileName             10000 non-null  object
 4   HelpfulnessNumerator    10000 non-null  int64 
 5   HelpfulnessDenominator  10000 non-null  int64 
 6   Score                   10000 non-null  int64 
 7   Time                    10000 non-null  int64 
 8   Summary                 10000 non-null  object
 9   Text                    10000 non-null  object
dtypes: int64(5), object(5)
memory usage: 781.4+ KB


In [3]:
#only preserve the 'Score' and 'Text' columns
df = df[['Score', 'Text']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Score   10000 non-null  int64 
 1   Text    10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [4]:
#check if there are any missing values
df.isnull().sum()

Score    0
Text     0
dtype: int64

In [5]:
# Convert the values in the "Score" column that are greater than or equal to 4 to 1, and the rest to 0 (1: positive, 0: negative)
df['Score'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)
df.head()

Unnamed: 0,Score,Text
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price. There was a wid...


In [6]:
#Split the text in the "Text" column using a delimiter
df['Text'] = df['Text'].str.split()
df.head()

Unnamed: 0,Score,Text
0,1,"[I, have, bought, several, of, the, Vitality, ..."
1,0,"[Product, arrived, labeled, as, Jumbo, Salted,..."
2,1,"[This, is, a, confection, that, has, been, aro..."
3,0,"[If, you, are, looking, for, the, secret, ingr..."
4,1,"[Great, taffy, at, a, great, price., There, wa..."


In [12]:
#Remove stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
df['Text'] = df['Text'].apply(lambda x: [word for word in x if word not in stop_words])
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win7-006\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Score,Text
0,1,"[I, bought, several, Vitality, canned, dog, fo..."
1,0,"[Product, arrived, labeled, Jumbo, Salted, Pea..."
2,1,"[This, confection, around, centuries., It, lig..."
3,0,"[If, looking, secret, ingredient, Robitussin, ..."
4,1,"[Great, taffy, great, price., There, wide, ass..."


In [15]:
#Text mining preprocessing, converting text into vectors, implement tf-idf (sklearn.feature_extraction.text.TfidfVectorizer)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()


df['Text'] = df['Text'].apply(lambda x: ' '.join(x))
df.head()

#Apply tf-idf to the "Text" column
tfidf_matrix = tfidf.fit_transform(df['Text'])
tfidf_matrix

<10000x18793 sparse matrix of type '<class 'numpy.float64'>'
	with 364923 stored elements in Compressed Sparse Row format>

In [16]:
#Use Random Forest Classifier (TF-IDF)
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(tfidf_matrix, df['Score'])

In [18]:
#Perform k-fold cross-validation and calculate the accuracy for k=4
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, tfidf_matrix, df['Score'], cv=4, scoring='accuracy')
print(f'Cross-validation scores: {scores}')
print(f'Average accuracy: {scores.mean():.4f}')

Cross-validation scores: [0.7872 0.7948 0.7936 0.7924]
Average accuracy: 0.7920
