### 1. TF-IDF

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

df = pd.read_csv("Reviews.csv")

#only preserve the first 10000 rows
df = df[:10000]

#only preserve the 'Score' and 'Text' columns
df = df[['Score', 'Text']]

# Convert the values in the "Score" column that are greater than or equal to 4 to 1, and the rest to 0 (1: positive, 0: negative)
df['Score'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

#Split the text in the "Text" column using a delimiter
df['Text'] = df['Text'].str.split()

#Remove stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
df['Text'] = df['Text'].apply(lambda x: [word for word in x if word not in stop_words])

#Text mining preprocessing, converting text into vectors, implement tf-idf (sklearn.feature_extraction.text.TfidfVectorizer)
tfidf = TfidfVectorizer()
df['Text'] = df['Text'].apply(lambda x: ' '.join(x))

#Apply tf-idf to the "Text" column
tfidf_matrix = tfidf.fit_transform(df['Text'])
tfidf_matrix

#Use Random Forest Classifier (TF-IDF)
clf = RandomForestClassifier()
clf.fit(tfidf_matrix, df['Score'])

#Perform k-fold cross-validation and calculate the accuracy for k=4
scores = cross_val_score(clf, tfidf_matrix, df['Score'], cv=4, scoring='accuracy')
print(f'Cross-validation scores: {scores}')
print(f'Average accuracy: {scores.mean():.4f}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win7-006\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cross-validation scores: [0.7904 0.792  0.7936 0.8012]
Average accuracy: 0.7943


### 2. Word2Vec

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Load the dataset
df = pd.read_csv("Reviews.csv")

# Preserve only the first 10000 rows and select 'Score' and 'Text' columns
df = df[['Score', 'Text']].head(10000)

# Convert 'Score' to binary (1 for positive, 0 for negative sentiment)
df['Score'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

# Download necessary NLTK resources
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Preprocess the text (tokenization and stopword removal)
df['Text'] = df['Text'].apply(lambda x: [word for word in x.lower().split() if word not in stop_words])

# Train Word2Vec model on the tokenized text
w2v = Word2Vec(df['Text'], vector_size=100, window=5, min_count=2)

# Compute the average Word2Vec vector for each document
def document_vector(text):
    # Filter words that are in the Word2Vec model's vocabulary
    words = [word for word in text if word in w2v.wv]
    if len(words) == 0:  # If none of the words are in the vocabulary, return a zero vector
        return np.zeros(100)
    # Average the word vectors
    return np.mean(w2v.wv[words], axis=0)

# Apply the function to create a document vector for each review
w2v_matrix = np.vstack(df['Text'].apply(document_vector))

# Use Random Forest Classifier on the Word2Vec vectors
clf = RandomForestClassifier()
clf.fit(w2v_matrix, df['Score'])

# Perform 4-fold cross-validation and print the accuracy
scores = cross_val_score(clf, w2v_matrix, df['Score'], cv=4, scoring='accuracy')
print(f'Cross-validation scores: {scores}')
print(f'Average accuracy: {scores.mean():.4f}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win7-006\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cross-validation scores: [0.7676 0.7624 0.7548 0.7596]
Average accuracy: 0.7611
