In [4]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


In [5]:

imdb_data = pd.read_csv('imdb_dataset.csv')


vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(imdb_data['review']).toarray()
y = imdb_data['sentiment']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [6]:
sentences = [review.split() for review in imdb_data['review']]
word2vec_model = Word2Vec(sentences, min_count=1)



In [7]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector


In [8]:
train_wordvec_arrays = np.zeros((len(X_train), 100), dtype="float64")
test_wordvec_arrays = np.zeros((len(X_test), 100), dtype="float64")
vocab = set(word2vec_model.wv.key_to_index.keys())
for i in range(len(X_train)):
    train_wordvec_arrays[i] = average_word_vectors(X_train[i], word2vec_model, vocab, 100)
for i in range(len(X_test)):
    test_wordvec_arrays[i] = average_word_vectors(X_test[i], word2vec_model, vocab, 100)


In [9]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(train_wordvec_arrays, y_train)


In [10]:
y_pred = clf.predict(test_wordvec_arrays)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4961
