In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
data = pd.read_excel('/content/IMDB_Dataset_sample.xlsx')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [6]:
# Train the Word2Vec model
# we are calling each sentence from X_train
# we are spliting :  it becomes words
# we have 40k lists are there
# each list has words
sentences = [sentence.split() for sentence in X_train]
len(sentences)

891

In [7]:
len(sentences[0])

69

In [8]:

w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4)
w2v_model
# default vector size =100

<gensim.models.word2vec.Word2Vec at 0x7d5d41bd3760>

In [9]:
 len(w2v_model.wv['king'])

100

In [10]:
w2v_model.corpus_total_words

108286

In [11]:
w2v_model.wv['good']

array([-0.47121397,  0.53321797,  0.17737779,  0.10502806,  0.18202044,
       -0.96742344,  0.21887799,  0.95305985, -0.43076065, -0.43004873,
       -0.43316227, -0.8889222 , -0.13134883,  0.24425916,  0.23004141,
       -0.4402946 ,  0.16121083, -0.50408846,  0.11796171, -0.9046688 ,
        0.37909985,  0.19123016,  0.54936516, -0.18014318, -0.12716937,
        0.29753062, -0.3910463 , -0.01090024, -0.50017756,  0.0937112 ,
        0.77407014,  0.1154747 ,  0.03816618, -0.5606551 , -0.21365766,
        0.4472977 , -0.10402785, -0.4047231 , -0.4090116 , -0.95374274,
        0.11766389, -0.49192047, -0.07744063, -0.06694867,  0.55791193,
       -0.32085195, -0.4461659 , -0.01064048,  0.19318528,  0.39537653,
        0.23345898, -0.6495619 , -0.29875797, -0.17107008, -0.45810527,
        0.46715084,  0.35699263,  0.13146962, -0.5240319 ,  0.16657676,
        0.32015938,  0.2257052 , -0.10219441,  0.07432225, -0.6735893 ,
        0.4548207 ,  0.10444452,  0.31848592, -0.5984299 ,  0.65

In [12]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [13]:
X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

In [14]:
X_train

array([[-0.30522043,  0.3433368 ,  0.10476334, ..., -0.48458737,
         0.15036666, -0.15692307],
       [-0.30324468,  0.34280136,  0.1033288 , ..., -0.48052835,
         0.14891775, -0.15569067],
       [-0.34640148,  0.3897256 ,  0.11938021, ..., -0.5505515 ,
         0.17121273, -0.17925556],
       ...,
       [-0.28481975,  0.31743425,  0.09643262, ..., -0.44734457,
         0.13829128, -0.14557323],
       [-0.29390788,  0.33164886,  0.10115124, ..., -0.46591237,
         0.14310603, -0.14947583],
       [-0.31120834,  0.35228005,  0.10843402, ..., -0.49303642,
         0.1531043 , -0.16033573]], dtype=float32)

In [15]:
len(X_train[0]),len(X_train)

(100, 891)

In [17]:
# Train a classification model
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Evaluate the model
y_pred = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label='positive'))
print('Recall:', recall_score(y_test, y_pred, pos_label='positive'))
print('F1 score:', f1_score(y_test, y_pred, pos_label='positive'))

Accuracy: 0.547085201793722
Precision: 0.5533980582524272
Recall: 0.5089285714285714
F1 score: 0.5302325581395348


In [18]:
X_test='movie direction is good, but not a good story'
X_test = preprocess(X_test)
X_test = np.array([vectorize(X_test)])
print(len(X_test))
clf.predict(X_test)

1


array(['negative'], dtype=object)

In [20]:
# problem ======= solution approach
# text  preprocess vectors

# vectors :  word2vec   glove  fasttext   bert   gpt