# Imports

In [270]:
import numpy as np 
import pandas as pd
import os
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Read dataset

In [271]:
df=pd.read_csv("/kaggle/input/movie-review/movie_review.csv")
df.drop(["fold_id","cv_tag","html_id","sent_id"],axis=1,inplace=True)
df.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


# Pre-processing des données textuelles :

In [272]:
def Preprocessing(df,colname):
    STOPWORDS=set(stopwords.words('english'))
    PUNCT_TO_REMOVE = string.punctuation
    
    df[colname] = df[colname].str.lower()
    df[colname] = df[colname].apply(lambda x: " ".join([word for word in str(x).split() if word not in STOPWORDS]))
    df[colname] = df[colname].apply(lambda text: text.translate(str.maketrans('', '', PUNCT_TO_REMOVE)))

Preprocessing(df,'text')
df.head()

Unnamed: 0,text,tag
0,films adapted comic books plenty success whet...,pos
1,starters created alan moore eddie campbell ...,pos
2,say moore campbell thoroughly researched subje...,pos
3,book graphic novel 500 pages long include...,pos
4,words dismiss film source,pos


# Entraînement du modèle Word2Vec :

In [273]:
text_list = df['text'].apply(lambda x: x.split()).tolist()
model = Word2Vec(text_list, vector_size=100, window=5, min_count=1, workers=4)
print(model.wv)

KeyedVectors<vector_size=100, 47498 keys>


# Vectorisation des reviews de movies :

In [274]:
reviews_list = df['text'].apply(lambda x: x.split()).tolist()
def moyenne_Word2Vec(review,model,vector_size):
    vectors =[]
    for token in review:
        if token in model.wv:
            vectors.append(model.wv[token])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

review_vectors = [moyenne_Word2Vec(tokens,model,vector_size=100) for tokens in reviews_list]

# Division des données :

In [275]:
X=review_vectors
Y = df['tag']
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

# Construction d&#39;un classificateur :

In [276]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Évaluation du modèle :

In [277]:
logistic_model.predict(X_test)

array(['neg', 'pos', 'pos', ..., 'pos', 'pos', 'pos'], dtype=object)

In [278]:
logistic_model.score(X_test,y_test)

0.5698393077873919

In [280]:
y_pred=logistic_model.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

precision = precision_score(y_test,y_pred,pos_label='pos')  
print("Precision:",precision)

recall = recall_score(y_test,y_pred,pos_label='pos')
print("Recall:",recall)

f1 = f1_score(y_test, y_pred,pos_label='pos')
print("F1-score:",f1)

Accuracy: 0.5698393077873919
Precision: 0.565631843359526
Recall: 0.6666160619401852
F1-score: 0.61198606271777
