# Importation des bibliothèques

In [1]:
import utils as ut
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, f1_score

import numpy as np
import pandas as pd
import os

[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anyes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Chargement des données Train

In [2]:
# Directory holding the labelled training corpus (relative to the notebook).
path = "../datasets/movies/movies1000/"
# load_movies returns two values: the review texts and their labels
# (both are used below as columns of the same DataFrame).
(alltxts, alllabs) = ut.load_movies(path)

In [3]:
# Gather the training texts and their labels into one two-column frame
# (column order: text, label).
movies_df = pd.DataFrame({'text': alltxts, 'label': alllabs})

# Last expression of the cell: rendered by the notebook as a table.
movies_df

Unnamed: 0,text,label
0,the kids in the hall are an acquired taste . \...,0
1,capsule : a science fiction allegory . \nat th...,0
2,there is a rule when it comes to movies . \na ...,0
3,it's amazing how a comedian can have the some ...,0
4,"absolute power , the new film produced and dir...",0
...,...,...
1995,"as i walked out of crouching tiger , hidden dr...",1
1996,"when andy leaves for cowboy camp , his mother ...",1
1997,plot : a bunch of bad guys dressed up as elvis...,1
1998,trees lounge is the directoral debut from one ...,1


# Chargement des données Test

In [4]:
# File holding the test corpus.
# NOTE: this rebinds `path`, which pointed at the training directory above;
# re-running the training-load cell after this one requires re-running its
# own `path = ...` line first (it does, since both live in the same cell).
path = "../datasets/movies/testSentiment.txt"
# Loaded through the project helper; the result is later passed directly to
# the fitted vectorizer's transform().
movies_test = ut.load_movies_test(path)

In [5]:
# Sanity check: number of test documents that were loaded.
print(f"{len(movies_test)}")

25000


# TF-IDF

In [6]:
def preprocessor(doc):
    """Clean a single raw document before it is tokenised by the vectorizer.

    The project helpers are applied in this order:
    ``ut.remove_ponctuation`` -> ``ut.remove_numbers`` ->
    ``ut.suppression_balises_html`` -> ``ut.lemmatization``.

    NOTE(review): punctuation is removed *before* HTML tags are stripped; if
    ``remove_ponctuation`` also drops ``<`` / ``>``, the HTML step may no
    longer recognise tags. Confirm against the helpers in ``utils``.
    """
    cleaned = ut.remove_ponctuation(doc)
    cleaned = ut.remove_numbers(cleaned)
    cleaned = ut.suppression_balises_html(cleaned)
    return ut.lemmatization(cleaned)


# Keyword arguments forwarded to TfidfVectorizer.
# NOTE(review): a custom preprocessor is passed alongside these, so the
# `lowercase` flag is presumably not what controls casing here - verify.
tfidf_params = dict(
    max_df=0.3,
    min_df=5,
    ngram_range=(1, 2),
    binary=True,
    lowercase=False,
    use_idf=True,
    sublinear_tf=True,
    max_features=20000,
)

# Keyword arguments forwarded to XGBClassifier.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=6,
)


In [7]:
# Vectorizer configured with the custom cleaning function and the TF-IDF
# settings defined in the previous cell.
tf_idf_vect = TfidfVectorizer(preprocessor=preprocessor, **tfidf_params)

# The vectorizer is fitted on the training texts only ...
X_train = tf_idf_vect.fit_transform(movies_df['text'])
y_train = movies_df['label']

# ... and then merely applied to the test documents, so they are encoded
# with the representation learned from the training set.
X_test = tf_idf_vect.transform(movies_test)

# Classification avec XGBoost

In [8]:
# NOTE: the variable keeps its historical `lr_` prefix, but the estimator is
# an XGBoost classifier built from `xgb_params`.
lr_cls = XGBClassifier(**xgb_params)
# Train on the TF-IDF matrix and the training labels.
lr_cls.fit(X=X_train, y=y_train)

# Prédiction

In [9]:
# Predict a class for every vectorized test document with the fitted
# classifier; the result is compared against 1 in the export cell below.
y_pred = lr_cls.predict(X_test)

In [10]:

# Convert the numeric predictions into the letter codes written to the
# output file: class 1 becomes 'P', every other value becomes 'N'
# (presumably positive / negative sentiment - confirm with the task spec).
y_pred_N_P = np.where(y_pred == 1, 'P', 'N')

# np.savetxt does not create missing directories, so make sure the output
# folder exists; otherwise a fresh checkout fails here after the whole
# (slow) training run has completed.
pred_file = "./predictions/pred_v2.txt"
os.makedirs(os.path.dirname(pred_file), exist_ok=True)

# One label per line, written as plain strings.
np.savetxt(pred_file, y_pred_N_P, fmt='%s')
