Library imports

In [1]:
from logging import warning
import pandas as pd 
import numpy as np
import re
import nltk
import heapq
# NLP tools
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
# Machine Learning Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes  import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
# Hold out / Grid / Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
# Metrics 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# .Config 
warning("ignore")
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rícharde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rícharde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Utilize o arquivo moviesreviews.tsv (sep = ‘\t’).

In [2]:
reviews_df = pd.read_csv(r"C:\Users\Rícharde\Documents\Dell lead\Atividade 06\moviereviews.tsv", sep='\t')

In [3]:
reviews_df.head(10)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
5,neg,"to put it bluntly , ed wood would have been pr..."
6,neg,"synopsis : melissa , a mentally-disturbed woma..."
7,neg,tim robbins and martin lawernce team up in thi...
8,neg,"in "" gia "" , angelina jolie plays the titular ..."
9,neg,"in 1990 , the surprise success an unheralded l..."


# Faça pré-processamento dos dados.


DataSet analysys 

In [4]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [5]:
reviews_df.isnull().sum()

label      0
review    35
dtype: int64

In [6]:
reviews_df['label'].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [7]:
# Null values position
# creating bool series True for NaN values 
bool_series = pd.isnull(reviews_df["review"]) 
    
# filtering data 
# displaying data only with Gender = NaN 
reviews_df[bool_series]

Unnamed: 0,label,review
140,pos,
208,pos,
270,neg,
334,neg,
448,neg,
522,neg,
606,pos,
696,neg,
728,pos,
738,neg,


In [8]:
reviews_df = reviews_df.dropna()
print(reviews_df.isnull().sum())

label     0
review    0
dtype: int64


In [9]:
reviews_df['label'].value_counts()

neg    983
pos    982
Name: label, dtype: int64

Label transform 

In [10]:
# Label transform 
data = LabelBinarizer().fit_transform(reviews_df["label"])
reviews_df["label"] = data

Tokenizing sentences

In [11]:
def Tokenizing_Sentences(Data,colun_term):
    sentences = Data[colun_term]
    Iloc = 0
    for sentece in sentences:
        tonken_text = word_tokenize(sentece,language="english",preserve_line=False)
        Data[colun_term].iloc[Iloc] = tonken_text
        Iloc+=1
    return Data

In [12]:
reviews_df = Tokenizing_Sentences(reviews_df,'review')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Data[colun_term].iloc[Iloc] = tonken_text


Removing Stop Words

In [13]:
stop_Words = set(stopwords.words("english"))

In [14]:
def Verify_Stop_Words(sentence):
    text = list(sentence)
    for word in stop_Words:    
        if text.__contains__(word):
            text.remove(word)
    return text

In [15]:
sentences = reviews_df["review"]
Iloc = 0
for sentence in sentences:
    text = Verify_Stop_Words(sentence)
    reviews_df["review"].iloc[Iloc] = text
    Iloc +=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["review"].iloc[Iloc] = text


Removing Punctuation and special caracter

In [16]:
sentences = reviews_df["review"]
Iloc = 0
for sentence in sentences:
    text = re.sub(r'[^\w\s]', '',str(sentence))
    reviews_df["review"].iloc[Iloc] = text
    Iloc +=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["review"].iloc[Iloc] = text


In [17]:
reviews_df

Unnamed: 0,label,review
0,0,films like mouse hunt get theatres nt law som...
1,0,talented actresses blessed demonstrated wide a...
2,1,extraordinary year australian films shine h...
3,1,according hollywood movies made last decades ...
4,0,first press screening 1998 already ve gotten p...
...,...,...
1995,1,like movies albert brooks i really like movie...
1996,1,might surprise know joel ethan coen brought u...
1997,1,verdict spinechilling drama horror maestro st...
1998,1,want correct i wrote former retrospective davi...


# Separe os dados em treino e teste.

In [18]:
# Data separation
X = reviews_df["review"]
y = reviews_df["label"]

# extração de características do texto utilizando BOW e TF-IDF

Word Count 

In [19]:
count_vec = CountVectorizer(ngram_range=(1,1))
X_count   = count_vec.fit_transform(X)

TF-IDF

In [20]:
tf = TfidfTransformer(norm="l1")
X_freq_count = tf.fit_transform(X_count)

# classificação dos dados

In [21]:
models = {}
models['kNN'] = KNeighborsClassifier(n_neighbors=3,n_jobs=-1)
models['Random Forest'] = RandomForestClassifier(n_estimators=250,random_state=42)
models['Naive Bayes'] = GaussianNB()
models['MLP_1'] = MLPClassifier(hidden_layer_sizes=(30,),activation='relu')
models['MLP_2'] = MLPClassifier(hidden_layer_sizes=(30,30),activation='relu')
models['SVM'] = SVC()
models['DT'] = DecisionTreeClassifier(random_state=42,min_samples_split=10)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_freq_count,y,test_size=0.8,random_state=42,shuffle=True)

In [23]:
model_name   = []
result_acc   = []
result_f1    = []

for actual_model in models.keys(): 
    __some_model__ = models[actual_model].fit(X_train.toarray(),y_train)
    y_predict      = models[actual_model].predict(X_test.toarray())
    # score test 
    acc_scr    = accuracy_score(y_test,y_predict)
    f1_scr     = f1_score(y_test,y_predict,average='weighted')
    # savaing information 
    model_name.append(str(actual_model))
    result_acc.append(acc_scr)
    result_f1.append(f1_scr)



Results 

In [29]:
result = pd.DataFrame({'Model name': model_name, 'Accuracy':result_acc, 'F1-Score':result_f1})
result

Unnamed: 0,Model name,Accuracy,F1-Score
0,kNN,0.506361,0.35524
1,Random Forest,0.725827,0.725583
2,Naive Bayes,0.611323,0.608609
3,MLP_1,0.770992,0.770399
4,MLP_2,0.760178,0.757241
5,SVM,0.697201,0.695872
6,DT,0.591603,0.591192


# Otimize o Classificador

Loading hyperparameters

In [31]:
# Knn model
hyper_1 = {}
hyper_1 ['classifier__n_neighbors'] = [3,9,18,25,45,55]
hyper_1 ['classifier__leaf_size'] = [5,12,20,30]
hyper_1 ['classifier__algorithm'] = ['ball_tree','kd_tree','brute']
hyper_1['classifier'] = [models['kNN']]
# Random Forest model
hyper_2 = {}
hyper_2 ['classifier__n_estimators'] = [10,20,30,50,100,200,300]
hyper_2 ['classifier__criterion'] = ['gini', 'entropy']
hyper_2 ['classifier__max_depth'] = [5,20,50,80,100]
hyper_2['classifier'] = [models['Random Forest']]
# Naive Bayes modedl
hyper_3 = {}
hyper_3 ['classifier__var_smoothing'] = [0.00001,0.001,0.1,1.60,10]
hyper_3['classifier'] = [models['Naive Bayes']]
# MLP 1
hyper_4 = {}
hyper_4 ['classifier__hidden_layer_sizes'] = [(20,),(30,30),(100,40,20)]
hyper_4 ['classifier__activation'] = ['identity','logistic','tanh','relu']
hyper_4 ['classifier__alpha'] = [0000.1, 00.1, 1, 10]
hyper_4['classifier'] = [models['MLP_1']]
# MLP 2
hyper_5 = {}
hyper_5 ['classifier__hidden_layer_sizes'] = [(20,30),(30,30,20),(100,40,20,10),(100,50,30,20,20,10)]
hyper_5 ['classifier__activation'] = ['identity','logistic','tanh','relu']
hyper_5 ['classifier__learning_rate'] = ['constant','invscaling','adaptive']
hyper_5['classifier'] = [models['MLP_2']]
# SVM model 
hyper_6 = {}
hyper_6['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
hyper_6['classifier__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
hyper_6['classifier__shrinking'] = [True,False]
hyper_6['classifier'] = [models['SVM']]
# Decision Tree Model
hyper_7 = {}
hyper_7['classifier__criterion'] = ['gini','entropy','log_loss']
hyper_7['classifier__splitter'] = ['best', 'random']
hyper_7['classifier__max_depth'] = [10,20,50,100]
hyper_7['classifier__min_samples_split'] = [2,5,10,15,25,50]
hyper_7['classifier'] = [models['DT']]


In [32]:
pipe = Pipeline([('classifier', models['kNN'])])
params = [hyper_1,hyper_2,hyper_3,hyper_4,hyper_5,hyper_6,hyper_7]

In [33]:
k_fold  = KFold(n_splits=3,random_state=42,shuffle=True)

In [34]:
gridCV  = GridSearchCV(estimator=pipe,param_grid=params,cv=k_fold,n_jobs=-1,scoring='accuracy',return_train_score=True)

In [35]:
%%time
gridCV.fit(X_train,y_train)

KeyboardInterrupt: 