# Importando Bibliotecas

In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier


# Inicializando Dataframes

In [2]:
df = pd.read_csv('exame_cmc13_dados_treinamento.csv', sep=';')
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'user_id', 'age', 'isbn', 'rating',
       'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_l', 'Language', 'Category', 'city', 'state', 'country'],
      dtype='object')

# Tratando Dados

Vemos campos que não são mencionados e que, provavelmente, se referem apenas a identificações internas dos livros: 'Unnamed: 0.1' e 'Unnamed: 0'. Vamos excluí-los.

In [3]:
df = df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)

Vamos começar a exluir colunas inúteis para a análise. Em um primeiro momento, podemos excluir as colunas de identificação do usuário:

In [4]:
df = df.drop(['user_id'], axis=1)

Agora, vamos filtrar as localidades. Para simplificar a análise, contaremos que leitores de um mesmo país têm gostos semelhantes, excluindo a necessidade de identificadores de cidades e estados. Isso também é permitido pelo fato de haver uma variedade de países:

In [5]:
df['country'].value_counts()

usa                102798
canada              12113
united kingdom       2577
australia            1631
germany              1077
                    ...  
mozambique              1
new jersey, usa         1
ontario, canada         1
iowa, usa               1
haiti                   1
Name: country, Length: 187, dtype: int64

Existem 116 países no dataset. Excluindo colunas referentes a localidades:

In [6]:
df = df.drop(['city', 'state'], axis=1)

Análise da linguagem dos livros:

In [7]:
df['Language'].value_counts()

en    87209
9     43970
Name: Language, dtype: int64

Veja que apenas duas classificações foram categorizadas: en (inglês) e 9 (provavelmente um placeholder ou erro de obtenção de dados). Como a análise desses dois classificadores não nos fornece tanta informação, é razoável excluir tal coluna.

In [8]:
df = df.drop(['Language'], axis=1)

Podemos excluir as colunas 'isbn' porque se refere a uma identificação do livro, redundante com o título, e 'img_l', pois, apesar de a capa certamente ser importante para a escolha de um livro, apenas o link da imagem não adiciona tanto à análise.

In [9]:
df = df.drop(['isbn', 'img_l'], axis=1)

Veja que as colunas de idade e ano de publicação estão sendo tratadas como floats. Convertendo ambas para int:

In [10]:
df['age'] = df['age'].astype(int)
df['year_of_publication'] = df['year_of_publication'].astype(int)

Entretanto, ainda devem existir colunas com valores nulos. Vamos contá-las:

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131179 entries, 0 to 131178
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   age                  131179 non-null  int32 
 1   rating               131179 non-null  int64 
 2   book_title           131179 non-null  object
 3   book_author          131179 non-null  object
 4   year_of_publication  131179 non-null  int32 
 5   publisher            131179 non-null  object
 6   Category             131179 non-null  object
 7   country              126615 non-null  object
dtypes: int32(2), int64(1), object(5)
memory usage: 7.0+ MB


Veja que apenas 126615 linhas de 'country' tem valores não nulos, contra 131179 das outras colunas. Como são apenas 4564 de 130000 dados, representando aproximadamente 3% do dataset, podemos excluí-los sem grande perda:

In [12]:
df = df.dropna()

# Construção do modelo MLP

In [13]:
X_train = df.drop(['rating'], axis = 1)
Y_train = df['rating']

In [14]:
X_train

Unnamed: 0,age,book_title,book_author,year_of_publication,publisher,Category,country
0,34,The Testament,John Grisham,1999,Dell,['Fiction'],usa
1,34,The Testament,John Grisham,1999,Dell,['Fiction'],usa
2,35,The Testament,John Grisham,1999,Dell,['Fiction'],usa
3,29,The Testament,John Grisham,1999,Dell,['Fiction'],usa
4,31,The Testament,John Grisham,1999,Dell,['Fiction'],usa
...,...,...,...,...,...,...,...
131174,25,SEAT OF THE SOUL,Gary Zukav,1990,Free Press,"['Body, Mind & Spirit']",usa
131175,62,SEAT OF THE SOUL,Gary Zukav,1990,Free Press,"['Body, Mind & Spirit']",usa
131176,53,SEAT OF THE SOUL,Gary Zukav,1990,Free Press,"['Body, Mind & Spirit']",usa
131177,37,SEAT OF THE SOUL,Gary Zukav,1990,Free Press,"['Body, Mind & Spirit']",usa


In [15]:
X_train.columns.array

<PandasArray>
[                'age',          'book_title',         'book_author',
 'year_of_publication',           'publisher',            'Category',
             'country']
Length: 7, dtype: object

In [16]:
for column_name in X_train.columns:
    if X_train[column_name].dtype == object:
        X_train = pd.get_dummies(X_train, columns=[column_name])
    else:
        pass
# X_train = pd.get_dummies(X_train, columns = ['book_title', 'book_author', 'publisher', 'Category', 'country'])

In [17]:
X_train

Unnamed: 0,age,year_of_publication,book_title_1984,book_title_1st to Die: A Novel,book_title_2nd Chance,book_title_4 Blondes,book_title_A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,book_title_A Bend in the Road,book_title_A Case of Need,"book_title_A Child Called \It\"": One Child's Courage to Survive""",...,country_wales,"country_washington, usa","country_west indies, tobago","country_west yorkshire, united kingdom",country_worcester,country_ysa,"country_yu-song, guam-dong, 626-1, 302, south korea",country_yugoslavia,country_zambia,country_zimbabwe
0,34,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,35,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,31,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131174,25,1990,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131175,62,1990,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131176,53,1990,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131177,37,1990,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Carregando os dados de teste:

In [18]:
df_test = pd.read_csv('exame_cmc13_dados_teste.csv', sep=';')

Aplicando o mesmo tratamento feito aos dados de treino:

In [19]:
df_test = df_test.drop(['Unnamed: 0.1', 'Unnamed: 0', 'user_id', 'city', 'state', 'Language', 'isbn', 'img_l'], axis=1)
df_test['age'] = df_test['age'].astype(int)
df_test['year_of_publication'] = df_test['year_of_publication'].astype(int)

X_test = df_test.drop(['rating'], axis = 1)
Y_test = df_test['rating']

for column_name in X_test.columns:
    if X_test[column_name].dtype == object:
        X_test = pd.get_dummies(X_test, columns=[column_name])
    else:
        pass

In [20]:
X_test

Unnamed: 0,age,year_of_publication,book_title_1984,book_title_1st to Die: A Novel,book_title_2nd Chance,book_title_4 Blondes,book_title_A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,book_title_A Bend in the Road,book_title_A Case of Need,"book_title_A Child Called \It\"": One Child's Courage to Survive""",...,country_united kingdom,country_united state,country_united states,country_universe,country_us,country_usa,country_van wert,country_venezuela,"country_virginia, usa","country_zapopan, mexico"
0,35,2002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,34,2002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,33,1989,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,33,1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,44,1999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32790,59,2002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
32791,34,1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
32792,25,1996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
32793,55,2002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Vamos testar qual o número de camadas que otimiza a classificação. Para isso, façamos uma lista com as porcentagens de acerto:

In [21]:
clf = MLPClassifier(
    hidden_layer_sizes=(150, 150),
    max_iter=500,
    activation='relu',
    solver='adam',
    random_state=1
).fit(X_train,Y_train)

Testando a precisão do modelo:

In [23]:
clf.score(X_train, Y_train)

0.593010306835683

# Construção da Árvore de Decisão:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [None]:
parametros = {'criterion': ['gini', 'entropy'],
             'splitter': ['best','random'],
             'min_samples_split': [2,5,10],
             'min_samples_leaf':[1,5,10]}

In [None]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parametros)
grid_search.fit(X_train, Y_train)

In [None]:
melhores_parametros = grid_search.best_params_

In [None]:
tree_books = DecisionTreeClassifier(criterion="entropy", min_samples_leaf = 10, min_samples_split = 5 , splitter = "random")

In [None]:
tree_books.fit(X_train, Y_train)

In [None]:
melhor_resultado = grid_search.best_score_

In [None]:
tree_books.score(X_test, Y_test)

In [None]:
Y_pred = tree_books.predict(X_test)

In [None]:
Y_pred_train = tree_books.predict(X_train)
print(metrics.accuracy_score(Y_train, Y_pred_train))
print(metrics.cohen_kappa_score(Y_train, Y_pred_train))

In [None]:
print(pd.crosstab(Y_train, Y_pred_train, rownames=['Data de teste'], colnames=['Previsão']))

In [None]:
print(metrics.classification_report(Y_train, Y_pred_train))

### Análise dos dados de teste

In [None]:
print(metrics.accuracy_score(Y_test, Y_pred))
print(metrics.cohen_kappa_score(Y_test, Y_pred))

In [None]:
print(pd.crosstab(Y_test, Y_pred, rownames=['Data de teste'], colnames=['Previsão']))

In [None]:
print(metrics.classification_report(Y_test, Y_pred))

# Construção da Random Forest

Importando bibliotecas

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Criando classificador e treino

In [None]:
forest = RandomForestClassifier(n_estimators=100,criterion='entropy',max_features=None)
forest.fit(X_train, Y_train)

Análise dos dados de treino:

In [None]:
Y_pred_treino = forest.predict(X_train)
print(metrics.accuracy_score(Y_train, Y_pred_treino))
print(metrics.cohen_kappa_score(Y_train, Y_pred_treino))

In [None]:
print(pd.crosstab(Y_train, Y_pred_treino, rownames=['Dados de teste'], colnames=['Previsão']))

In [None]:
print(metrics.classification_report(Y_train, Y_pred_treino))

Análise dos dados de teste:

In [None]:
Y_pred = forest.predict(X_test)
print(metrics.accuracy_score(Y_test, Y_pred))
print(metrics.cohen_kappa_score(Y_test, Y_pred))

In [None]:
print(pd.crosstab(Y_test, Y_pred, rownames=['Dados de teste'], colnames=['Previsão']))

In [None]:
print(metrics.classification_report(Y_test, Y_pred))

Análise da importância de cada atributo

In [None]:
for i in range(len(X_test.columns)):
	print("A importância do atributo",(X_test.columns)[i]," é de ", round(forest.feature_importances_[i] * 100,2),"%.")