**Importações**

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import ClassifierChain

**Ajustando a base de dados**

In [4]:
questions = pd.read_csv('/content/stackoverflow_perguntas.csv')
questions.head()

Unnamed: 0,Perguntas,Tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js
1,"Gostaria de fazer testes unitários no Node.js,...",node.js
2,Como inverter a ordem com que o jQuery itera u...,jquery
3,Eu tenho uma página onde pretendo utilizar um ...,html
4,Como exibir os dados retornados do FireStore e...,html angular


Pegando as tags únicas e repassando de forma númerica para o dataframe

In [5]:
tags_list = list()
for tags in questions.Tags.unique():
  for tag in tags.split():
    if tag not in tags_list:
      tags_list.append(tag)

In [7]:
def coluna_nova(listas_tags,dataframe, nome_tags):
  for tag in listas_tags:
    coluna = list()
    for linha_tag in dataframe[nome_tags]:
      if tag in linha_tag:
        coluna.append(1)
      else:
        coluna.append(0)
    dataframe[tag] = coluna
coluna_nova(tags_list, questions,'Tags')

Juntando as informações númericas das tags

In [8]:
lista_zip_tags = list(zip(questions[tags_list[0]],
                          questions[tags_list[1]],
                          questions[tags_list[2]],
                          questions[tags_list[3]],))
questions['Todas_as_tags'] = lista_zip_tags

In [9]:
questions.head()

Unnamed: 0,Perguntas,Tags,node.js,jquery,html,angular,Todas_as_tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js,1,0,0,0,"(1, 0, 0, 0)"
1,"Gostaria de fazer testes unitários no Node.js,...",node.js,1,0,0,0,"(1, 0, 0, 0)"
2,Como inverter a ordem com que o jQuery itera u...,jquery,0,1,0,0,"(0, 1, 0, 0)"
3,Eu tenho uma página onde pretendo utilizar um ...,html,0,0,1,0,"(0, 0, 1, 0)"
4,Como exibir os dados retornados do FireStore e...,html angular,0,0,1,1,"(0, 0, 1, 1)"


**Criando a base de teste e de treino**

In [10]:
X_train,X_test,y_train,y_test = train_test_split(questions.Perguntas,
                                                 questions.Todas_as_tags,
                                                 test_size = 0.2,
                                                 random_state = 321)
y_train_array = np.asarray(list(y_train))
y_test_array = np.asarray(list(y_test))

In [11]:
vector = TfidfVectorizer(max_features= 5000,max_df=0.85)
vector.fit(questions.Perguntas)
X_train_tfidf = vector.transform(X_train)
X_test_tfidf = vector.transform(X_test)

**Modelo onevsrest**

In [14]:
logistic_reg = LogisticRegression(solver= 'lbfgs')
onevsrest_classifier = OneVsRestClassifier(logistic_reg)
onevsrest_classifier.fit(X_train_tfidf,y_train_array)

OneVsRestClassifier(estimator=LogisticRegression())

**Resultados onevsrest**

In [15]:
results_onevsrest = onevsrest_classifier.score(X_test_tfidf,y_test_array)
onevsrest_predict = onevsrest_classifier.predict(X_test_tfidf)
hamming_loss_onevsrest = hamming_loss(y_test_array,onevsrest_predict)
print('Resultado onevsrestclassifier: {:.2f}%'.format(results_onevsrest * 100))
print('Hamming loss onevsrestclassifier: {:.2f}'.format(hamming_loss_onevsrest))

Resultado onevsrestclassifier: 38.82%
Hamming loss onevsrestclassifier: 0.20


**Modelo Chain**

In [16]:
chain_classifier = ClassifierChain(logistic_reg)
chain_classifier.fit(X_train_tfidf,y_train_array)

ClassifierChain(classifier=LogisticRegression(), require_dense=[True, True])

**Resultados Chain**

In [17]:
results_chain = chain_classifier.score(X_test_tfidf,y_test_array)
chain_predict = chain_classifier.predict(X_test_tfidf)
hamming_loss_chain = hamming_loss(y_test_array,chain_predict)
print('Resultado Chain classifier: {:.2f}%'.format(results_chain * 100))
print('Hamming loss Chain classifier: {:.2f}'.format(hamming_loss_chain))

Resultado Chain classifier: 49.72%
Hamming loss Chain classifier: 0.21
