In [9]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.6.3 (default, Oct  3 2017, 21:45:48) 
[GCC 7.2.0]
NLTK: 3.4.4
Scikit-learn: 0.21.3
Pandas: 0.25.0
Numpy: 1.17.0


In [10]:
# Carregando dados
import pandas as pd
import numpy as np

df = pd.read_csv('../../data/big/dataset_multi.txt', header=None, encoding='utf-8', sep='\t')
df = df.dropna()
df = df.groupby(by=0).apply(lambda x: x.sample(df.groupby(by=0).size().min()))
df = df.sample(frac=1)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9740 entries, (administrativo, 214401) to (tributário, 191815)
Data columns (total 2 columns):
0    9740 non-null object
1    9740 non-null object
dtypes: object(2)
memory usage: 256.9+ KB
None
                                     0  \
0                                        
administrativo 214401   administrativo   
previdenciário 876919   previdenciário   
outros         1288379          outros   
previdenciário 656233   previdenciário   
administrativo 664101   administrativo   

                                                                        1  
0                                                                          
administrativo 214401   indigit alegaca funcion ilegal mencion radiodi...  
previdenciário 876919   levant rend inicial auxiliodoenc segur acomet ...  
outros         1288379  part recorrent obrig efetu deposit legal integ...  
previdenciário 656233   novembr efet restabelec refer categor exclu de...  
admin

In [11]:
# verificação das classes dos acórdãos

classes = df[0]
print(classes.value_counts())

ambiental         974
constitucional    974
administrativo    974
internacional     974
outros            974
penal             974
processual        974
civil             974
previdenciário    974
tributário        974
Name: 0, dtype: int64


In [12]:
from sklearn.preprocessing import LabelEncoder

# convertendo classes em binário
encoder = LabelEncoder()
binary = encoder.fit_transform(classes)

print(binary[:10])

[0 7 5 7 0 3 7 7 5 8]


In [13]:
# criação da lista de ementas
ementas = df[1]
print(ementas[:10])

administrativo  214401     indigit alegaca funcion ilegal mencion radiodi...
previdenciário  876919     levant rend inicial auxiliodoenc segur acomet ...
outros          1288379    part recorrent obrig efetu deposit legal integ...
previdenciário  656233     novembr efet restabelec refer categor exclu de...
administrativo  664101     julg profer tribunal quo nao possu contradica ...
constitucional  1284883    caracteriz contrat trabalh evidenc empreg publ...
previdenciário  184184     calcul rend mensal inicial benefici dev ser ut...
                837998     autor aposentador invalidez revist cancel proc...
outros          790434     fixaca penabas sao analis oit circunstanc judi...
processual      340685     present entend superior tribunal justic credor...
Name: 1, dtype: object


In [14]:
# The following script uses the bag of words model to convert text documents into 
# corresponding numerical features:

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)
X = vectorizer.fit_transform(ementas).toarray()

print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
#  The following script directly convert text documents into TFIDF feature values 
# (without first converting documents to bag of words features)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
X = tfidfconverter.fit_transform(ementas).toarray()

print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [16]:
# Like any other supervised machine learning problem, we need to divide data into 
# 20% test set and 80% training set

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


X_train, X_test, y_train, y_test = train_test_split(X, binary, test_size=0.2, random_state=0)

print(X_train)


[[0.         0.         0.         ... 0.         0.06588102 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Modelos para treinar
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced'),
    LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200),
#     SGDClassifier(max_iter = 100, tol=1e-3),
#     MultinomialNB(),
#     SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, classifier in models:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 16.27310061601643
Decision Tree Accuracy: 47.84394250513347
Random Forest Accuracy: 62.83367556468173
Logistic Regression Accuracy: 61.139630390143736


In [18]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced'),
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    SGDClassifier(max_iter = 100, tol=1e-3),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
zipped_models_1 = models[:]
zipped_models_2 = list(models)


votingClassifier = VotingClassifier(estimators = zipped_models_2, voting = 'hard', n_jobs = -1)
votingClassifier.fit(X_train, y_train)
y_pred = votingClassifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 61.90965092402464


In [19]:
# relatório de classificação

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 90   6  30  15   0   7   5  13  14   8]
 [  8 147   5   4   2   3  19   2   4   0]
 [ 32   6  93   6   4  16   5   3  23   4]
 [ 22   2   7  95   1  15   6  12   5  23]
 [  2   6   7   0 171   4  27   1   1   2]
 [ 12   2  16  10   1 117  11   3   7   5]
 [  5   4   3   3  15   8 151   0   1   2]
 [ 12   1   5   4   0  16   0 151   7   3]
 [ 21   3  21  13   0   3   1  25  58  38]
 [  9   3   8  28   0   2   2   1  21 133]]
              precision    recall  f1-score   support

           0       0.42      0.48      0.45       188
           1       0.82      0.76      0.79       194
           2       0.48      0.48      0.48       192
           3       0.53      0.51      0.52       188
           4       0.88      0.77      0.82       221
           5       0.61      0.64      0.62       184
           6       0.67      0.79      0.72       192
           7       0.72      0.76      0.74       199
           8       0.41      0.32      0.36       183
           9       0.61     