In [26]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.7.3 (default, Jun 24 2019, 04:54:02) 
[GCC 9.1.0]
NLTK: 3.4.1
Scikit-learn: 0.21.2
Pandas: 0.24.2
Numpy: 1.16.4


In [27]:
# Carregando dados
import pandas as pd
import numpy as np

df = pd.read_csv('data/dataset_tratado.txt', header=None, encoding='utf-8', sep='\t')
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 2 columns):
0    793 non-null bool
1    793 non-null object
dtypes: bool(1), object(1)
memory usage: 7.0+ KB
None
       0                                                  1
0  False  mand seguranc inexecuca contrat sanca afast re...
1  False  respons civil acident aer decretol n dol event...
2  False  processual civil intimaca art cpc irregular na...
3  False  process civil honorari execuca legitim part su...
4  False  administr servidor public artig lei n artig le...


In [28]:
# verificação das classes dos acórdãos
classes = df[0]
print(classes.value_counts())

False    525
True     268
Name: 0, dtype: int64


In [29]:
from sklearn.preprocessing import LabelEncoder

# convertendo classes em binário
encoder = LabelEncoder()
binary = encoder.fit_transform(classes)

print(binary[:10])

[0 0 0 0 0 0 1 1 0 0]


In [30]:
# criação da lista de ementas
ementas = df[1]
print(ementas[:10])

0    mand seguranc inexecuca contrat sanca afast re...
1    respons civil acident aer decretol n dol event...
2    processual civil intimaca art cpc irregular na...
3    process civil honorari execuca legitim part su...
4    administr servidor public artig lei n artig le...
5    impost sobr servic iss arrend mercantil incide...
6    petica receb hab corpus processual penal art c...
7    penal process penal recurs especial art incis ...
8    tribut sujeit lancament homologaca praz prescr...
9    transport intermunicipal licitaca aca anulator...
Name: 1, dtype: object


In [32]:
from nltk.tokenize import word_tokenize

# criação do BOW (bag of words)
all_words = []

for ementa in ementas:
    words = word_tokenize(ementa)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [33]:
# Exibição do número total de palavras e as 15 mais comuns
print('Número de palavras: {}'.format(len(all_words)))
print('Palavras mais comuns: {}'.format(all_words.most_common(15)))

Número de palavras: 5114
Palavras mais comuns: [('nao', 1735), ('recurs', 1056), ('agrav', 1018), ('especial', 869), ('embarg', 847), ('art', 812), ('n', 790), ('lei', 779), ('regimental', 641), ('sumul', 608), ('civil', 545), ('tribunal', 481), ('stj', 454), ('ser', 435), ('declaraca', 392)]


In [34]:
# Uso das 1500 palavras mais comuns como características
word_features = list(all_words.keys())[:1500]

In [35]:
# Função determina quais das 1500 palavras características estão contidas nas ementas

def find_features(ementa):
    words = word_tokenize(ementa)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Por exemplo
features = find_features(ementas[0])
for key, value in features.items():
    if value == True:
        print(key)

mand
seguranc
inexecuca
contrat
sanca
afast
reexam
prov
clausul
contratu
sumul
stj
pretensa
estadual
deduz
apel
extrem
esbarr
vedaco
cont
med
necessari
conjunt
faticoprobatori
aut
bem
deslind
controvers
total
inviavel
agrav
improv


In [36]:
# Aplicando à todas as ementas
ementas_zip = list(zip(ementas, binary))
zipped_list_1 = ementas_zip[:]
zipped_list_2 = list(ementas_zip) 

# embaralhando lista
seed = 1
np.random.seed = seed
np.random.shuffle(zipped_list_2)

# chamando a função find_features para cada ementa
featuresets = [(find_features(text), label) for (text, label) in ementas_zip]

In [37]:
# Separação dos conjuntos de característas em treinamento e teste usando sklean
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [38]:
print(len(training))
print(len(testing))

594
199


In [39]:
# Importação das bibliotecas para uso dos modelos

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# treinando o modelo com os dados de treino
model.train(training)

# teste do dataset
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 77.88944723618091


In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Modelos para treinar
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, classifier in models:
    nltk_model = SklearnClassifier(classifier)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 49.246231155778894
Decision Tree Accuracy: 80.40201005025126




Random Forest Accuracy: 80.90452261306532




Logistic Regression Accuracy: 75.87939698492463
SGD Classifier Accuracy: 78.89447236180904
Naive Bayes Accuracy: 71.85929648241206
SVM Linear Accuracy: 77.88944723618091


In [41]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
zipped_models_1 = models[:]
zipped_models_2 = list(models)


nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = zipped_models_2, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 77.88944723618091


In [42]:
# criação de classe de label de predição para conjunto de teste
txt_features_2, labels_2 = list(zip(*testing))
txt_features_1, labels_1 = txt_features_2[:], labels_2[:]
txt_features, labels = list(txt_features_2), list(labels_2)

prediction = nltk_ensemble.classify_many(txt_features)

In [43]:
# matriz de confusão e um relatório de classificação
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['resultado', 'resultado'], ['deferido', 'indeferido']],
    columns = [['previsto', 'previsto'], ['deferido', 'indeferido']])

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       136
           1       0.64      0.70      0.67        63

    accuracy                           0.78       199
   macro avg       0.75      0.76      0.75       199
weighted avg       0.79      0.78      0.78       199



Unnamed: 0_level_0,Unnamed: 1_level_0,previsto,previsto
Unnamed: 0_level_1,Unnamed: 1_level_1,deferido,indeferido
resultado,deferido,111,25
resultado,indeferido,19,44
