In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 2.7.15+ (default, Nov 27 2018, 23:36:35) 
[GCC 7.3.0]
NLTK: 3.2.5
Scikit-learn: 0.20.3
Pandas: 0.24.2
Numpy: 1.14.1


In [3]:
# Carregando dados
import pandas as pd
import numpy as np

df = pd.read_csv('data/dataset_tratado.txt', header=None, encoding='utf-8', sep='\t')
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 2 columns):
0    793 non-null bool
1    793 non-null object
dtypes: bool(1), object(1)
memory usage: 7.0+ KB
None
       0                                                  1
0  False  mand segur inexecu contrat sançã afast reexam ...
1  False  respons civil acident aér decret lei n dol eve...
2  False  processual civil intim art cpc irregular compr...
3  False  process civil honorári execu legitim part supe...
4  False  administr servidor públic artig lei n artig le...


In [4]:
# verificação das classes dos acórdãos
classes = df[0]
print(classes.value_counts())

False    525
True     268
Name: 0, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

# convertendo classes em binário
encoder = LabelEncoder()
binary = encoder.fit_transform(classes)

print(binary[:10])

[0 0 0 0 0 0 1 1 0 0]


In [6]:
# criação da lista de ementas
ementas = df[1]
print(ementas[:10])

0    mand segur inexecu contrat sançã afast reexam ...
1    respons civil acident aér decret lei n dol eve...
2    processual civil intim art cpc irregular compr...
3    process civil honorári execu legitim part supe...
4    administr servidor públic artig lei n artig le...
5    impost sobr servic iss arrend mercantil incide...
6    petiçã receb hab corpus processual penal art c...
7    penal process penal recurs especial art incis ...
8    tribut sujeit lançament homolog praz prescrici...
9    transport intermunicipal licit açã anulatór in...
Name: 1, dtype: object


In [7]:
from nltk.tokenize import word_tokenize

# criação do BOW (bag of words)
all_words = []

for ementa in ementas:
    words = word_tokenize(ementa)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [8]:
# Exibição do número total de palavras e as 15 mais comuns
print('Número de palavras: {}'.format(len(all_words)))
print('Palavras mais comuns: {}'.format(all_words.most_common(15)))

Número de palavras: 4549
Palavras mais comuns: [(u'recurs', 1056), (u'agrav', 1018), (u'especial', 871), (u'lei', 870), (u'embarg', 847), (u'art', 812), (u'n', 796), (u'regimental', 641), (u'\xe9', 641), (u's\xfamul', 590), (u'civil', 547), (u'tribunal', 482), (u'stj', 462), (u'declar', 456), (u'ser', 420)]


In [9]:
# Uso das 1500 palavras mais comuns como características
word_features = list(all_words.keys())[:1500]

In [10]:
# Função determina quais das 1500 palavras características estão contidas nas ementas

def find_features(ementa):
    words = word_tokenize(ementa)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Por exemplo
features = find_features(ementas[0])
for key, value in features.items():
    if value == True:
        print(key)

aut
med
total
conjunt
necessári
contratu
cont
ii
estadual
deslind
extrem
apel
controvérs
vedaçõ


In [12]:
# Aplicando à todas as ementas
ementas_zip = list(zip(ementas, binary))
zipped_list_1 = ementas_zip[:]
zipped_list_2 = list(ementas_zip) 

# embaralhando lista
seed = 1
np.random.seed = seed
np.random.shuffle(zipped_list_2)

# chamando a função find_features para cada ementa
featuresets = [(find_features(text), label) for (text, label) in ementas_zip]

In [13]:
# Separação dos conjuntos de característas em treinamento e teste usando sklean
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [14]:
print(len(training))
print(len(testing))

594
199


In [15]:
# Importação das bibliotecas para uso dos modelos

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# treinando o modelo com os dados de treino
model.train(training)

# teste do dataset
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 69.8492462312


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Modelos para treinar
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, classifier in models:
    nltk_model = SklearnClassifier(classifier)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 60.3015075377
Decision Tree Accuracy: 78.391959799




Random Forest Accuracy: 72.864321608




Logistic Regression Accuracy: 72.3618090452




SGD Classifier Accuracy: 72.864321608
Naive Bayes Accuracy: 71.3567839196
SVM Linear Accuracy: 69.8492462312


In [17]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
zipped_models_1 = models[:]
zipped_models_2 = list(models)


nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = zipped_models_2, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 69.8492462312


In [18]:
# criação de classe de label de predição para conjunto de teste
txt_features_2, labels_2 = list(zip(*testing))
txt_features_1, labels_1 = txt_features_2[:], labels_2[:]
txt_features, labels = list(txt_features_2), list(labels_2)

prediction = nltk_ensemble.classify_many(txt_features)

In [19]:
# matriz de confusão e um relatório de classificação
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['resultado', 'resultado'], ['deferido', 'indeferido']],
    columns = [['previsto', 'previsto'], ['deferido', 'indeferido']])

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       136
           1       0.59      0.48      0.53        63

   micro avg       0.73      0.73      0.73       199
   macro avg       0.68      0.66      0.67       199
weighted avg       0.72      0.73      0.72       199



Unnamed: 0_level_0,Unnamed: 1_level_0,previsto,previsto
Unnamed: 0_level_1,Unnamed: 1_level_1,deferido,indeferido
resultado,deferido,115,21
resultado,indeferido,33,30
