In [None]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

In [3]:
# Carregando dados
import pandas as pd
import numpy as np

df = pd.read_csv('data/dataset_tratado.txt', header=None, encoding='utf-8', sep='\t')
df.fillna('x', inplace=True)
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 2 columns):
0    793 non-null bool
1    793 non-null object
dtypes: bool(1), object(1)
memory usage: 7.0+ KB
None
       0                                                  1
0  False  pretensa estadual deduz apel extrem esbarr ved...
1  False  embarg insist mesm razo recurs apresent oposic...
2  False  acorda ora recorr nao neg vigenc direit federa...
3  False  tribunal justic assent compreensa part det leg...
4  False  segund jurisprudenc superior tribunal justic a...


In [4]:
# verificação das classes dos acórdãos

binary = df[0]
print(binary.value_counts())

False    525
True     268
Name: 0, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

# convertendo classes
encoder = LabelEncoder()
binary = encoder.fit_transform(binary)

print(binary[:10])
len(binary)

[0 0 0 0 0 0 1 1 0 0]


793

In [6]:
# criação da lista de ementas
ementas = df[1]
print(ementas[:10])

0    pretensa estadual deduz apel extrem esbarr ved...
1    embarg insist mesm razo recurs apresent oposic...
2    acorda ora recorr nao neg vigenc direit federa...
3    tribunal justic assent compreensa part det leg...
4    segund jurisprudenc superior tribunal justic a...
5    sobrest recurs especial ate julgament recurs e...
6    inadmissivel exigenc recolh reu prisa requisit...
7    dentr limit leg vez caracteriz reincidenc agra...
8    sobr prescrica aca repetica indebit tributari ...
9    tratas agrav instrument interpost contr decisa...
Name: 1, dtype: object


In [7]:
# uso de word2vec para a extração de features

from gensim.models import Word2Vec

ementas_list = [word for word in ementas.iteritems()]
ementas_list_list = []
for doc in ementas_list:
    ementas_list_list.append(doc[1].split())

X = Word2Vec(ementas_list_list, min_count=2)
X.wv.init_sims() # para inicializar model.vw.syn0norm, necessário para o cálculo da média da
print(X)

Word2Vec(vocab=3183, size=100, alpha=0.025)


In [11]:
# funções para calcular a média vetorial das palavras de cada ementa

def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.vectors[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        print("cannot compute similarity with no input %s", words)
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [12]:
# função para tokenizar as palavras com frequência maior que 2

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [13]:
# Like any other supervised machine learning problem, we need to divide data into 
# 20% test set and 80% training set

import gensim
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


train, test = train_test_split(df, test_size=0.2, random_state=0)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r[1]), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r[1]), axis=1).values

X_train = word_averaging_list(X.wv,train_tokenized)
X_test = word_averaging_list(X.wv,test_tokenized)

y_train = train[0]
y_test = test[0]

print(X_train)
print(X_test)

cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
cannot compute similarity with no input %s []
[[ 0.155413   -0.05996682 -0.13512392 ... -0.10904473 -0.01001627
  -0.04182275]
 [ 0.1553461  -0.06026132 -0.13671583 ... -0.10850698 -0.01447462
  -0.04481162]
 [ 0.15235

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Modelos para treinar
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0),
    LogisticRegression(solver='lbfgs'),
    SGDClassifier(max_iter = 100),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, classifier in models:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 73.58490566037736
Decision Tree Accuracy: 65.40880503144653
Random Forest Accuracy: 76.10062893081762
Logistic Regression Accuracy: 62.893081761006286
SGD Classifier Accuracy: 62.893081761006286
Naive Bayes Accuracy: 62.893081761006286


In [84]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0),
    LogisticRegression(solver='lbfgs'),
    SGDClassifier(max_iter = 100),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
zipped_models_1 = models[:]
zipped_models_2 = list(models)


votingClassifier = VotingClassifier(estimators = zipped_models_2, voting = 'hard', n_jobs = -1)
votingClassifier.fit(X_train, y_train)
y_pred = votingClassifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 66.6666666667


In [86]:
# relatório de classificação

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print((accuracy_score(y_test, y_pred)*100))

[[93  7]
 [46 13]]
              precision    recall  f1-score   support

       False       0.67      0.93      0.78       100
        True       0.65      0.22      0.33        59

   micro avg       0.67      0.67      0.67       159
   macro avg       0.66      0.58      0.55       159
weighted avg       0.66      0.67      0.61       159

66.66666666666666
