In [2]:
import sys
import nltk
import sklearn
import pandas
import numpy

# Report the interpreter and library versions this notebook was run with.
for label, version in (
    ('Python', sys.version),
    ('NLTK', nltk.__version__),
    ('Scikit-learn', sklearn.__version__),
    ('Pandas', pandas.__version__),
    ('Numpy', numpy.__version__),
):
    print('{}: {}'.format(label, version))

Python: 3.7.3 (default, Jun 24 2019, 04:54:02) 
[GCC 9.1.0]
NLTK: 3.4.1
Scikit-learn: 0.21.2
Pandas: 0.24.2
Numpy: 1.16.4


In [3]:
# Carregando dados
import pandas as pd

# Load the tab-separated dataset (no header row, so columns are labelled 0 and 1)
# and discard every row that has a missing value.
df = pd.read_csv('../data/big/dataset_multi.txt', header=None, encoding='utf-8', sep='\t')
df = df.dropna()

# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324772 entries, 0 to 1653761
Data columns (total 2 columns):
0    1324772 non-null object
1    1324772 non-null object
dtypes: object(2)
memory usage: 30.3+ MB
None
            0                                                  1
0      outros  pretensa estadual deduz apel extrem esbarr ved...
1       civil  embarg insist mesm razo recurs apresent oposic...
2  processual  acorda ora recorr nao neg vigenc direit federa...
3      outros  segund jurisprudenc superior tribunal justic a...
4      outros  especial superior tribunal justic decid ser ne...


In [4]:
# Check how many distinct classes the rulings are labelled with.

classes = df[0]
# print(classes)
# Number of distinct (non-null) labels in column 0.
print(classes.nunique())
# classes.value_counts().to_frame().style.bar()


10


In [5]:
from sklearn.preprocessing import LabelEncoder

# Convert the string class labels into integer codes.
# The encoder is fitted on df[0] rather than on `classes` itself: reassigning
# `classes` from its own encoded output made the cell non-idempotent (a second
# run would fit the encoder on integers and lose the label names in
# encoder.classes_).
encoder = LabelEncoder()
classes = encoder.fit_transform(df[0])

print(classes[:10])
len(classes)

[5 2 8 5 5 3 6 5 6 0]


1324772

In [6]:
# Build the collection of ementas (the text held in column 1) and preview it.

ementas = df[1]
print(ementas.head(10))

0    pretensa estadual deduz apel extrem esbarr ved...
1    embarg insist mesm razo recurs apresent oposic...
2    acorda ora recorr nao neg vigenc direit federa...
3    segund jurisprudenc superior tribunal justic a...
4    especial superior tribunal justic decid ser ne...
5    inadmissivel especial interpost fundament art ...
6    inadmissivel exigenc recolh reu prisa requisit...
7    tribunal justic assent compreensa part det leg...
8    dentr limit leg vez caracteriz reincidenc agra...
9    ato administraca reconhec direit correca monet...
Name: 1, dtype: object


In [7]:
# uso de word2vec para a extração de features

from gensim.models import Word2Vec
import os.path

model_file_location = '../data/big/ementas.model'

# Reuse a previously trained model when one is cached on disk; otherwise train
# it from the ementas and cache it for the next run.
if os.path.exists(model_file_location):
    X = Word2Vec.load(model_file_location)
else:
    # Each ementa is split on whitespace to obtain the token lists Word2Vec is
    # trained on. Iterating the Series directly avoids Series.iteritems()
    # (deprecated, removed in pandas 2.0) and the two corpus-sized intermediate
    # lists the original kept alive in the kernel.
    ementas_tokens = [doc.split() for doc in ementas]

    X = Word2Vec(ementas_tokens, min_count=2, window=5, workers=4)
    # Initialise the normalised vectors (model.wv.syn0norm), as the original
    # comment intended.
    # NOTE(review): this only happens on the training branch, not after load() --
    # confirm whether any downstream code relies on the normalised vectors.
    X.wv.init_sims()
    X.save(model_file_location)

print(X)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Word2Vec(vocab=101959, size=100, alpha=0.025)


In [8]:
# funções para calcular a média vetorial das palavras de cada ementa
import numpy as np


def word_averaging(wv, words):
    """Return the normalised mean of the vectors associated with ``words``.

    Parameters
    ----------
    wv : object
        Keyed-vectors-like object exposing ``vocab`` (mapping word -> entry with
        an ``index`` attribute), ``vectors`` (indexable by that index) and
        ``vector_size``.
    words : iterable
        Items to average. ``numpy.ndarray`` items are used as vectors directly;
        other items are looked up in ``wv.vocab`` and silently skipped when absent.

    Returns
    -------
    numpy.ndarray
        float32 array: the mean of the collected vectors passed through
        ``gensim.matutils.unitvec``, or ``wv.vector_size`` zeros when no item
        contributed a vector.
    """
    collected = []
    for word in words:
        if isinstance(word, np.ndarray):
            collected.append(word)
        elif word in wv.vocab:
            collected.append(wv.vectors[wv.vocab[word].index])

    if not collected:
        # FIXME: remove these examples in pre-processing
        # float32 keeps the dtype consistent with the non-empty path, so stacking
        # the results does not silently upcast the whole matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)

    # Imported locally: the original relied on a global name `gensim` that this
    # cell never imports (only a later cell does), i.e. on hidden notebook state.
    from gensim import matutils

    return matutils.unitvec(np.array(collected).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Average every tokenised text in ``text_list`` and stack the results.

    Each element of ``text_list`` is passed to ``word_averaging`` together with
    ``wv``; the resulting vectors are stacked vertically with ``numpy.vstack``,
    one per input text, in input order.
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

In [9]:
# função para tokenizar o texto, descartando tokens com menos de 2 caracteres
import nltk


def w2v_tokenize_text(text):
    """Split ``text`` into word tokens using NLTK's Portuguese tokenizers.

    The text is first split into sentences, then each sentence into words.
    Tokens shorter than two characters are discarded. Returns a flat list of
    the remaining tokens in reading order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='portuguese')
        for token in nltk.word_tokenize(sentence, language='portuguese')
        if len(token) >= 2
    ]

In [None]:
# Like any other supervised machine learning problem, we need to divide data into 
# 20% test set and 80% training set

import gensim
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=0)

# Tokenise the text column (1) directly with Series.apply. The original used a
# row-wise DataFrame.apply(axis=1) that only ever read r[1]; going through the
# column yields the same array of token lists without materialising a Series
# object for each row of a very large frame.
test_tokenized = test[1].apply(w2v_tokenize_text).values
train_tokenized = train[1].apply(w2v_tokenize_text).values

# One averaged word vector per document, using the vectors of the Word2Vec
# model held in X.
X_train = word_averaging_list(X.wv, train_tokenized)
X_test = word_averaging_list(X.wv, test_tokenized)

# Targets are the raw class labels from column 0.
y_train = train[0]
y_test = test[0]
print("train: {},{}".format(len(X_train),len(y_train)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Models to train, declared as (name, estimator) pairs so a label can never
# drift away from the estimator it describes.
# The original kept two parallel lists: `names` had seven entries (including
# "Naive Bayes") but `classifiers` only six, so zip() reported the SVC under the
# name "Naive Bayes" and never printed "SVM Linear".
# NOTE(review): MultinomialNB is imported but was never instantiated. It rejects
# negative feature values, which averaged word vectors presumably contain --
# confirm before adding it back.
models = [
    ("K Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced')),
    ("Logistic Regression", LogisticRegression(solver='lbfgs', multi_class='multinomial')),
    ("SGD Classifier", SGDClassifier(max_iter = 100, tol=1e-3)),
    ("SVM Linear", SVC(kernel = 'linear')),
]

# Kept for backward compatibility with code that reads the parallel lists.
names = [name for name, _ in models]
classifiers = [classifier for _, classifier in models]

# Fit each model on the training features and report its accuracy (in percent)
# on the held-out test set.
for name, classifier in models:
    print('Fitting ' + name + ' classifier. ')
    classifier.fit(X_train, y_train)
    print('done.')

    print('predicting...')
    y_pred = classifier.predict(X_test)
    print('done')
    accuracy = accuracy_score(y_test, y_pred)*100
    print("{} Accuracy: {}".format(name, accuracy))

In [None]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

# Estimators for the ensemble, declared as (name, estimator) pairs.
# The original zipped a seven-entry `names` list (containing "Naive Bayes")
# against six estimators, so the SVC was registered in the VotingClassifier
# under the name "Naive Bayes". The redundant list copies (zipped_models_1/2)
# are gone; `models` is passed directly.
models = [
    ("K Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=1000, random_state=0)),
    ("Logistic Regression", LogisticRegression(solver='lbfgs', multi_class='multinomial')),
    ("SGD Classifier", SGDClassifier(max_iter = 100, tol=1e-3)),
    ("SVM Linear", SVC(kernel = 'linear')),
]

# Kept for backward compatibility with code that reads the parallel lists.
names = [name for name, _ in models]
classifiers = [classifier for _, classifier in models]

# Hard voting: the ensemble predicts the class chosen by the majority of the
# estimators. n_jobs=-1 fits them in parallel on all available cores.
votingClassifier = VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1)
votingClassifier.fit(X_train, y_train)
y_pred = votingClassifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

In [None]:
# relatório de classificação

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# print(confusion_matrix(y_test, y_pred))
# Per-class report followed by the overall accuracy (in percent) for the most
# recently computed y_pred.
report = classification_report(y_test, y_pred)
print(report)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(accuracy_pct)

In [None]:
# usando undersampling
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training set; 'not minority' resamples every class
# except the minority one. The fixed random_state keeps the draw reproducible.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=0)
# fit_resample() fits the sampler itself, so the separate rus.fit() call that
# preceded it in the original was redundant work on the full training matrix.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("train: {},{}".format(len(X_resampled),len(y_resampled)))

# Class distribution after resampling (displayed as the cell's output).
series = list(y_resampled)

nltk.FreqDist(series).most_common(20)

In [None]:
# Fazendo a mesma análise usando undersampling

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Models to train on the under-sampled data, declared as (name, estimator)
# pairs so a label can never drift away from the estimator it describes.
# The original kept two parallel lists: `names` had seven entries (including
# "Naive Bayes") but `classifiers` only six, so zip() reported the SVC under the
# name "Naive Bayes" and never printed "SVM Linear".
models = [
    ("K Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced')),
    ("Logistic Regression", LogisticRegression(solver='lbfgs', multi_class='multinomial')),
    ("SGD Classifier", SGDClassifier(max_iter = 100, tol=1e-3)),
    ("SVM Linear", SVC(kernel = 'linear')),
]

# Kept for backward compatibility with code that reads the parallel lists.
names = [name for name, _ in models]
classifiers = [classifier for _, classifier in models]

# Fit on the resampled training data, evaluate on the untouched test set and
# report accuracy in percent.
for name, classifier in models:
    classifier.fit(X_resampled, y_resampled)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print("{} Accuracy: {}".format(name, accuracy))

In [None]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

# Estimators for the ensemble, declared as (name, estimator) pairs.
# The original zipped a seven-entry `names` list (containing "Naive Bayes")
# against six estimators, so the SVC was registered in the VotingClassifier
# under the name "Naive Bayes". The redundant list copies (zipped_models_1/2)
# are gone; `models` is passed directly.
models = [
    ("K Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced')),
    ("Logistic Regression", LogisticRegression(solver='lbfgs', multi_class='multinomial')),
    ("SGD Classifier", SGDClassifier(max_iter = 100, tol=1e-3)),
    ("SVM Linear", SVC(kernel = 'linear')),
]

# Kept for backward compatibility with code that reads the parallel lists.
names = [name for name, _ in models]
classifiers = [classifier for _, classifier in models]

# Hard voting on the under-sampled training data; evaluation still uses the
# untouched test set. n_jobs=-1 fits the estimators in parallel.
votingClassifier = VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1)
votingClassifier.fit(X_resampled, y_resampled)
y_pred = votingClassifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

In [None]:
# relatório de classificação

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# print(confusion_matrix(y_test, y_pred))
# Per-class report followed by the overall accuracy (in percent) for the most
# recently computed y_pred.
report = classification_report(y_test, y_pred)
print(report)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(accuracy_pct)