In [2]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.6.3 (default, Oct  3 2017, 21:45:48) 
[GCC 7.2.0]
NLTK: 3.4.4
Scikit-learn: 0.21.3
Pandas: 0.25.0
Numpy: 1.17.0


In [3]:
# Carregando dados
import pandas as pd
import numpy as np

df = pd.read_csv('../../data/big/dataset_multi.txt', header=None, encoding='utf-8', sep='\t')
df = df.dropna()
df = df.groupby(by=0).apply(lambda x: x.sample(df.groupby(by=0).size().min()))
df = df.sample(frac=1)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9740 entries, (outros, 1484423) to (outros, 1535418)
Data columns (total 2 columns):
0    9740 non-null object
1    9740 non-null object
dtypes: object(2)
memory usage: 256.9+ KB
None
                                     0  \
0                                        
outros         1484423          outros   
constitucional 688167   constitucional   
tributário     637931       tributário   
ambiental      1098559       ambiental   
administrativo 20853    administrativo   

                                                                        1  
0                                                                          
outros         1484423  nao merec ser prov agrav instrument nao conseg...  
constitucional 688167   entend turm sent ser cabivel antecipaca tutel ...  
tributário     637931   mand seguranc objetiv compens integral valor r...  
ambiental      1098559  atu magistr investigaca criminal orga garant l...  
administrativo 

In [4]:
# verificação das classes dos acórdãos
classes = df[0]
print(len(classes.value_counts()))
classes.value_counts().to_frame().style.bar()

10


Unnamed: 0,0
civil,974
outros,974
ambiental,974
constitucional,974
tributário,974
previdenciário,974
processual,974
administrativo,974
penal,974
internacional,974


In [5]:
from sklearn.preprocessing import LabelEncoder

# convertendo classes em binário
encoder = LabelEncoder()
binary = encoder.fit_transform(classes)

print(binary[:10])

[5 3 9 1 0 0 1 9 6 1]


In [6]:
# criação da lista de ementas
ementas = df[1]
print(ementas[:10])

outros          1484423    nao merec ser prov agrav instrument nao conseg...
constitucional  688167     entend turm sent ser cabivel antecipaca tutel ...
tributário      637931     mand seguranc objetiv compens integral valor r...
ambiental       1098559    atu magistr investigaca criminal orga garant l...
administrativo  20853      ausent question previ disposit leg cuj violaca...
                268870     med concurs logr exit impetr ora apel prev vag...
ambiental       959058     pesc lug interdit orga competent apetrech nao ...
tributário      1072201    pretend agrav recurs modific decisa deix conde...
penal           298797     encerr instruca criminal ja tend sid inquir te...
ambiental       925454     compet justic federal process julg crim manute...
Name: 1, dtype: object


In [7]:
from nltk.tokenize import word_tokenize

# criação do BOW (bag of words)
all_words = []

for ementa in ementas:
    words = word_tokenize(ementa)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [8]:
# Exibição do número total de palavras e as 15 mais comuns
print('Número de palavras: {}'.format(len(all_words)))
print('Palavras mais comuns: {}'.format(all_words.most_common(15)))

Número de palavras: 16910
Palavras mais comuns: [('nao', 18870), ('art', 9155), ('lei', 8795), ('n', 8317), ('recurs', 5642), ('ser', 5557), ('prov', 4984), ('dev', 4842), ('federal', 3841), ('part', 3630), ('agrav', 3354), ('direit', 3280), ('pod', 3235), ('public', 3233), ('conhec', 3006)]


In [9]:
# Uso das 1500 palavras mais comuns como características
word_features = list(all_words.keys())[:1500]

In [10]:
# Função determina quais das 1500 palavras características estão contidas nas ementas

def find_features(ementa):
    words = word_tokenize(ementa)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Por exemplo
features = find_features(ementas[0])
for key, value in features.items():
    if value == True:
        print(key)

nao
merec
ser
prov
agrav
instrument
conseg
infirm
fundament
despach
denegatori
process
recurs
revist
conhec
desprov


In [11]:
# Aplicando à todas as ementas
ementas_zip = list(zip(ementas, binary))
zipped_list_1 = ementas_zip[:]
zipped_list_2 = list(ementas_zip) 

# embaralhando lista
seed = 1
np.random.seed = seed
np.random.shuffle(zipped_list_2)

# chamando a função find_features para cada ementa
featuresets = [(find_features(text), label) for (text, label) in ementas_zip]

In [12]:
# Separação dos conjuntos de característas em treinamento e teste usando sklean
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [13]:
print(len(training))
print(len(testing))

7305
2435


In [14]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Modelos para treinar
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced'),
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    SGDClassifier(max_iter = 100, tol=1e-3),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, classifier in models:
    nltk_model = SklearnClassifier(classifier)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 28.952772073921974
Decision Tree Accuracy: 47.80287474332649
Random Forest Accuracy: 62.83367556468173




Logistic Regression Accuracy: 57.57700205338809
SGD Classifier Accuracy: 53.38809034907598
Naive Bayes Accuracy: 55.93429158110883
SVM Linear Accuracy: 53.55236139630391


In [15]:
# Classificador por votos
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced'),
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    SGDClassifier(max_iter = 100, tol=1e-3),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
zipped_models_1 = models[:]
zipped_models_2 = list(models)


nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = zipped_models_2, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 53.55236139630391


In [16]:
# criação de classe de label de predição para conjunto de teste

txt_features_2, labels_2 = list(zip(*testing))
txt_features_1, labels_1 = txt_features_2[:], labels_2[:]
txt_features, labels = list(txt_features_2), list(labels_2)

prediction = nltk_ensemble.classify_many(txt_features)

In [17]:
# relatório de classificação

print(confusion_matrix(y_test,y_pred))
print(classification_report(labels, prediction))

              precision    recall  f1-score   support

           0       0.42      0.47      0.44       245
           1       0.77      0.81      0.79       263
           2       0.50      0.49      0.49       242
           3       0.59      0.44      0.50       252
           4       0.82      0.77      0.79       257
           5       0.60      0.69      0.64       250
           6       0.66      0.70      0.68       221
           7       0.67      0.71      0.69       240
           8       0.35      0.31      0.33       212
           9       0.58      0.57      0.58       253

    accuracy                           0.60      2435
   macro avg       0.60      0.60      0.59      2435
weighted avg       0.60      0.60      0.60      2435

53.55236139630391
