### Importando bibliotecas

In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pip install pgmpy
import pgmpy
import sklearn
from datetime import date

Note: you may need to restart the kernel to use updated packages.




### Definindo o Dataframe

In [134]:
df = pd.read_csv('republican_democrat.csv', sep = ',')
print(df)

    handicapped-infants water-project-cost-sharing  \
0                     n                          y   
1                     n                          y   
2                     ?                          y   
3                     n                          y   
4                     y                          y   
..                  ...                        ...   
410                   ?                          ?   
411                   y                          n   
412                   y                          y   
413                   y                          y   
414                   n                          y   

    adoption-of-the-budget-resolution physician-fee-freeze  \
0                                   n                    y   
1                                   n                    y   
2                                   y                    ?   
3                                   y                    n   
4                                   y    

### Tratamento de dados
Para melhor manipulação, vamos transformar os 'x' do dataframe em 0, os 'y', em 1, e '?', em 2:

In [135]:
def bool(x):
    if x == '?':
        return 2
    elif x == 'y':
        return 1
    elif x == 'n':
        return 0

# vamos aplicar a todas as colunas exceto 'Target', que não segue a tricotoima n, y e ?:
for col in list(df.drop(['Target'], axis=1)):
    df[col] = df[col].apply(bool)

df

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,Target
0,0,1,0,1,1,0,0,1,2,1,1,1,0,1,republican
1,0,1,0,1,1,0,0,0,0,1,1,1,0,2,republican
2,2,1,1,2,1,0,0,0,1,0,1,1,0,0,democrat
3,0,1,1,0,1,0,0,0,1,0,1,0,0,1,democrat
4,1,1,1,0,1,0,0,0,1,2,1,1,1,1,democrat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,2,2,2,0,0,1,1,1,0,0,1,0,1,1,democrat
411,1,0,1,0,0,1,1,1,0,1,0,2,1,1,democrat
412,1,1,1,0,0,1,1,1,1,0,1,0,0,1,democrat
413,1,1,1,0,0,1,0,1,0,0,0,0,0,1,democrat



### Construção da Rede Bayesiana
Utilizaremos um modelo que assume que os classificadores utilizam distribuições gaussianas.

In [136]:
from sklearn.naive_bayes import GaussianNB

target = df['Target']
inputs = df.drop(['Target'], axis=1)

model = GaussianNB()
model.fit(inputs, target)


GaussianNB()

### Teste da eficácia do modelo:

Construção do dataframe de teste:

In [137]:
df_test = pd.read_csv('republican_democrat_test.csv', sep = ',')


for col in list(df_test.drop(['ID'], axis=1)):
    df_test[col] = df_test[col].apply(bool)

df_test['Prediction'] = model.predict(df_test.drop(['ID'], axis=1))

df_test


Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,ID,Prediction
0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,101,democrat
1,1,1,0,1,1,0,0,1,0,0,1,1,0,1,102,republican
2,0,1,1,0,1,1,2,1,0,0,0,0,0,1,103,democrat
3,1,1,1,1,1,1,0,1,0,0,1,1,0,1,104,republican
4,1,1,1,0,0,1,1,0,0,0,0,0,0,1,105,democrat
5,0,0,0,1,1,1,0,1,0,1,1,1,0,1,106,republican
6,0,1,1,0,1,1,0,0,1,0,1,0,1,1,107,democrat
7,0,0,0,1,1,0,0,0,0,1,1,1,0,0,108,republican
8,0,0,1,0,1,1,1,0,1,0,0,0,1,1,109,democrat
9,0,0,0,1,1,0,0,0,1,1,1,1,0,1,110,republican


Teste da eficácia do modelo:

In [139]:
# Retorna uma matriz com 2 colunas, onde a primeira se refere à probabilidade da pessoa da linha do dataframe ser democrata, e a segunda, ser republicana
proba = model.predict_proba(df_test.drop(['Prediction', 'ID'], axis=1))

# Queremos a 2a probabilidade - ser republicano
proba = [p[1] for p in proba]

df_test['Republican Probability'] = proba

df_test

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,ID,Prediction,Republican Probability
0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,101,democrat,9.171649e-16
1,1,1,0,1,1,0,0,1,0,0,1,1,0,1,102,republican,0.9999199
2,0,1,1,0,1,1,2,1,0,0,0,0,0,1,103,democrat,2.822453e-16
3,1,1,1,1,1,1,0,1,0,0,1,1,0,1,104,republican,0.9214836
4,1,1,1,0,0,1,1,0,0,0,0,0,0,1,105,democrat,9.171649e-16
5,0,0,0,1,1,1,0,1,0,1,1,1,0,1,106,republican,0.9999953
6,0,1,1,0,1,1,0,0,1,0,1,0,1,1,107,democrat,7.996017e-12
7,0,0,0,1,1,0,0,0,0,1,1,1,0,0,108,republican,0.9999999
8,0,0,1,0,1,1,1,0,1,0,0,0,1,1,109,democrat,1.400789e-14
9,0,0,0,1,1,0,0,0,1,1,1,1,0,1,110,republican,0.9999975


### Criação do Ranking
Os ID's serão ranqueados em ordem decrescente por probabilidade de ser republicano

In [164]:
df_test['ID'] = df_test['ID'].astype(str)

ranking = df_test[['ID', 'Republican Probability']].to_numpy()

ranking = sorted(ranking, key = lambda x: x[1])

ranking = [r[0] for r in ranking]

for index, r in enumerate(ranking):
    print("%d° - %s" % (index + 1, r))

1° - 117
2° - 103
3° - 101
4° - 105
5° - 119
6° - 115
7° - 109
8° - 107
9° - 111
10° - 113
11° - 120
12° - 104
13° - 114
14° - 116
15° - 102
16° - 106
17° - 110
18° - 118
19° - 112
20° - 108
