# Probabilistic approach - Bayes' theorem
### This algorithm takes the original dataset and transforms it into a table of probabilities; it then uses that table of probabilities to classify the data.

### Examples
> Spam filters

> Mining emotions

> Separation of documents

### Laplacian correction
> Some of the probabilities computed from the database are often equal to 0, and a zero probability can cause problems at classification time, because it zeroes out the whole product of probabilities. The solution is to apply a Laplacian correction, which replaces these zero probabilities with small non-zero values so that there is no error during the execution of the algorithm.

In [1]:
import pandas as pd
import numpy as np

# Using the naive Bayes algorithm on the credit risk database

In [2]:
# Load the credit risk dataset from a CSV file into a DataFrame.
dataframe = pd.read_csv('./datasets/risco_credito.csv')
# Bare expression on the last line so the notebook displays the table.
dataframe

Unnamed: 0,historia,divida,garantias,renda,c#risco
0,ruim,alta,nenhuma,0_15,alto
1,desconhecida,alta,nenhuma,15_35,alto
2,desconhecida,baixa,nenhuma,15_35,moderado
3,desconhecida,baixa,nenhuma,acima_35,alto
4,desconhecida,baixa,nenhuma,acima_35,baixo
5,desconhecida,baixa,adequada,acima_35,baixo
6,ruim,baixa,nenhuma,0_15,alto
7,ruim,baixa,adequada,acima_35,moderado
8,boa,baixa,nenhuma,acima_35,baixo
9,boa,alta,adequada,acima_35,baixo


### Now we are going to use naive Bayes with the pre-processing techniques that were previously studied

In [3]:
from sklearn.preprocessing import LabelEncoder

# Kept as a module-level name for backward compatibility; the encoders that
# are actually fitted are stored per column in `encoders` below.
labelencoder = LabelEncoder()

# The first four columns go to the feature matrix, the fifth is the class column.
predicts = dataframe.iloc[:, 0:4].values
classes = dataframe.iloc[:, 4].values

# Indices of the columns of `predicts` that must be label-encoded.
vet = [0, 1, 2, 3]

# Fitted encoder for each encoded column index. Keeping one encoder per column
# preserves every column's string -> integer mapping (a single shared encoder
# that is refit for each column only remembers the last one), so new samples
# can be encoded with `encoders[i].transform(...)` instead of by hand.
encoders = {}


def LabelEncoderAll(vet):
    """Label-encode the given columns of the module-level ``predicts`` in place.

    Parameters
    ----------
    vet : iterable of int
        Column indices of ``predicts`` to encode.

    Side effects
    ------------
    Overwrites each listed column of ``predicts`` with its integer codes and
    stores the fitted encoder for column ``i`` in ``encoders[i]``.
    """
    for i in vet:
        encoder = LabelEncoder()
        predicts[:, i] = encoder.fit_transform(predicts[:, i])
        encoders[i] = encoder


LabelEncoderAll(vet)

print(predicts)
print(classes)

[[2 0 1 0]
 [1 0 1 1]
 [1 1 1 1]
 [1 1 1 2]
 [1 1 1 2]
 [1 1 0 2]
 [2 1 1 0]
 [2 1 0 2]
 [0 1 1 2]
 [0 0 0 2]
 [0 0 1 0]
 [0 0 1 1]
 [0 0 1 2]
 [2 0 1 1]]
['alto' 'alto' 'moderado' 'alto' 'baixo' 'baixo' 'alto' 'moderado' 'baixo'
 'baixo' 'alto' 'moderado' 'baixo' 'alto']


In [4]:
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian naive Bayes classifier and fit it on the encoded
# feature matrix (predicts) and the class labels (classes).
classifie = GaussianNB()
classifie.fit(predicts, classes)

GaussianNB()

In [5]:
# Classify two hand-encoded samples with the fitted model.
# NOTE(review): the second sample starts with 3, while the encoded first
# column printed by the encoding cell only contains 0-2 — confirm that this
# input is intended.
result = classifie.predict([[0,0,1,2], [3, 0, 0, 0]])
print(result)

# Inspect the fitted model: class labels, number of training samples seen
# per class, and the class prior probabilities.
print(classifie.classes_)
print(classifie.class_count_)
print(classifie.class_prior_)

['baixo' 'moderado']
['alto' 'baixo' 'moderado']
[6. 5. 3.]
[0.42857143 0.35714286 0.21428571]


# Using the naive Bayes algorithm on the credit database

In [11]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [12]:
# Load the credit dataset from a CSV file into a DataFrame.
dataframe = pd.read_csv('./datasets/credit_data.csv')
# Bare expression on the last line so the notebook displays the table.
dataframe

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [13]:
# Rows whose age is negative.
negative_age_rows = dataframe[dataframe['age'] < 0]
print(negative_age_rows)

# Rows whose age is missing (NaN).
missing_age_rows = dataframe[dataframe['age'].isna()]
print(missing_age_rows)

    clientid        income        age         loan  default
15        16  50501.726689 -28.218361  3977.287432        0
21        22  32197.620701 -52.423280  4244.057136        0
26        27  63287.038908 -36.496976  9595.286289        0
    clientid        income  age         loan  default
28        29  59417.805406  NaN  2082.625938        0
30        31  48528.852796  NaN  6155.784670        0
31        32  23526.302555  NaN  2862.010139        0


In [14]:
# Mean age computed only from the rows with a positive age.
positive_ages = dataframe.loc[dataframe['age'] > 0, 'age']
mean_df = positive_ages.mean()
print(mean_df)

# Overwrite every negative age with that mean.
negative_age_mask = dataframe['age'] < 0
dataframe.loc[negative_age_mask, 'age'] = mean_df

40.92770044906149


In [15]:
# Columns at positions 1-3 become the feature matrix; the column at
# position 4 becomes the class vector.
predicts = dataframe.iloc[:, 1:4].values
classes = dataframe.iloc[:, 4].values

In [16]:
# Replace every NaN in the feature matrix with the mean of its column.
# predicts is built from iloc[:, 1:4] and so has three columns; the 0:4 slice
# therefore selects all of them (an out-of-range stop is clamped).
feature_columns = slice(0, 4)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
predicts[:, feature_columns] = imputer.fit_transform(predicts[:, feature_columns])

In [17]:
# Print the rows that still have a negative age.
remaining_negative_ages = dataframe[dataframe['age'] < 0]
print(remaining_negative_ages)

# Positions of any NaN left in predicts, as one tuple of indices per axis;
# empty tuples mean that no NaN was found.
nan_positions = np.where(np.isnan(predicts))
[tuple(axis_indices) for axis_indices in nan_positions]

Empty DataFrame
Columns: [clientid, income, age, loan, default]
Index: []


[(), ()]

In [18]:
# Standardize the feature columns with StandardScaler.fit_transform.
# StandardScaler is referenced through the name imported by
# `from sklearn.preprocessing import StandardScaler`; the `preprocessing`
# module itself is never imported here, so `preprocessing.StandardScaler()`
# would raise NameError.
scaler = StandardScaler()
predicts = scaler.fit_transform(predicts)
print(predicts)

[[ 1.45393393  1.36538005  1.20281942]
 [-0.76217555  0.54265932  0.69642695]
 [ 0.83682073  1.67417101  1.17471147]
 ...
 [-0.07122592 -0.97448606  0.35420081]
 [-0.11000289  1.73936652 -0.92675625]
 [ 1.682986    1.14917551  0.96381038]]


In [19]:
# Hold out 25% of the rows for testing; random_state=0 makes the split reproducible.
split_parts = train_test_split(predicts, classes, test_size=0.25, random_state=0)
predict_training, predict_test, classes_training, classes_test = split_parts

# Shapes of the four resulting arrays, printed on a single line.
print(*(part.shape for part in split_parts))

(1500, 3) (500, 3) (1500,) (500,)


In [20]:
# Create a new Gaussian naive Bayes classifier and fit it on the training split only.
classifie = GaussianNB()
classifie.fit(predict_training, classes_training)

GaussianNB()

In [21]:
# Predict the class of every row in the held-out test split.
prevision = classifie.predict(predict_test)
# Bare expression on the last line so the notebook displays the predictions.
prevision

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,