# Probabilistic approach - Bayes' theorem
### This algorithm will take the original database and transform it into a table of probabilistic data, so from that table of probabilities he does the data classification.

### Examples
> Spam filters

> Mining emotions

> Separation of documents

### Laplacian correction
> Usually in the database the probabilities that have a value equal to 0, the fact that this probability value is 0 can cause problems at the time of classification. The solution would be to make a laser correction that will replace these zeroed probabilities with real values so that there is no error during the execution of the algorithm.

In [1]:
import pandas as pd
import numpy as np

# Using the naive bayes algortimo in the credit risk database

In [2]:
dataframe = pd.read_csv('./datasets/risco_credito.csv')
dataframe                    

Unnamed: 0,historia,divida,garantias,renda,c#risco
0,ruim,alta,nenhuma,0_15,alto
1,desconhecida,alta,nenhuma,15_35,alto
2,desconhecida,baixa,nenhuma,15_35,moderado
3,desconhecida,baixa,nenhuma,acima_35,alto
4,desconhecida,baixa,nenhuma,acima_35,baixo
5,desconhecida,baixa,adequada,acima_35,baixo
6,ruim,baixa,nenhuma,0_15,alto
7,ruim,baixa,adequada,acima_35,moderado
8,boa,baixa,nenhuma,acima_35,baixo
9,boa,alta,adequada,acima_35,baixo


### Now we are going to use naive byes with the pre-processing techniques that were previously studied

In [3]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

predicts = dataframe.iloc[:, 0:4].values
classes = dataframe.iloc[:, 4].values

# predict[:,0] = labelencoder.fit_transform(predict[:,0])
# predict[:,1] = labelencoder.fit_transform(predict[:,1])
# predict[:,2] = labelencoder.fit_transform(predict[:,2])
# predict[:,3] = labelencoder.fit_transform(predict[:,3])

vet = [0,1,2,3]

def LabelEncoderAll(vet):
    for i in vet:
        predicts[:, i] = labelencoder.fit_transform(predicts[:, i])
        
LabelEncoderAll(vet)

print(predicts)
print(classes)

[[2 0 1 0]
 [1 0 1 1]
 [1 1 1 1]
 [1 1 1 2]
 [1 1 1 2]
 [1 1 0 2]
 [2 1 1 0]
 [2 1 0 2]
 [0 1 1 2]
 [0 0 0 2]
 [0 0 1 0]
 [0 0 1 1]
 [0 0 1 2]
 [2 0 1 1]]
['alto' 'alto' 'moderado' 'alto' 'baixo' 'baixo' 'alto' 'moderado' 'baixo'
 'baixo' 'alto' 'moderado' 'baixo' 'alto']


In [4]:
from sklearn.naive_bayes import GaussianNB
classifie = GaussianNB()
classifie.fit(predicts, classes)

GaussianNB()

In [5]:
result = classifie.predict([[0,0,1,2], [3, 0, 0, 0]])
print(result)

print(classifie.classes_)
print(classifie.class_count_)
print(classifie.class_prior_)

['baixo' 'moderado']
['alto' 'baixo' 'moderado']
[6. 5. 3.]
[0.42857143 0.35714286 0.21428571]


# Using the naive bayes algortimo in the credit database

In [24]:
# Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [25]:
#loading the dataset
dataframe_credit = pd.read_csv('./datasets/credit_data.csv')
dataframe_credit

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [26]:
#Showing inconsistencies in the dataset
print(dataframe_credit.loc[dataframe_credit['age'] < 0])
print(dataframe_credit.loc[pd.isnull(dataframe_credit['age'])])

    clientid        income        age         loan  default
15        16  50501.726689 -28.218361  3977.287432        0
21        22  32197.620701 -52.423280  4244.057136        0
26        27  63287.038908 -36.496976  9595.286289        0
    clientid        income  age         loan  default
28        29  59417.805406  NaN  2082.625938        0
30        31  48528.852796  NaN  6155.784670        0
31        32  23526.302555  NaN  2862.010139        0


In [27]:
# Replacing inconsistent values
mean_df = dataframe_credit['age'][dataframe_credit.age > 0].mean()
print(mean_df)

dataframe_credit.loc[dataframe_credit.age < 0, 'age'] = mean_df

40.92770044906149


In [28]:
# Separating the prediction data from the classes.
predicts = dataframe_credit.iloc[:, 1:4].values
classes = dataframe_credit.iloc[:, 4].values

In [29]:
# Replacing inconsistent values
imputer = SimpleImputer(missing_values = np.nan,strategy='mean')
imputer = imputer.fit(predicts[:, 0:4])
predicts[:, 0:4] = imputer.transform(predicts[:, 0:4])

In [30]:
# Showing that there is no more NaN data.
print(dataframe.loc[dataframe['age'] < 0])
list(map(tuple, np.where(np.isnan(predicts))))

Empty DataFrame
Columns: [clientid, income, age, loan, default]
Index: []


[(), ()]

In [31]:
# Standardization / Standardization of data.
scaler = StandardScaler()
predicts = scaler.fit_transform(predicts)
print(predicts)

[[ 1.45393393  1.36538005  1.20281942]
 [-0.76217555  0.54265932  0.69642695]
 [ 0.83682073  1.67417101  1.17471147]
 ...
 [-0.07122592 -0.97448606  0.35420081]
 [-0.11000289  1.73936652 -0.92675625]
 [ 1.682986    1.14917551  0.96381038]]


In [32]:
# Separating data for training and testing.
predict_training, predict_test, classes_training, classes_test = train_test_split(predicts, classes, test_size=0.25, random_state=0)
print(predict_training.shape, predict_test.shape, classes_training.shape, classes_test.shape)

(1500, 3) (500, 3) (1500,) (500,)


In [33]:
# Using Naive Bayes
classifie = GaussianNB()
classifie.fit(predict_training, classes_training)

GaussianNB()

In [34]:
# Predicting test data.
prevision = classifie.predict(predict_test)
prevision

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [35]:
# confusion_matrix and accuracy_score will help to see the accuracy of the algorithm in that database.
from sklearn.metrics import confusion_matrix, accuracy_score

In [36]:
precision = accuracy_score(classes_test, prevision)
precision

0.938

In [37]:
matrix = confusion_matrix(classes_test, prevision)
matrix

array([[428,   8],
       [ 23,  41]], dtype=int64)

# Using the naive bayes algortimo in the census database

In [57]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [58]:
#loading the dataset
dataframe_census = pd.read_csv('./datasets/census.csv')
dataframe_census

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [60]:
# Separating the prediction data from the classes.
predicts = dataframe_census.iloc[:, 0:14].values
classes = dataframe_census.iloc[:, 14].values

#Showing prediction data and classes before treatment.
print('-------------- predict --------------')
print(predicts)
print('-------------- classes --------------')
print(classes)

-------------- predict --------------
[[39 ' State-gov' 77516 ... 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ... 0 13 ' United-States']
 [38 ' Private' 215646 ... 0 40 ' United-States']
 ...
 [58 ' Private' 151910 ... 0 40 ' United-States']
 [22 ' Private' 201490 ... 0 20 ' United-States']
 [52 ' Self-emp-inc' 287927 ... 0 40 ' United-States']]
-------------- classes --------------
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


In [61]:
#Treating the data, transforming categorical variables into numeric ones.
column_tranformer = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],remainder='passthrough')
predicts = column_tranformer.fit_transform(predicts).toarray()

labelencoder_classes = LabelEncoder()
classes = labelencoder_classes.fit_transform(classes)

scaler = StandardScaler()
predict = scaler.fit_transform(predicts)

#Showing prediction data and classes after treatment.
print('-------------- predict --------------')
print(predicts)
print('-------------- classes --------------')
print(classes)

-------------- predict --------------
[[0.0000e+00 0.0000e+00 0.0000e+00 ... 2.1740e+03 0.0000e+00 4.0000e+01]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 1.3000e+01]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 4.0000e+01]
 ...
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 4.0000e+01]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 2.0000e+01]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 1.5024e+04 0.0000e+00 4.0000e+01]]
-------------- classes --------------
[0 0 0 ... 0 0 1]


In [52]:
# Separating data for training and testing.
predict_training, predict_test, classes_training, classes_test = train_test_split(predict, classes, test_size=0.15, random_state=0)
print(predict_training.shape, predict_test.shape, classes_training.shape, classes_test.shape)

(27676, 108) (4885, 108) (27676,) (4885,)


In [53]:
# Using Naive Bayes
classifie = GaussianNB()
classifie.fit(predict_training, classes_training)

GaussianNB()

In [62]:
# Predicting test data.
prevision = classifie.predict(predict_test)
prevision

array([1, 1, 1, ..., 0, 1, 1])

In [64]:
# confusion_matrix and accuracy_score will help to see the accuracy of the algorithm in that database.
precision = accuracy_score(classes_test, prevision)
print(precision)

matrix = confusion_matrix(classes_test, prevision)
print(matrix)

0.4767656090071648
[[1172 2521]
 [  35 1157]]


# Using the naive bayes algortimo in the census database without pre processing

In [13]:
# Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [14]:
#loading the dataset
dataframe_census = pd.read_csv('./datasets/census.csv')
dataframe_census

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [15]:
# Separating the prediction data from the classes.
predicts = dataframe_census.iloc[:, 0:14].values
classes = dataframe_census.iloc[:, 14].values

In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder_predict = LabelEncoder()

#Treating the data, transforming categorical variables into numeric ones.
labels = np.array([1,3,5,6,7,8,9,13])
def LabelEncoderAll(vet):
    for i in vet:
        predicts[:, i] = labelencoder_predict.fit_transform(predicts[:, i])
        
LabelEncoderAll(labels)
predicts

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [8]:
# Separating data for training and testing.
predict_training, predict_test, classes_training, classes_test = train_test_split(predicts, classes, test_size=0.15, random_state=0)
print(predict_training.shape, predict_test.shape, classes_training.shape, classes_test.shape)

(27676, 14) (4885, 14) (27676,) (4885,)


In [9]:
# Using Naive Bayes
classifie = GaussianNB()
classifie.fit(predict_training, classes_training)

GaussianNB()

In [10]:
# Predicting test data.
prevision = classifie.predict(predict_test)
prevision

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype='<U6')

In [12]:
# confusion_matrix and accuracy_score will help to see the accuracy of the algorithm in that database.
from sklearn.metrics import confusion_matrix, accuracy_score
precision = accuracy_score(classes_test, prevision)
print(precision)

matrix = confusion_matrix(classes_test, prevision)
print(matrix)

0.7952917093142272
[[3516  177]
 [ 823  369]]


### Not always doing pre-processing can bring us better results, as we can see in the example with and without pre-processing, the "without" turned out to be more effective.