In [10]:
import pandas as pd
import numpy as np

# Treatment of inconsistent values

In [11]:
# Load the credit dataset from the local datasets folder.
dataframe = pd.read_csv('./datasets/credit_data.csv')

In [12]:
# Summary statistics per numeric column; useful for spotting out-of-range values and missing counts.
dataframe.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [13]:
'''
The summary statistics show a negative minimum for age, which cannot be a valid value.
A boolean mask on the age column selects the rows that hold those invalid ages.
'''
dataframe[dataframe['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [14]:
'''
Two ways of getting rid of the invalid ages.

Dropping the whole age column also throws away every valid age, so it is not a good solution:
dataframe.drop(columns='age', inplace=True)

Dropping only the rows whose age is negative keeps the rest of the data:
dataframe.drop(dataframe[dataframe.age < 0].index, inplace=True)
'''
dataframe.drop(index=dataframe.index[dataframe['age'] < 0], inplace=True)
# Should now display an empty frame: no negative ages are left.
dataframe[dataframe['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


In [15]:
'''
Reload the dataset so the rows with negative ages, dropped in the previous step, are present again.
'''
dataframe = pd.read_csv('./datasets/credit_data.csv')
print('negative age values')
print(dataframe[dataframe['age'] < 0])

# Alternative fix: overwrite each negative age with the mean age of the dataset,
# where the mean is computed from the positive ages only.
print('\n')
print('Average of values WITH and WITHOUT negative values')
print(dataframe['age'].mean())
mean_df = dataframe.loc[dataframe['age'] > 0, 'age'].mean()
print(mean_df)
dataframe.loc[dataframe['age'] < 0, 'age'] = mean_df
print('\n')
print('Showing only the values in which they were negative and are now with the DF average value')
# The replaced rows are the ones whose age is now exactly the computed mean.
print(dataframe[dataframe['age'] == mean_df])

negative age values
    clientid        income        age         loan  default
15        16  50501.726689 -28.218361  3977.287432        0
21        22  32197.620701 -52.423280  4244.057136        0
26        27  63287.038908 -36.496976  9595.286289        0


Average of values WITH and WITHOUT negative values
40.80755937840458
40.92770044906149


Showing only the values in which they were negative and are now with the DF average value
    clientid        income      age         loan  default
15        16  50501.726689  40.9277  3977.287432        0
21        22  32197.620701  40.9277  4244.057136        0
26        27  63287.038908  40.9277  9595.286289        0


In [16]:
'''
Reload the dataset to get the original negative ages back (the previous step overwrote them with the mean).
'''
dataframe = pd.read_csv('./datasets/credit_data.csv')
print('negative age values')
print(dataframe.loc[dataframe['age'] < 0])

# Own idea, not from the course: assume the minus sign is a typing error and
# multiply every negative age by -1.
negative_age = dataframe['age'] < 0
indexs = dataframe.index[negative_age]
ages = dataframe.loc[negative_age, 'age']
# One vectorised assignment replaces the row-by-row loop; values align on the index.
dataframe.loc[negative_age, 'age'] = ages * (-1)
print('\n')
print('Dataframe with negative age values multiplied by -1')
# Print each corrected row on its own, one frame per row.
for i in indexs:
    print(dataframe.loc[[i]])

negative age values
    clientid        income        age         loan  default
15        16  50501.726689 -28.218361  3977.287432        0
21        22  32197.620701 -52.423280  4244.057136        0
26        27  63287.038908 -36.496976  9595.286289        0


Dataframe with negative age values multiplied by -1
    clientid        income        age         loan  default
15        16  50501.726689  28.218361  3977.287432        0
    clientid        income       age         loan  default
21        22  32197.620701  52.42328  4244.057136        0
    clientid        income        age         loan  default
26        27  63287.038908  36.496976  9595.286289        0


# Handling of missing values

In [17]:
# Reload the raw data and list the rows whose age is missing (NaN).
dataframe = pd.read_csv('./datasets/credit_data.csv')
dataframe[dataframe['age'].isnull()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [18]:
'''
Some rows have no age, so those gaps must be filled with an estimated value.
First split the frame into the input attributes (columns 1 to 3: income, age, loan)
and the class column (column 4: default).
'''
predict = dataframe.iloc[:, 1:4].values
classes = dataframe.iloc[:, 4].values
print('-------------- predict --------------', predict, sep='\n')
print('\n')
print('-------------- classes --------------', classes, sep='\n')

-------------- predict --------------
[[6.61559251e+04 5.90170151e+01 8.10653213e+03]
 [3.44151540e+04 4.81171531e+01 6.56474502e+03]
 [5.73171701e+04 6.31080495e+01 8.02095330e+03]
 ...
 [4.43114493e+04 2.80171669e+01 5.52278669e+03]
 [4.37560566e+04 6.39717958e+01 1.62272260e+03]
 [6.94365796e+04 5.61526170e+01 7.37883360e+03]]


-------------- classes --------------
[0 0 0 ... 1 0 0]


In [19]:
# Replace the NaN values in the predictors with the mean of their column.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# `predict` comes from DataFrame.values, so it may share memory with the DataFrame
# (or be read-only under pandas Copy-on-Write). Rebinding to the array returned by
# fit_transform avoids writing into that buffer in place.
predict = imputer.fit_transform(predict)

# Check whether any NaN is left: np.where returns one index array per axis,
# so two empty tuples mean no NaN remains.
list(map(tuple, np.where(np.isnan(predict))))

[(), ()]

# Scaling of attributes
### Padronização (Standardisation)
$$x' = \frac{x - \text{mean}(x)}{\text{standard deviation}(x)}$$

### Normalização (Normalization)
$$x' = \frac{x - \min(x)}{\max(x) - \min(x)}$$

In [20]:
'''
Attributes measured on very different scales (for example income is much larger than age)
can cause problems for some algorithms. Scaling the attributes puts them on a comparable
scale so that each one carries the same importance.
'''
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
# Learn the per-column mean and standard deviation and apply them in one step.
predict = scaler.fit_transform(predict)
predict

array([[ 1.45393393,  1.33786439,  1.20281942],
       [-0.76217555,  0.53704215,  0.69642695],
       [ 0.83682073,  1.63843621,  1.17471147],
       ...,
       [-0.07122592, -0.93972115,  0.35420081],
       [-0.11000289,  1.7018964 , -0.92675625],
       [ 1.682986  ,  1.1274146 ,  0.96381038]])

In [21]:
'''
With the basic pre-processing techniques covered, move on to a more complex dataset
taken from the UCI Machine Learning Repository:
https://archive.ics.uci.edu/ml/index.php

Specifically the Adult dataset, read here from the local census.csv file:
https://archive.ics.uci.edu/ml/datasets/Adult
'''
dataframe_adult = pd.read_csv('./datasets/census.csv')
print(dataframe_adult.shape)
dataframe_adult

(32561, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# Transformation of categorical variables

In [22]:
# Columns 0-13 are the input attributes; column 14 (income) is the class to predict.
predict_adult = dataframe_adult.iloc[:, 0:14].values
classes_adult = dataframe_adult.iloc[:, 14].values
print('-------------- predict --------------', predict_adult, sep='\n')
print('\n')
print('-------------- classes --------------', classes_adult, sep='\n')

-------------- predict --------------
[[39 ' State-gov' 77516 ... 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ... 0 13 ' United-States']
 [38 ' Private' 215646 ... 0 40 ' United-States']
 ...
 [58 ' Private' 151910 ... 0 40 ' United-States']
 [22 ' Private' 201490 ... 0 20 ' United-States']
 [52 ' Self-emp-inc' 287927 ... 0 40 ' United-States']]


-------------- classes --------------
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


In [23]:
from sklearn.preprocessing import LabelEncoder

labelencoder_predict = LabelEncoder()

# Positions of the categorical columns inside predict_adult.
labels = np.array([1, 3, 5, 6, 7, 8, 9, 13])


def LabelEncoderAll(vet):
    """Label-encode the given columns of the global ``predict_adult`` in place.

    vet: iterable of column positions to encode.

    The single global ``labelencoder_predict`` is re-fitted for every column,
    so after the call it only holds the mapping of the last column in ``vet``.
    """
    for column in vet:
        encoded = labelencoder_predict.fit_transform(predict_adult[:, column])
        predict_adult[:, column] = encoded


LabelEncoderAll(labels)
predict_adult

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [24]:
'''
Take the race attribute as an example: label encoding turned its categories into integers,
but those integers suggest an order that does not exist (no race ranks above another).
One-hot encoding gives each category its own 0/1 column, so the algorithm cannot read
a larger code as a larger value.
'''
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Race column, sliced as 8:9 so it stays two-dimensional.
transformed_predict_adult = predict_adult[:, 8:9]
print(transformed_predict_adult)
onehotencorder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [0])],
    remainder='passthrough',
)
race_one_hot = onehotencorder.fit_transform(transformed_predict_adult)
transformed_predict_adult = race_one_hot.toarray()
print('-------------- before --------------', predict_adult[:, 8:9], sep='\n')
print('-------------- after --------------', transformed_predict_adult, sep='\n')

[[4]
 [4]
 [4]
 ...
 [4]
 [4]
 [4]]
-------------- before --------------
[[4]
 [4]
 [4]
 ...
 [4]
 [4]
 [4]]
-------------- after --------------
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [25]:
'''
Apply the same one-hot encoding to every categorical column at once;
the remaining columns are passed through unchanged.
'''
print(predict_adult.shape)
onehotencorder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough',
)
encoded_adult = onehotencorder.fit_transform(predict_adult)
predict_adult = encoded_adult.toarray()
print(predict_adult.shape)

(32561, 14)
(32561, 108)


In [26]:
'''
The class column is the last thing left to encode: map its labels to integers.
'''
labelencorder_classes = LabelEncoder().fit(classes_adult)
classes_adult = labelencorder_classes.transform(classes_adult)
classes_adult

array([0, 0, 0, ..., 0, 0, 1])

# Scaling of attributes

In [27]:
'''
Algorithms that rely on Euclidean distance, KNN for example, are affected by the scale of
each attribute. Standardising the attributes puts them on a comparable scale before such
algorithms are applied.
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(predict_adult)
predict_adult = scaler.transform(predict_adult)
predict_adult

array([[-0.2444502 , -0.17429511, -0.26209736, ...,  0.1484529 ,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -2.22215312],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       ...,
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -1.65522476],
       [-0.2444502 , -0.17429511, -0.26209736, ...,  1.88842434,
        -0.21665953, -0.03542945]])

# Credit dataset split for testing and training

In [28]:
'''
Putting the pre-processing steps into practice on the credit dataset.

Start again from the raw file, apply the treatments studied above (fix invalid ages,
fill in missing values) and split the result into training and test sets.
'''
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

dataframe = pd.read_csv('./datasets/credit_data.csv')

# Replace the negative ages with the mean of the positive ages.
# The 'age' column label is required: with only a row mask, .loc would write the
# mean into every column of those rows, including the class column.
df_mean = dataframe['age'][dataframe.age > 0].mean()
print(df_mean)
dataframe.loc[dataframe['age'] < 0, 'age'] = df_mean

# Separate the input attributes (columns 1-3) from the class (column 4).
predict = dataframe.iloc[:, 1:4].values
classes = dataframe.iloc[:, 4].values

# Replace NaN with the column mean across all three predictor columns.
# Rebinding to the returned array avoids writing in place into an array obtained
# from DataFrame.values, which may share memory with the frame or be read-only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
predict = imputer.fit_transform(predict)

# Split into training (75%) and test (25%) sets; the fixed seed makes the split repeatable.
predict_training, predict_test, classes_training, classes_test = train_test_split(
    predict, classes, test_size=0.25, random_state=0
)
print(predict_training.shape, predict_test.shape, classes_training.shape, classes_test.shape)

40.92770044906149
(1500, 3) (500, 3) (1500,) (500,)


# Census dataset split for testing and training

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Census pipeline from the raw file: load, encode, scale, split.
dataframe = pd.read_csv('./datasets/census.csv')
print('shape of dataset', dataframe.shape)

# Columns 0-13 are the input attributes; column 14 is the class.
predict = dataframe.iloc[:, 0:14].values
classes = dataframe.iloc[:, 14].values

# One-hot encode the categorical columns; the other columns pass through unchanged.
column_tranformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough',
)
predict = column_tranformer.fit_transform(predict).toarray()

# Map the class labels to integers.
labelencoder_classes = LabelEncoder().fit(classes)
classes = labelencoder_classes.transform(classes)

# NOTE(review): the scaler is fitted on all rows before the split, so the test rows
# contribute to the scaling statistics; confirm this is intended.
scaler = StandardScaler().fit(predict)
predict = scaler.transform(predict)

# Split into training (85%) and test (15%) sets with a fixed seed.
predict_training, predict_test, classes_training, classes_test = train_test_split(
    predict, classes, test_size=0.15, random_state=0
)
print(predict_training.shape, predict_test.shape, classes_training.shape, classes_test.shape)

shape of dataset (32561, 15)
(27676, 108) (4885, 108) (27676,) (4885,)
