In [36]:
import pandas as pd
import numpy as np
# Load the raw credit data set from the project's local datasets folder.
dataframe = pd.read_csv(filepath_or_buffer='./datasets/credit_data.csv')

In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns -- a quick way to spot missing values and out-of-range entries.
dataframe.describe()

Unnamed: 0,idclientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [38]:
# The summary statistics show a negative minimum age, which cannot be correct.
# Filter the frame with a boolean condition to list the rows that carry these
# invalid values.
dataframe[dataframe['age'].lt(0)]

Unnamed: 0,idclientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [39]:
# How to solve this problem -- option 1: delete the invalid data.
#
# Dropping the entire age column would also throw away every valid age, so it
# is not a good solution:
#     dataframe.drop(columns='age')
#
# Dropping only the rows with problematic values keeps the rest of the data:
#     dataframe.drop(index=dataframe.index[dataframe['age'] < 0])
#
# The result is reassigned instead of using inplace=True, and the axis is
# named through the index=/columns= keywords (the positional axis argument,
# e.g. drop('age', 1), is no longer accepted by recent pandas versions).
negative_age_rows = dataframe.index[dataframe['age'] < 0]
dataframe = dataframe.drop(index=negative_age_rows)

# Check: no row with a negative age is left, so this selection is empty.
dataframe.loc[dataframe['age'] < 0]

Unnamed: 0,idclientid,income,age,loan,default


In [40]:
# Load the data set again to bring back the rows with negative ages that were
# removed earlier.
dataframe = pd.read_csv('./datasets/credit_data.csv')

# Remember which rows are invalid BEFORE changing anything, so that exactly
# the same rows can be shown again after the replacement.
negative_age = dataframe['age'] < 0
print('negative age values')
print(dataframe.loc[negative_age])

# Option 2: replace the negative ages with the mean of the valid ages.
# The mean is taken over strictly positive ages only; missing ages (NaN) are
# skipped by mean() and are left untouched by the replacement.
mean_df = dataframe.loc[dataframe['age'] > 0, 'age'].mean()
print('\n')
print('Average of values WITH and WITHOUT negative values')
print(dataframe['age'].mean())
print(mean_df)
dataframe.loc[negative_age, 'age'] = mean_df
print('\n')
print('Showing only the values in which they were negative and are now with the DF average value')
# Select through the saved mask rather than `age == mean_df`: comparing floats
# for equality is fragile and would also match any untouched row whose age
# happens to equal the mean.
print(dataframe.loc[negative_age])

negative age values
    idclientid        income        age         loan  default
15          16  50501.726689 -28.218361  3977.287432        0
21          22  32197.620701 -52.423280  4244.057136        0
26          27  63287.038908 -36.496976  9595.286289        0


Average of values WITH and WITHOUT negative values
40.80755937840458
40.92770044906149


Showing only the values in which they were negative and are now with the DF average value
    idclientid        income      age         loan  default
15          16  50501.726689  40.9277  3977.287432        0
21          22  32197.620701  40.9277  4244.057136        0
26          27  63287.038908  40.9277  9595.286289        0


In [88]:
# Load the data set again to bring back the original negative ages.
dataframe = pd.read_csv('./datasets/credit_data.csv')
negative_age = dataframe['age'] < 0
print('negative age values')
print(dataframe.loc[negative_age])

# Option 3 (my own idea, not shown by the course teacher): treat the minus
# sign as a typing error and multiply each negative age by -1.
# `indexs` and `ages` hold the labels and the ORIGINAL (still negative) values
# of the affected rows; boolean selection returns a copy, so `ages` is not
# changed by the assignment below.
indexs = dataframe.index[negative_age]
ages = dataframe.loc[negative_age, 'age']

# One vectorised assignment replaces the row-by-row loop, which rescanned the
# whole index on every iteration. The values are aligned on the index labels.
dataframe.loc[negative_age, 'age'] = -ages

# Guard for re-runs: fail loudly if any negative age survived the correction.
assert not (dataframe['age'] < 0).any(), 'negative ages remain after sign flip'

negative age values
    idclientid        income        age         loan  default
15          16  50501.726689 -28.218361  3977.287432        0
21          22  32197.620701 -52.423280  4244.057136        0
26          27  63287.038908 -36.496976  9595.286289        0
