In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Fill Missing Data Example

In [6]:
# Toy roster used for the imputation demo: one team label and one age are
# deliberately missing (np.nan).
DataFrame = pd.DataFrame(
    data={
        'team': ['A', 'A', 'A', 'A', np.nan, 'B', 'B', 'B'],
        'position': ['G', 'G', 'F', 'G', 'F', 'G', 'C', 'C'],
        'age': [21, 22, 21.5, 50, 60, np.nan, 46, 70],
        'salary': [5000, 10000, 4000, 3000, 6000, 1000, 2000, 8000],
    }
)

In [7]:
# Names of all columns whose dtype is not 'object' (here: the numeric ones).
Numerical_Features = list(DataFrame.select_dtypes(exclude=['object']).columns)
print(Numerical_Features)

['age', 'salary']


In [9]:
# Sub-frame holding only the numeric columns (still contains the NaN age).
dataframe_n = DataFrame.loc[:, Numerical_Features]
print(dataframe_n)

    age  salary
0  21.0    5000
1  22.0   10000
2  21.5    4000
3  50.0    3000
4  60.0    6000
5   NaN    1000
6  46.0    2000
7  70.0    8000


In [10]:
# Fill every NaN with the mean of its own column.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
# Learn the column means and apply them in one call. The result printed below
# is a NumPy array, so dataframe_n is no longer a pandas DataFrame afterwards.
dataframe_n = imp_mean.fit_transform(dataframe_n)
print(dataframe_n)

[[   21.   5000. ]
 [   22.  10000. ]
 [   21.5  4000. ]
 [   50.   3000. ]
 [   60.   6000. ]
 [   41.5  1000. ]
 [   46.   2000. ]
 [   70.   8000. ]]


In [12]:
# Names of all columns stored with the 'object' dtype (the string-valued ones).
Categorical_Features = list(DataFrame.select_dtypes(include=['object']).columns)
print(Categorical_Features)

['team', 'position']


In [13]:
# Sub-frame holding only the categorical columns (still contains the NaN team).
dataframe_c = DataFrame.loc[:, Categorical_Features]
print(dataframe_c)

  team position
0    A        G
1    A        G
2    A        F
3    A        G
4  NaN        F
5    B        G
6    B        C
7    B        C


In [15]:
# Fill every missing category with the most frequent value of its column.
# Note: the variable is still called imp_mean although the strategy here is
# 'most_frequent'; it replaces the mean imputer created earlier.
imp_mean = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# Learn the per-column modes and apply them in one call; the result printed
# below is a NumPy array rather than a DataFrame.
dataframe_c = imp_mean.fit_transform(dataframe_c)
print(dataframe_c)

[['A' 'G']
 ['A' 'G']
 ['A' 'F']
 ['A' 'G']
 ['A' 'F']
 ['B' 'G']
 ['B' 'C']
 ['B' 'C']]


In [16]:
# Write the imputed values back into the original frame, one column group at a
# time. dataframe_n / dataframe_c are the arrays produced by the two imputers.
DataFrame[Numerical_Features] = dataframe_n
DataFrame[Categorical_Features] = dataframe_c

# Data Transformation

## Discretisation

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

In [18]:
# Complete (no missing values) roster used for the discretisation demo.
DataFrame2 = pd.DataFrame(
    data={
        'team': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
        'position': ['G', 'G', 'F', 'G', 'F', 'G', 'C', 'C'],
        'age': [21, 22, 21.5, 50, 60, 40, 46, 70],
        'salary': [5000, 10000, 4000, 3000, 6000, 1000, 2000, 8000],
    }
)

In [39]:
# Columns to be discretised.
# NOTE: despite the "Categorical" / "_c_" naming, exclude=['object'] keeps the
# non-object columns, i.e. the numeric ones (age and salary, as printed below).
Categorical_Features_2 = list(DataFrame2.select_dtypes(exclude=['object']).columns)
dataframe_c_2 = DataFrame2.loc[:, Categorical_Features_2]
print(dataframe_c_2)

    age  salary
0  21.0    5000
1  22.0   10000
2  21.5    4000
3  50.0    3000
4  60.0    6000
5  40.0    1000
6  46.0    2000
7  70.0    8000


In [40]:
# Split each selected column into three equal-width bins, encoding every value
# as the ordinal index of its bin.
est = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
)
# fit() hands back the fitted estimator, whose bin_edges_ attribute holds one
# array of edges per column.
c = est.fit(dataframe_c_2)
print(c.bin_edges_)

[array([21.        , 37.33333333, 53.66666667, 70.        ])
 array([ 1000.,  4000.,  7000., 10000.])]


In [41]:
# Replace every value with the ordinal index of the bin it falls into, using
# the discretiser fitted on this same data.
# NOTE(review): this rebinds dataframe_c_2 from the raw numeric frame to the
# binned array, so running the cell a second time would bin the bin codes
# rather than the original values. Re-run the cell that builds dataframe_c_2
# before re-running this one.
dataframe_c_2 = est.transform(dataframe_c_2)
print(dataframe_c_2)

[[0. 1.]
 [0. 2.]
 [0. 1.]
 [1. 0.]
 [2. 1.]
 [1. 0.]
 [1. 0.]
 [2. 2.]]


## Normalisation

In [28]:
from sklearn.preprocessing import MinMaxScaler

In [29]:
# Same complete roster as in the discretisation section, built again here as
# the input of the normalisation demo.
DataFrame2 = pd.DataFrame(
    data={
        'team': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
        'position': ['G', 'G', 'F', 'G', 'F', 'G', 'C', 'C'],
        'age': [21, 22, 21.5, 50, 60, 40, 46, 70],
        'salary': [5000, 10000, 4000, 3000, 6000, 1000, 2000, 8000],
    }
)

In [35]:
# Pick out the non-object (numeric) columns that will be rescaled.
Numerical_Features_2 = list(DataFrame2.select_dtypes(exclude=['object']).columns)
dataframe_n_2 = DataFrame2.loc[:, Numerical_Features_2]
print(dataframe_n_2)

    age  salary
0  21.0    5000
1  22.0   10000
2  21.5    4000
3  50.0    3000
4  60.0    6000
5  40.0    1000
6  46.0    2000
7  70.0    8000


In [36]:
# Rescale each numeric column linearly so that its minimum maps to 0 and its
# maximum maps to 1.
normaliser = MinMaxScaler(feature_range=(0, 1))
# Learn the per-column min/max, then apply the scaling to the same data.
norm_data = normaliser.fit(dataframe_n_2).transform(dataframe_n_2)
print(norm_data)

[[0.         0.44444444]
 [0.02040816 1.        ]
 [0.01020408 0.33333333]
 [0.59183673 0.22222222]
 [0.79591837 0.55555556]
 [0.3877551  0.        ]
 [0.51020408 0.11111111]
 [1.         0.77777778]]


In [44]:
# Overwrite the numeric columns of DataFrame2 with their scaled values; the
# other columns (team, position) are left untouched.
# NOTE(review): this modifies DataFrame2 itself, so the raw ages and salaries
# are no longer available from it after this cell has run.
DataFrame2[Numerical_Features_2] = norm_data
print(DataFrame2)

  team position       age    salary
0    A        G  0.000000  0.444444
1    A        G  0.020408  1.000000
2    A        F  0.010204  0.333333
3    A        G  0.591837  0.222222
4    B        F  0.795918  0.555556
5    B        G  0.387755  0.000000
6    B        C  0.510204  0.111111
7    B        C  1.000000  0.777778


# HANDS ON

In [None]:
# Hands-on exercise: load a CSV file into a DataFrame and print it.
# NOTE(review): the path below is an empty placeholder — put the location of
# the CSV file between the quotes before running this cell.
import pandas as pd
dataset = pd.read_csv(r'')
print(dataset)