In [1]:
# ANSI escape sequences used for headings throughout the notebook:
# `bold` turns bold text on, `cbold` resets the formatting again.
bold = '\033[1m'
cbold = '\033[0m'

print(f'{bold}Data Processing Basics Notebook{cbold}')
print(
    'Note: All the data-processing transformations must be applied to'
    f'{bold} training set only{cbold}'
    ' though this notebook has applied it to complete dataset.'
)
# NOTE(review): the wording below says feature scaling must not be applied to
# the test set; commonly the scaler is fitted on the training set only and then
# used to transform the test set -- confirm the intended meaning. The spelling
# 'featue' is kept so the printed text stays unchanged.
print(
    'Some transformations are required for test set too, eg: NaN value replacement, '
    'whereas some must strictly not be applied to test set, eg: featue scaling.'
)

[1mData Processing Basics Notebook[0m
Note: All the data-processing transformations must be applied to[1m training set only[0m though this notebook has applied it to complete dataset.
Some transformations are required for test set too, eg: NaN value replacement, whereas some must strictly not be applied to test set, eg: featue scaling.


In [2]:
import pandas as pd
import numpy as np
import sklearn as sk

In [3]:
# CSV locations: `path` is the housing data loaded below, `path2` is the
# Titanic test split that a later cell reads for the second imputer demo.
# NOTE(review): both are absolute, machine-specific paths.
path = '/home/ubuntu/IntroToML/housing.csv'
path2 = '/home/ubuntu/Machine-Learning-Algorithms/titanic_test.csv'

dataset = pd.read_csv(path)
# One print call with a newline separator emits the same text as printing the
# title and the first five rows separately.
print('Preview of dataset:\n', dataset.head(), sep='\n')

Preview of dataset:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [4]:
print("Some general analysis of dataset: \n")
# DataFrame.info() writes its summary directly to stdout and returns None, so
# it is called on its own; wrapping it in print() added a stray "None" line.
dataset.info()
print('********************************************************************************\n')
# describe() returns a DataFrame of summary statistics, so it does need print().
print(dataset.describe())

Some general analysis of dataset: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
********************************************************************************

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20

In [5]:
# Section heading, rendered in bold via the escape codes defined in the first cell.
print(f"{bold}Data Cleaning(Numerical data){cbold}")

[1mData Cleaning(Numerical data)[0m


In [6]:
print('Using dropna() to drop samples with NaN:')

# how='any': drop every sample (row) that contains at least one NaN value.
print('Dummy dataset size before dropna():', dataset.shape)
dataset.dropna(inplace=True,how='any')
print('Dummy dataset size after dropna():', dataset.shape)

# Reload the data so the second variant starts from the untouched file.
dataset = pd.read_csv(path)

# how='all': drop only the samples (rows) in which every value is NaN.
# The "before" size is reported once, ahead of the call, and the "after" size
# once, following it (previously "before" was printed a second time after the
# rows had already been dropped).
print('\n\nDummy dataset size before dropna():', dataset.shape)
dataset.dropna(inplace=True,how='all')
print('Dummy dataset size after dropna():', dataset.shape)

Using dropna() to drop samples with NaN:
Dummy dataset size before dropna(): (20640, 10)
Dummy dataset size after dropna(): (20433, 10)
Dummy dataset size before dropna(): (20640, 10)


Dummy dataset size before dropna(): (20640, 10)
Dummy dataset size after dropna(): (20640, 10)


In [7]:
print("Using drop() to drop features and samples:")

# --- Dropping features (columns) ---
print('\nDropping features:\n')
dataset = pd.read_csv(path)
# NOTE(review): 'Colunms' in the label below is a typo; kept so the output is unchanged.
print("Colunms before drop():\n",dataset.columns)
# The labels are column names, so they are passed through columns=
# (equivalent to giving the labels together with axis=1).
dataset = dataset.drop(columns=['total_rooms','ocean_proximity'])
print("\nColumns after drop():\n",dataset.columns)

# --- Dropping samples (rows) ---
print("\n\nDropping samples:")
dataset = pd.read_csv(path)
print("\nSamples before drop():\n ",dataset.iloc[:5,1:4])
# The labels are index values here: the rows labelled 1, 2 and 3 are removed.
dataset = dataset.drop(index=[1,2,3])
print("\nSamples after drop():\n",dataset.iloc[:5,1:4])

Using drop() to drop features and samples:

Dropping features:

Colunms before drop():
 Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

Columns after drop():
 Index(['longitude', 'latitude', 'housing_median_age', 'total_bedrooms',
       'population', 'households', 'median_income', 'median_house_value'],
      dtype='object')


Dropping samples:

Samples before drop():
     latitude  housing_median_age  total_rooms
0     37.88                41.0        880.0
1     37.86                21.0       7099.0
2     37.85                52.0       1467.0
3     37.85                52.0       1274.0
4     37.85                52.0       1627.0

Samples after drop():
    latitude  housing_median_age  total_rooms
0     37.88                41.0        880.0
4     37.85                52.0       1627.0
5     37.85                52.0   

In [8]:
# Three ways of replacing NaN values; the data is reloaded before each one so
# every variant starts from the same untouched file.

dataset = pd.read_csv(path)
print('Replacing NaN with a constant value:')
print("Columns containing NaN values before fillna(): ",dataset.columns[dataset.isnull().any()].tolist())
dataset.fillna(0,inplace=True)#replacing all NaN values with the constant value 0
print("Columns containing NaN values after fillna(): ",dataset.columns[dataset.isnull().any()].tolist())

dataset = pd.read_csv(path)
print('\n\nReplacing NaN with the mean:')
print("Columns containing NaN values before fillna(): ",dataset.columns[dataset.isnull().any()].tolist())
# A dict maps a column name to its fill value, so only 'total_bedrooms' is filled.
dataset.fillna({
    'total_bedrooms':dataset['total_bedrooms'].mean()#replaces the NaN with mean value
},inplace=True)
print("Columns containing NaN values after fillna(): ",dataset.columns[dataset.isnull().any()].tolist())

dataset = pd.read_csv(path)
print('\n\nUsing methods of fillna like method=(ffill/bfill/etc)')
print("Columns containing NaN values before fillna(): ",dataset.columns[dataset.isnull().any()].tolist())
# DataFrame.ffill() is the dedicated forward-fill method; it replaces
# fillna(method='ffill'), whose `method` argument is deprecated in recent pandas.
dataset.ffill(inplace=True)
print("Columns containing NaN values after fillna(): ",dataset.columns[dataset.isnull().any()].tolist())

Replacing NaN with a constant value:
Columns containing NaN values before fillna():  ['total_bedrooms']
Columns containing NaN values after fillna():  []


Replacing NaN with the mean:
Columns containing NaN values before fillna():  ['total_bedrooms']
Columns containing NaN values after fillna():  []


Using methods of fillna like method=(ffill/bfill/etc)
Columns containing NaN values before fillna():  ['total_bedrooms']
Columns containing NaN values after fillna():  []


In [9]:
print('Using Imputer to handle NaN values: ')

# The text column is removed right after loading; only the numeric columns
# are handed to the imputer.
dataset = pd.read_csv(path).drop(columns='ocean_proximity')

print('Columns with NaN values before imputation: ',dataset.columns[dataset.isna().any()].tolist())

from sklearn.impute import SimpleImputer
# strategy can be 'mean', 'median', 'most_frequent' or 'constant';
# with 'constant' the replacement is supplied through fill_value.
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
temp = imputer.fit_transform(dataset)
# fit_transform returned a plain array here, so the column labels are re-attached.
dataset = pd.DataFrame(temp,columns=dataset.columns)

print("Columns with Nan after imputation: ",dataset.columns[dataset.isna().any()].tolist())

print('\n\nUsing Imputer with another dataset: ')
dataset = pd.read_csv(path2).drop(columns=['Name','Sex','Ticket','Cabin','Embarked'])
print('Dataset before imputation: ')
print(dataset.iloc[20:30])
# Every NaN is replaced by the constant 999.9 this time.
imputer = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=999.9)
temp = imputer.fit_transform(dataset)
dataset = pd.DataFrame(temp,columns=dataset.columns)
print('\nDataset after imputation:\n',dataset.iloc[20:30])

Using Imputer to handle NaN values: 
Columns with NaN values before imputation:  ['total_bedrooms']
Columns with Nan after imputation:  []


Using Imputer with another dataset: 
Dataset before imputation: 
    PassengerId  Pclass   Age  SibSp  Parch      Fare
20          912       1  55.0      1      0   59.4000
21          913       3   9.0      0      1    3.1708
22          914       1   NaN      0      0   31.6833
23          915       1  21.0      0      1   61.3792
24          916       1  48.0      1      3  262.3750
25          917       3  50.0      1      0   14.5000
26          918       1  22.0      0      1   61.9792
27          919       3  22.5      0      0    7.2250
28          920       1  41.0      0      0   30.5000
29          921       3   NaN      2      0   21.6792

Dataset after imputation:
     PassengerId  Pclass    Age  SibSp  Parch      Fare
20        912.0     1.0   55.0    1.0    0.0   59.4000
21        913.0     3.0    9.0    0.0    1.0    3.1708
22     

In [10]:
# Section heading, rendered in bold via the escape codes defined in the first cell.
print(f"{bold}Handling text and Categorical attributes:{cbold}")

[1mHandling text and Categorical attributes:[0m


In [11]:
dataset = pd.read_csv(path)
print('Preview of data:\n')
print(dataset.head())

print('\n\nUsing LabelEncoder class to convert string categorical data to integer values:\n')
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
temp = encoder.fit_transform(dataset['ocean_proximity'])

dataset.drop(['ocean_proximity'],axis=1,inplace=True)#dropping the previous categorical feature ocean_proximity
dataset['ocean_proximity'] = temp #appending the new integer based categorical feature
print(dataset.head())

print('\n\nUsing OneHotEncoder class to convert the integer categorical data into multiple binary-valued features called one-hot vectors:\n')
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
#fit_transform of OneHotEncoder requires 2D array, thus we use reshape on the dataset['ocean_proximity'] to obtain 2D array feature
temp = encoder.fit_transform(dataset['ocean_proximity'].values.reshape(-1,1))
temp = temp.toarray()#temp is a scipy sparse matrix and thus we converted it into 2D matrix form
dataset.drop('ocean_proximity',axis=1,inplace=True)#dropping previous ocean_proximity feature
#adding the one-hot vectors to the dataset as op1, op2, ...
#The number of columns is read from the encoder output instead of being fixed at
#five, so every category gets a column whatever the number of categories is.
for i in range(temp.shape[1]):
    dataset['op'+str(i+1)] = temp[:,i]
print(dataset.head())

Preview of data:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


Using LabelEncoder class to convert string categorical 

In [12]:
dataset = pd.read_csv(path)
print('Preview of data:\n')
print(dataset.head())
print('\nUsing LabelBinarizer class to convert string categorial data into integer and subsequently into one-hot vectors.\n')
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
#LabelBinarizer is fitted on a 1D array of labels, so the column is passed as it is (no reshape needed)
temp = encoder.fit_transform(dataset['ocean_proximity'])
#temp is an np.array object and is a 2D matrix thus we do not have to use toarray() as we did in the previous case
dataset.drop(['ocean_proximity'],axis=1,inplace=True)#dropping previous ocean_proximity feature
#adding the one-hot vectors to the dataset as op1, op2, ...
#The number of columns is read from the encoder output instead of being fixed at
#five, so the cell does not depend on the data having exactly five categories.
for i in range(temp.shape[1]):
    dataset['op'+str(i+1)] = temp[:,i]
print(dataset.head())

Preview of data:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Using LabelBinarizer class to convert string categorial 

In [13]:
# Section heading, then a fresh copy of the housing data for this section.
print(f'{bold}Custom Transformers:{cbold}')
dataset = pd.read_csv(path)
# One print call with a newline separator emits the same text as printing the
# title and the first five rows separately.
print('\nPreview of the dataset:\n', dataset.head(), sep='\n')

#custom transformer
from sklearn.base import BaseEstimator,TransformerMixin

# Positions of the columns used by AttributeAdder. In housing.csv these are
# total_rooms, total_bedrooms, population and households respectively.
rooms = 3
bedrooms = 4
population = 5
households = 6

class AttributeAdder(BaseEstimator,TransformerMixin): #Custom transformer
    """Append ratio features as extra columns on the right of a 2D array.

    Each constructor flag switches one derived column on or off:
    bedrooms_room        -> bedrooms / rooms
    rooms_household      -> rooms / households
    population_household -> population / households

    The source columns are located through the module-level indices
    `rooms`, `bedrooms`, `population` and `households`.
    """
    def __init__(self,bedrooms_room=True,rooms_household=True,population_household=True):
        self.bedrooms_room = bedrooms_room
        self.rooms_household = rooms_household
        self.population_household = population_household

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return X with the enabled ratio columns appended; prints what was added."""
        # (enabled flag, numerator index, denominator index, printed label),
        # listed in the order in which the columns are appended
        candidates = (
            (self.bedrooms_room, bedrooms, rooms, ' Bedrooms per room'),
            (self.rooms_household, rooms, households, ' Rooms per household'),
            (self.population_household, population, households, ' Population per household'),
        )
        print('\nFeatures added:')
        for enabled, numerator, denominator, label in candidates:
            if not enabled:
                continue
            # new columns go on the right, so the source indices stay valid
            X = np.c_[X, X[:,numerator]/X[:,denominator]]
            print(label)
        print('\n')
        return X
    
# All three constructor flags default to True, so every ratio feature is added.
adder = AttributeAdder()
# fit_transform() is available through the inherited mixin: it runs fit() and
# then transform() on the raw array behind the frame.
temp = adder.fit_transform(dataset.values)
# The result is a plain array, so the new frame gets default integer column labels.
dataset = pd.DataFrame(temp)
print('Preview of dataset after adding new feature;\n', dataset.head(), sep='\n')

[1mCustom Transformers:[0m

Preview of the dataset:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Features added:
 Be

In [14]:
print(f'{bold}Feature Scaling:{cbold}')
dataset = pd.read_csv(path)

print('Preview of dataset:\n', dataset.head(), sep='\n')

# --- Min-max scaling: every numeric feature is rescaled into the range (0,1) ---
from sklearn.preprocessing import MinMaxScaler
# The scaler cannot operate on non-numerical values, so the text column is taken
# out of the frame (pop removes it and returns it) and put back after scaling.
op = dataset.pop('ocean_proximity')
scaler = MinMaxScaler(feature_range=(0,1))
temp = scaler.fit_transform(dataset)
dataset = pd.DataFrame(temp,columns=dataset.columns)
dataset['ocean_proximity'] = op
print('\n\nDataset after MinMax scaling:\n', dataset.head(), sep='\n')

# --- Standard scaling, starting again from the untouched file ---
dataset = pd.read_csv(path)

from sklearn.preprocessing import StandardScaler
# Same handling of the text column as above.
op = dataset.pop('ocean_proximity')
scaler = StandardScaler()
temp = scaler.fit_transform(dataset)
dataset = pd.DataFrame(temp,columns=dataset.columns)
dataset['ocean_proximity'] = op
print('\n\nDataset after Standard scaling:\n', dataset.head(), sep='\n')

[1mFeature Scaling:[0m
Preview of dataset:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


Dataset after MinMax scalin

In [15]:
print(f'{bold}Transformation Pipeline:{cbold}')
#A pipeline performs multiple transformations in one go; this section combines
#the transformations shown so far.

dataset = pd.read_csv(path)
print('Preview of data:\n', dataset.head(), sep='\n')

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

#Transformations are applied to the numerical and to the string features separately.
#This transformer selects the features a pipeline works on by dropping the columns listed in attr.
class AttributeSelector(BaseEstimator,TransformerMixin):
    """Remove the columns named in `attr` from a DataFrame.

    attr: column label(s) to drop, passed on to DataFrame.drop(..., axis=1).
    """
    def __init__(self,attr):
        self.attr = attr

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return a new DataFrame without the `attr` columns.

        The frame passed in is left untouched. Dropping in place would also
        remove the columns from the caller's frame, which is shared with any
        other pipeline that is run on the same data.
        """
        return X.drop(self.attr,axis=1)

#The transformer class below makes sure that the frame handed to the category pipeline
#contains the category feature 'ocean_proximity', in case an earlier step removed that
#column from it. It is placed as the first step of the category pipeline.
class AppendingCatData(BaseEstimator,TransformerMixin):
    """Attach the stored category values as column 'ocean_proximity'.

    attr: the values to store under 'ocean_proximity' (the notebook passes the
    original dataset['ocean_proximity'] column).
    """
    def __init__(self,attr):
        self.attr = attr

    def fit(self,X,y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return a copy of X with 'ocean_proximity' set to the stored values."""
        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        X['ocean_proximity'] = self.attr
        return X
        
        
#Column labels each selector has to drop, worked out from the dtypes of the
#freshly loaded data before any pipeline runs.
non_float_columns = dataset.columns[dataset.dtypes.values != np.dtype('float64')]#everything that is not float64
non_object_columns = dataset.columns[dataset.dtypes.values != np.dtype('object')]#everything that is not object (string)

#Pipeline for numerical features: keep the float64 columns, fill NaN with the
#median, add the ratio features, then standardise.
numeric_steps = [
    ('selector',AttributeSelector(non_float_columns)),
    ('imputer',SimpleImputer(strategy='median')),
    ('attri_adder',AttributeAdder()),
    ('scaling',StandardScaler()),
]
numpipeline = Pipeline(numeric_steps)

#Pipeline for categorical features (most probably string/character features):
#attach 'ocean_proximity', keep only the object columns, then one-hot encode.
#NOTE(review): newer scikit-learn releases name the `sparse` argument
#`sparse_output` -- confirm against the installed version.
categorical_steps = [
    ('appendingCatAttr',AppendingCatData(dataset['ocean_proximity'])),
    ('selector',AttributeSelector(non_object_columns)),
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
]
catpipeline = Pipeline(categorical_steps)


from sklearn.pipeline import FeatureUnion

#Combining the pipelines for numeric and non-numeric features; the outputs of
#the two pipelines are placed side by side in the listed order.
union_members = [
    ('number_pipeline',numpipeline),
    ('category_pipeline',catpipeline),
]
combinedpipeline = FeatureUnion(transformer_list=union_members)

#calling the combined pipeline to process the complete data at once
temp = combinedpipeline.fit_transform(dataset)
#The result is wrapped without column labels, so the frame uses default integer labels.
dataset = pd.DataFrame(temp)
print('\n\nAfter data-processing pipeline:\n', dataset.head(), sep='\n')

[1mTransformation Pipeline:[0m
Preview of data:

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Features added:
 Bedroo