
## Data preprocessing with scikit learn

In [36]:
from sklearn import datasets
import pandas as pd

# load iris dataset
df=  pd.read_csv('house-price-predciction/data/train.csv')

In [37]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [39]:
df=pd.get_dummies(df)

In [40]:
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [41]:
#Standard scaling

from sklearn.preprocessing import scale
# Standardizing each column of pizza_data
col_standardized = scale(df)
print('{}\n'.format(repr(col_standardized)))

# Column means (rounded to nearest thousandth)
col_means = col_standardized.mean(axis=0).round(decimals=3)
print('{}\n'.format(repr(col_means)))

# Column standard deviations
col_stds = col_standardized.std(axis=0)
print('{}\n'.format(repr(col_stds)))

array([[-1.73086488,  0.07337496, -0.20803433, ..., -0.11785113,
         0.4676514 , -0.30599503],
       [-1.7284922 , -0.87256276,  0.40989452, ..., -0.11785113,
         0.4676514 , -0.30599503],
       [-1.72611953,  0.07337496, -0.08444856, ..., -0.11785113,
         0.4676514 , -0.30599503],
       ...,
       [ 1.72611953,  0.30985939, -0.16683907, ..., -0.11785113,
         0.4676514 , -0.30599503],
       [ 1.7284922 , -0.87256276, -0.08444856, ..., -0.11785113,
         0.4676514 , -0.30599503],
       [ 1.73086488, -0.87256276,  0.20391824, ..., -0.11785113,
         0.4676514 , -0.30599503]])

array([ 0., -0., nan, -0.,  0.,  0.,  0.,  0., nan, -0., -0., -0.,  0.,
        0., -0.,  0., -0.,  0.,  0.,  0.,  0.,  0.,  0., -0., -0., nan,
        0., -0.,  0.,  0., -0.,  0.,  0.,  0., -0.,  0.,  0.,  0.,  0.,
       -0.,  0.,  0., -0., -0.,  0.,  0.,  0., -0.,  0.,  0., -0.,  0.,
        0., -0.,  0.,  0.,  0., -0.,  0.,  0.,  0., -0., -0., -0.,  0.,
       -0., -0.,  0., -0.,

Range scaling
Apart from standardizing data, we can also scale data by compressing it into a fixed range. One of the biggest use cases for this is compressing data into the range [0, 1]. This allows us to view the data in terms of proportions, or percentages, based on the minimum and maximum values in the data.



In [15]:
#MinMaxScaler

# predefined data


from sklearn.preprocessing import MinMaxScaler
default_scaler = MinMaxScaler() # the default range is [0,1]
transformed = default_scaler.fit_transform(df)
print('{}\n'.format(repr(transformed)))

custom_scaler = MinMaxScaler(feature_range=(-2, 3))
transformed = custom_scaler.fit_transform(df)
print('{}\n'.format(repr(transformed)))



array([[0.22222222, 0.625     , 0.06779661, 0.04166667, 0.        ],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667, 0.        ],
       [0.11111111, 0.5       , 0.05084746, 0.04166667, 0.        ],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667, 0.        ],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667, 0.        ],
       [0.30555556, 0.79166667, 0.11864407, 0.125     , 0.        ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333, 0.        ],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667, 0.        ],
       [0.02777778, 0.375     , 0.06779661, 0.04166667, 0.        ],
       [0.16666667, 0.45833333, 0.08474576, 0.        , 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667, 0.        ],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667, 0.        ],
       [0.13888889, 0.41666667, 0.06779661, 0.        , 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        , 0.        ],
       [0.41666667, 0.83333333, 0.

. Data standardization uses each feature's mean and standard deviation, while ranged scaling uses the maximum and minimum feature values, meaning that they're both susceptible to being skewed by outlier values.

We can robustly scale the data, i.e. avoid being affected by outliers, by using use the data's median and Interquartile Range (IQR). Since the median and IQR are percentile measurements of the data (50% for median, 25% to 75% for the IQR), they are not affected by outliers. For the scaling method, we just subtract the median from each data value then scale to the IQR.

In [16]:
#Robust Scaler


from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
transformed = robust_scaler.fit_transform(df)
print('{}\n'.format(repr(transformed)))

array([[-0.53846154,  1.        , -0.84285714, -0.73333333, -0.5       ],
       [-0.69230769,  0.        , -0.84285714, -0.73333333, -0.5       ],
       [-0.84615385,  0.4       , -0.87142857, -0.73333333, -0.5       ],
       [-0.92307692,  0.2       , -0.81428571, -0.73333333, -0.5       ],
       [-0.61538462,  1.2       , -0.84285714, -0.73333333, -0.5       ],
       [-0.30769231,  1.8       , -0.75714286, -0.6       , -0.5       ],
       [-0.92307692,  0.8       , -0.84285714, -0.66666667, -0.5       ],
       [-0.61538462,  0.8       , -0.81428571, -0.73333333, -0.5       ],
       [-1.07692308, -0.2       , -0.84285714, -0.73333333, -0.5       ],
       [-0.69230769,  0.2       , -0.81428571, -0.8       , -0.5       ],
       [-0.30769231,  1.4       , -0.81428571, -0.73333333, -0.5       ],
       [-0.76923077,  0.8       , -0.78571429, -0.73333333, -0.5       ],
       [-0.76923077,  0.        , -0.84285714, -0.8       , -0.5       ],
       [-1.15384615,  0.        , -0.9

### L2 Normalization for rows

L2 normalization applied to a particular row of a data array will divide each value in that row by the row's L2 norm. In general terms, the L2 norm of a row is just the square root of the sum of squared values for the row.
Mainly used in clustering

In [17]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
transformed = normalizer.fit_transform(df)
print('{}\n'.format(repr(transformed)))

array([[0.80377277, 0.55160877, 0.22064351, 0.0315205 , 0.        ],
       [0.82813287, 0.50702013, 0.23660939, 0.03380134, 0.        ],
       [0.80533308, 0.54831188, 0.2227517 , 0.03426949, 0.        ],
       [0.80003025, 0.53915082, 0.26087943, 0.03478392, 0.        ],
       [0.790965  , 0.5694948 , 0.2214702 , 0.0316386 , 0.        ],
       [0.78417499, 0.5663486 , 0.2468699 , 0.05808704, 0.        ],
       [0.78010936, 0.57660257, 0.23742459, 0.0508767 , 0.        ],
       [0.80218492, 0.54548574, 0.24065548, 0.0320874 , 0.        ],
       [0.80642366, 0.5315065 , 0.25658935, 0.03665562, 0.        ],
       [0.81803119, 0.51752994, 0.25041771, 0.01669451, 0.        ],
       [0.80373519, 0.55070744, 0.22325977, 0.02976797, 0.        ],
       [0.786991  , 0.55745196, 0.26233033, 0.03279129, 0.        ],
       [0.82307218, 0.51442011, 0.24006272, 0.01714734, 0.        ],
       [0.8025126 , 0.55989251, 0.20529392, 0.01866308, 0.        ],
       [0.81120865, 0.55945424, 0.

## Data Imputation methods

In [18]:
#replace nan values with - mean/median/mode/constant

sepal_len    0
sepal_wid    0
petal_len    0
petal_wid    0
class        0
dtype: int64

In [None]:
#Use any method below -SimpleImputer


#mean
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer()
transformed = imp_mean.fit_transform(df)
print('{}\n'.format(repr(transformed)))

#median

imp_median = SimpleImputer()
transformed = imp_median.fit_transform(df)
print('{}\n'.format(repr(transformed)))


#frequency

transformed = imp_frequent.fit_transform(df)
print('{}\n'.format(repr(transformed)))


#constant

from sklearn.impute import SimpleImputer
imp_constant = SimpleImputer(strategy='constant',
                             fill_value= -1)
transformed = imp_constant.fit_transform(df)
print('{}\n'.format(repr(transformed)))

In [51]:
#I will use median
from sklearn.impute import SimpleImputer
imp_median = SimpleImputer()
df = imp_median.fit_transform(df)
df=pd.DataFrame(df)
df.isnull().any().sum()

0

## PCA
Most datasets contain a large number of features, some of which are redundant or not informative. For example, in a dataset of basketball statistics, the total points and points per game for a player will (most of the time) tell the same story about the player's scoring prowess.

When a dataset contains these types of correlated numeric features, we can perform principal component analysis (PCA) for dimensionality reduction (i.e. reducing the number of columns in the data array).

PCA extracts the principal components of the dataset, which are an uncorrelated set of latent variables that encompass most of the information from the original dataset. Using a smaller set of principal components can make it a lot easier to use the dataset in statistical or machine learning models (especially when the original dataset contains many correlated features).

In [52]:
df.shape

#data must not contain nan values

(1460, 290)

In [55]:
from sklearn.decomposition import PCA
pca_obj = PCA() # The value of n_component will be m-1. As m is 290 and default is always m-1
pc = pca_obj.fit_transform(df).round(3)


pca_obj = PCA(n_components=100)
pc = pca_obj.fit_transform(df).round(3)


pca_obj = PCA(n_components=150)
pc = pca_obj.fit_transform(df).round(3)

pc=pd.DataFrame(pc)
print(pc.head())

         0         1        2        3        4        5        6        7    \
0  27493.526 -2993.926 -296.466  734.036 -135.679 -236.249  701.300 -156.871   
1    547.725  -929.991  742.681  242.244  -30.971 -267.735  675.765  -62.938   
2  42579.462  -701.927 -477.039  418.128 -106.840 -335.701  648.921 -139.962   
3 -40929.737   409.251 -574.721  102.288  -69.323  -18.370  787.669   15.157   
4  69169.766  1418.332 -521.347  376.756  -93.305   27.672  802.254 -151.238   

       8        9    ...    140    141    142    143    144    145    146  \
0  -20.940  -74.175  ... -0.026 -0.043  0.021  0.004  0.004  0.014 -0.017   
1    6.806   81.294  ... -0.143  0.011  0.024  0.153  0.067  0.013  0.070   
2  -21.274  -86.516  ...  0.033 -0.024  0.029  0.022 -0.009  0.042 -0.010   
3  -98.600 -145.571  ... -0.069 -0.134 -0.058 -0.038 -0.095  0.013 -0.136   
4 -103.877 -282.743  ...  0.014  0.078 -0.036  0.062  0.044 -0.052  0.102   

     147    148    149  
0 -0.005  0.010  0.008  
1 -0.0

In [None]:
#Regression type of problems


In [None]:
#statistical methods work better on small data sets and are used by data scientists to get information/insights