## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Importing and Reading Data as DataFrame

In [2]:
df = pd.read_csv("../data/train_.csv")
df.shape

(1460, 68)

## Getting Info about data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 68 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1201 non-null   float64
 1   LotArea        1460 non-null   int64  
 2   MasVnrArea     1452 non-null   float64
 3   BsmtFinSF1     1460 non-null   int64  
 4   BsmtFinSF2     1460 non-null   int64  
 5   BsmtUnfSF      1460 non-null   int64  
 6   2ndFlrSF       1460 non-null   int64  
 7   LowQualFinSF   1460 non-null   int64  
 8   WoodDeckSF     1460 non-null   int64  
 9   OpenPorchSF    1460 non-null   int64  
 10  EnclosedPorch  1460 non-null   int64  
 11  3SsnPorch      1460 non-null   int64  
 12  ScreenPorch    1460 non-null   int64  
 13  PoolArea       1460 non-null   int64  
 14  MiscVal        1460 non-null   int64  
 15  YrSold         1460 non-null   int64  
 16  SalePrice      1460 non-null   int64  
 17  MSZoning       1460 non-null   object 
 18  Street  

## Splitting data into Numerical and Categorical datapoints

In [4]:
numerical_data = df[['LotFrontage', 'LotArea', 
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', '2ndFlrSF', 'LowQualFinSF',
       'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'YrSold']]

target = df['SalePrice']

In [5]:
categorical_data = df[['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 
       'SaleType', 'SaleCondition', "MSSubClass", "OverallQual", "OverallCond", 
        "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", 
        "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "MoSold"]]

## Calculating Missing Data

In [6]:
def missingDataCalculator(df):
    missing_data_columns = []
    print('Total Length of DataFrame : ', len(df))
    print('*******************************************')
    
    for cols in df.columns:
        missing_data_points = df[cols].isnull().sum()
        data_percentage = (missing_data_points/len(df))*100
    
        if (data_percentage) > 0:
            print('Missing Data Column : ', cols)
            print('Missing Data Points : ', missing_data_points)
            print('Missing Data Percentage : ', data_percentage)
            print('*******************************************')
            missing_data_columns.append(cols)
        else:
            pass
        
    return missing_data_columns

## Numerical Data

### Basic Data Analysis

In [7]:
print('Shape_Numerical_Columns : ', numerical_data.shape)
print('****************************************************')
print('Describe_Numerical_Columns : ', numerical_data.describe())
print('****************************************************')
print('Info_Numerical_Columns : ', numerical_data.info())

Shape_Numerical_Columns :  (1460, 16)
****************************************************
Describe_Numerical_Columns :         LotFrontage        LotArea   MasVnrArea   BsmtFinSF1   BsmtFinSF2  \
count  1201.000000    1460.000000  1452.000000  1460.000000  1460.000000   
mean     70.049958   10516.828082   103.685262   443.639726    46.549315   
std      24.284752    9981.264932   181.066207   456.098091   161.319273   
min      21.000000    1300.000000     0.000000     0.000000     0.000000   
25%      59.000000    7553.500000     0.000000     0.000000     0.000000   
50%      69.000000    9478.500000     0.000000   383.500000     0.000000   
75%      80.000000   11601.500000   166.000000   712.250000     0.000000   
max     313.000000  215245.000000  1600.000000  5644.000000  1474.000000   

         BsmtUnfSF     2ndFlrSF  LowQualFinSF   WoodDeckSF  OpenPorchSF  \
count  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   
mean    567.240411   346.992466      5.84452

#### There are some missing values in the data

### Calculating % of Missing data

In [8]:
missingDataCalculator(numerical_data)

Total Length of DataFrame :  1460
*******************************************
Missing Data Column :  LotFrontage
Missing Data Points :  259
Missing Data Percentage :  17.73972602739726
*******************************************
Missing Data Column :  MasVnrArea
Missing Data Points :  8
Missing Data Percentage :  0.547945205479452
*******************************************


['LotFrontage', 'MasVnrArea']

### Imputing Missing Data

In [9]:
numerical_missing_values_columns = ['LotFrontage', 'MasVnrArea']

for missing_col in numerical_missing_values_columns:
    numerical_data[missing_col] = numerical_data[missing_col].fillna(numerical_data[missing_col].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data[missing_col] = numerical_data[missing_col].fillna(numerical_data[missing_col].mean())


### Calculating % of Missing data

In [10]:
missingDataCalculator(numerical_data)

Total Length of DataFrame :  1460
*******************************************


[]

### Encoding Data

### Numerical Data

In [11]:
numerical_data

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,2ndFlrSF,LowQualFinSF,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold
0,65.0,8450,196.0,706,0,150,854,0,0,61,0,0,0,0,0,2008
1,80.0,9600,0.0,978,0,284,0,0,298,0,0,0,0,0,0,2007
2,68.0,11250,162.0,486,0,434,866,0,0,42,0,0,0,0,0,2008
3,60.0,9550,0.0,216,0,540,756,0,0,35,272,0,0,0,0,2006
4,84.0,14260,350.0,655,0,490,1053,0,192,84,0,0,0,0,0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,0.0,0,0,953,694,0,0,40,0,0,0,0,0,2007
1456,85.0,13175,119.0,790,163,589,0,0,349,0,0,0,0,0,0,2010
1457,66.0,9042,0.0,275,0,877,1152,0,0,60,0,0,0,0,2500,2010
1458,68.0,9717,0.0,49,1029,0,0,0,366,0,112,0,0,0,0,2010


In [12]:
scaler = StandardScaler()
numerical_data = scaler.fit_transform(numerical_data)

In [13]:
numerical_data

array([[-0.22937175, -0.20714171,  0.51141841, ..., -0.06869175,
        -0.08768781,  0.13877749],
       [ 0.4519361 , -0.09188637, -0.57441047, ..., -0.06869175,
        -0.08768781, -0.61443862],
       [-0.09311018,  0.07347998,  0.32306034, ..., -0.06869175,
        -0.08768781,  0.13877749],
       ...,
       [-0.18395123, -0.14781027, -0.57441047, ..., -0.06869175,
         4.95311151,  1.64520971],
       [-0.09311018, -0.08016039, -0.57441047, ..., -0.06869175,
        -0.08768781,  1.64520971],
       [ 0.22483348, -0.05811155, -0.57441047, ..., -0.06869175,
        -0.08768781,  0.13877749]])

In [14]:
numerical_data_scaled = pd.DataFrame(numerical_data, columns=['LotFrontage', 'LotArea', 
                                                               'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                                               'BsmtUnfSF', '2ndFlrSF', 'LowQualFinSF',
                                                               'WoodDeckSF', 'OpenPorchSF',
                                                               'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 
                                                               'PoolArea', 'MiscVal',
                                                               'YrSold'])

In [15]:
numerical_data_scaled

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,2ndFlrSF,LowQualFinSF,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold
0,-0.229372,-0.207142,0.511418,0.575425,-0.288653,-0.944591,1.161852,-0.120242,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.138777
1,0.451936,-0.091886,-0.574410,1.171992,-0.288653,-0.641228,-0.795163,-0.120242,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.614439
2,-0.093110,0.073480,0.323060,0.092907,-0.288653,-0.301643,1.189351,-0.120242,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.138777
3,-0.456474,-0.096897,-0.574410,-0.499274,-0.288653,-0.061670,0.937276,-0.120242,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.367655
4,0.633618,0.375148,1.364570,0.463568,-0.288653,-0.174865,1.617877,-0.120242,0.780197,0.563760,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.138777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.365633,-0.260560,-0.574410,-0.973018,-0.288653,0.873321,0.795198,-0.120242,-0.752176,-0.100558,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.614439
1456,0.679039,0.266407,0.084843,0.759659,0.722112,0.049262,-0.795163,-0.120242,2.033231,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,1.645210
1457,-0.183951,-0.147810,-0.574410,-0.369871,-0.288653,0.701265,1.844744,-0.120242,-0.752176,0.201405,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,1.645210
1458,-0.093110,-0.080160,-0.574410,-0.865548,6.092188,-1.284176,-0.795163,-0.120242,2.168910,-0.704483,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,1.645210


## Categorical Data

### Basic Data Analysis

In [16]:
print('Shape_Categorical_Columns : ', categorical_data.shape)
print('****************************************************')
print('Describe_Categorical_Columns : ', categorical_data.describe())
print('****************************************************')
print('Info_Categorical_Columns : ', categorical_data.info())

Shape_Categorical_Columns :  (1460, 51)
****************************************************
Describe_Categorical_Columns :          MSSubClass  OverallQual  OverallCond  BsmtFullBath  BsmtHalfBath  \
count  1460.000000  1460.000000  1460.000000   1460.000000   1460.000000   
mean     56.897260     6.099315     5.575342      0.425342      0.057534   
std      42.300571     1.382997     1.112799      0.518911      0.238753   
min      20.000000     1.000000     1.000000      0.000000      0.000000   
25%      20.000000     5.000000     5.000000      0.000000      0.000000   
50%      50.000000     6.000000     5.000000      0.000000      0.000000   
75%      70.000000     7.000000     6.000000      1.000000      0.000000   
max     190.000000    10.000000     9.000000      3.000000      2.000000   

          FullBath     HalfBath  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd  \
count  1460.000000  1460.000000   1460.000000   1460.000000   1460.000000   
mean      1.565068     0.382877     

### Calculating % of Missing data

In [17]:
missing_data_columns = missingDataCalculator(categorical_data)

Total Length of DataFrame :  1460
*******************************************
Missing Data Column :  MasVnrType
Missing Data Points :  8
Missing Data Percentage :  0.547945205479452
*******************************************
Missing Data Column :  BsmtQual
Missing Data Points :  37
Missing Data Percentage :  2.5342465753424657
*******************************************
Missing Data Column :  BsmtCond
Missing Data Points :  37
Missing Data Percentage :  2.5342465753424657
*******************************************
Missing Data Column :  BsmtExposure
Missing Data Points :  38
Missing Data Percentage :  2.6027397260273974
*******************************************
Missing Data Column :  BsmtFinType1
Missing Data Points :  37
Missing Data Percentage :  2.5342465753424657
*******************************************
Missing Data Column :  BsmtFinType2
Missing Data Points :  38
Missing Data Percentage :  2.6027397260273974
*******************************************
Missing Data Column : 

### Imputing Categorical Missing Data

In [18]:
for missing_col in missing_data_columns:
    categorical_data[missing_col] = categorical_data[missing_col].fillna(str(categorical_data[missing_col].mode()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_data[missing_col] = categorical_data[missing_col].fillna(str(categorical_data[missing_col].mode()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_data[missing_col] = categorical_data[missing_col].fillna(str(categorical_data[missing_col].mode()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

### Calculating % of Missing data

In [19]:
missingDataCalculator(categorical_data)

Total Length of DataFrame :  1460
*******************************************


[]

### Concatenating Numerical and Categorical Data

In [20]:
dfs = [numerical_data_scaled, categorical_data]
data = pd.concat(dfs, axis=1)

In [21]:
data.shape

(1460, 67)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 67 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1460 non-null   float64
 1   LotArea        1460 non-null   float64
 2   MasVnrArea     1460 non-null   float64
 3   BsmtFinSF1     1460 non-null   float64
 4   BsmtFinSF2     1460 non-null   float64
 5   BsmtUnfSF      1460 non-null   float64
 6   2ndFlrSF       1460 non-null   float64
 7   LowQualFinSF   1460 non-null   float64
 8   WoodDeckSF     1460 non-null   float64
 9   OpenPorchSF    1460 non-null   float64
 10  EnclosedPorch  1460 non-null   float64
 11  3SsnPorch      1460 non-null   float64
 12  ScreenPorch    1460 non-null   float64
 13  PoolArea       1460 non-null   float64
 14  MiscVal        1460 non-null   float64
 15  YrSold         1460 non-null   float64
 16  MSZoning       1460 non-null   object 
 17  Street         1460 non-null   object 
 18  LotShape

### Encoding

### Categorical Data

In [23]:
categorical_data

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,MoSold
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,1,0,2,1,3,1,8,0,2,2
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,0,1,2,0,3,1,6,1,2,5
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,1,0,2,1,3,1,6,1,2,9
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,1,0,1,0,3,1,7,1,3,2
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,1,0,2,1,4,1,9,1,3,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,0,0,2,1,3,1,7,1,2,8
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,1,0,2,0,3,1,7,2,2,2
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,0,0,2,0,4,1,9,2,1,5
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,1,0,1,0,2,1,5,0,1,4


In [24]:
# categorical_data = df.select_dtypes(['O'])

In [25]:
categorical_data

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,MoSold
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,1,0,2,1,3,1,8,0,2,2
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,0,1,2,0,3,1,6,1,2,5
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,1,0,2,1,3,1,6,1,2,9
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,1,0,1,0,3,1,7,1,3,2
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,1,0,2,1,4,1,9,1,3,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,0,0,2,1,3,1,7,1,2,8
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,1,0,2,0,3,1,7,2,2,2
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,0,0,2,0,4,1,9,2,1,5
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,1,0,1,0,2,1,5,0,1,4


In [26]:
categorical_data = categorical_data.apply(LabelEncoder().fit_transform)

In [27]:
categorical_data

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,MoSold
0,3,1,3,3,0,4,0,5,2,2,...,1,0,2,1,3,1,6,0,2,1
1,3,1,3,3,0,2,0,24,1,2,...,0,1,2,0,3,1,4,1,2,4
2,3,1,0,3,0,4,0,5,2,2,...,1,0,2,1,3,1,4,1,2,8
3,3,1,0,3,0,0,0,6,2,2,...,1,0,1,0,3,1,5,1,3,1
4,3,1,0,3,0,2,0,15,2,2,...,1,0,2,1,4,1,7,1,3,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3,1,3,3,0,4,0,8,2,2,...,0,0,2,1,3,1,5,1,2,7
1456,3,1,3,3,0,4,0,14,2,2,...,1,0,2,0,3,1,5,2,2,1
1457,3,1,3,3,0,4,0,6,2,2,...,0,0,2,0,4,1,7,2,1,4
1458,3,1,3,3,0,4,0,12,2,2,...,1,0,1,0,2,1,3,0,1,3


### Concatenating Numerical and Categorical Data

In [28]:
data = pd.concat([numerical_data_scaled, categorical_data], axis = 1)

### Splitting Target and Train test Splits

In [29]:
data['SalePrice'] = target

In [30]:
target.isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
1455    False
1456    False
1457    False
1458    False
1459    False
Name: SalePrice, Length: 1460, dtype: bool

In [31]:
target.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1460 entries, 0 to 1459
Series name: SalePrice
Non-Null Count  Dtype
--------------  -----
1460 non-null   int64
dtypes: int64(1)
memory usage: 11.5 KB


In [32]:
target.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

#### Adding target to DataFrame

### Saving to a DataFrame

In [33]:
data.to_csv('../data/train__.csv', index = False)