# Introduction 

# Step 0

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from numpy import linalg as LA
import matplotlib.pyplot as plt

from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [None]:
# Step 0
# Load the data 

In [None]:
df = pd.read_csv('/content/sample_data/houses-prices.csv')

# Step 1

In [None]:
# Step 1
# Start to understand them :
#    - In a business way (description, google, ...) 
#    - In a mathematical way (correlation between them, range, mean , type, ...)


In [None]:
df.corr()['SalePrice']

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [None]:
columns_str=['MSZoning','Street','Alley',
            'LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood',
             'Condition1','Condition2','BldgType','HouseStyle',
            'RoofStyle','RoofMatl','Exterior1st',"Exterior2nd","MasVnrType","ExterQual",
            'ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure',
            'BsmtFinType1','BsmtFinType2','Heating','HeatingQC','Electrical',
            'KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish',
            'GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature',
            'SaleType','SaleCondition']

col_bin=["CentralAir"]

In [None]:
str_keys=pd.DataFrame(columns=columns_str)
max_len=0
my_list=[]
for i in columns_str:
    unique_len=len(df[i].unique())
    if unique_len>max_len:
        max_len=unique_len
for i in columns_str:
    temp=[]
    for u in range(max_len):
        if u >= len(df[i].unique()):
            temp.append(np.nan)
        else:
            temp.append(df[i].unique()[u])
    my_list.append(temp)

In [None]:
np.asarray(my_list)

array([['RL', 'RM', 'C (all)', ..., 'nan', 'nan', 'nan'],
       ['Pave', 'Grvl', 'nan', ..., 'nan', 'nan', 'nan'],
       ['nan', 'Grvl', 'Pave', ..., 'nan', 'nan', 'nan'],
       ...,
       ['nan', 'Shed', 'Gar2', ..., 'nan', 'nan', 'nan'],
       ['WD', 'New', 'COD', ..., 'nan', 'nan', 'nan'],
       ['Normal', 'Abnorml', 'Partial', ..., 'nan', 'nan', 'nan']],
      dtype='<U7')

In [None]:
str_keys=pd.DataFrame(columns=columns_str)
for i,u in enumerate(columns_str):
    str_keys[u]=np.asarray(my_list[i])

In [None]:
for ind,col in enumerate(df.columns):
    if col in columns_str:
        num=str_keys[col].dropna().index
        val=str_keys[col].dropna().values
        df.iloc[:,ind]=df[col].replace(val,num)
    

In [None]:
str_keys

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RM,Grvl,Grvl,IR1,Bnk,NoSeWa,FR2,Mod,Veenker,Feedr,Artery,2fmCon,1Story,Hip,WdShngl,MetalSd,MetalSd,,TA,Gd,CBlock,TA,Gd,Gd,ALQ,BLQ,GasW,Gd,FuseF,TA,Min1,TA,Detchd,Unf,Fa,Fa,N,Ex,MnPrv,Shed,New,Abnorml
2,C (all),,Pave,IR2,Low,,Corner,Sev,Crawfor,PosN,RRNn,Duplex,1.5Fin,Gambrel,Metal,Wd Sdng,Wd Shng,Stone,Ex,Fa,BrkTil,Ex,,Mn,Unf,,Grav,TA,FuseA,Ex,Maj1,Gd,BuiltIn,Fin,Gd,,P,Fa,GdWo,Gar2,COD,Partial
3,FV,,,IR3,HLS,,CulDSac,,NoRidge,Artery,Feedr,TwnhsE,1.5Unf,Mansard,WdShake,HdBoard,HdBoard,BrkCmn,Fa,Po,Wood,,Fa,Av,Rec,ALQ,Wall,Fa,FuseP,Fa,Min2,Fa,CarPort,,,Gd,,Gd,GdPrv,Othr,ConLD,AdjLand
4,RH,,,,,,FR3,,Mitchel,RRAe,PosN,Twnhs,SFoyer,Flat,Membran,BrkFace,Plywood,,,Ex,Slab,Fa,Po,,BLQ,Rec,OthW,Po,Mix,,Mod,Ex,,,Ex,Po,,,MnWw,TenC,ConLI,Alloca
5,,,,,,,,,Somerst,RRNn,PosA,,SLvl,Shed,Tar&Grv,WdShing,Wd Sdng,,,,Stone,,,,,LwQ,Floor,,,,Maj2,Po,Basment,,Po,Ex,,,,,CWD,Family
6,,,,,,,,,NWAmes,RRAn,RRAn,,2.5Unf,,Roll,CemntBd,CmentBd,,,,,,,,LwQ,GLQ,,,,,Sev,,2Types,,,,,,,,ConLw,
7,,,,,,,,,OldTown,PosA,RRAe,,2.5Fin,,ClyTile,Plywood,BrkFace,,,,,,,,,,,,,,,,,,,,,,,,Con,
8,,,,,,,,,BrkSide,RRNe,,,,,,AsbShng,Stucco,,,,,,,,,,,,,,,,,,,,,,,,Oth,
9,,,,,,,,,Sawyer,,,,,,,Stucco,AsbShng,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,0,65.0,8450,0,,0,0,0,0,0,0,0,0,0,0,7,5,2003,2003,0,0,0,0,0.0,196.0,0,0,0,0.0,0.0,0.0,0.0,706,0.0,0,150,856,0,...,Y,0.0,856,854,0,1710,1,0,2,1,3,1,0,8,0,0,,0.0,2003.0,0.0,2,548,0.0,0.0,0,0,61,0,0,0,0,,,,0,2,2008,0,0,208500
1,2,20,0,80.0,9600,0,,0,0,0,1,0,1,1,0,0,1,6,8,1976,1976,0,0,1,1,1.0,0.0,1,0,1,0.0,0.0,1.0,1.0,978,0.0,0,284,1262,0,...,Y,0.0,1262,0,0,1262,0,1,2,0,3,1,1,6,0,1,1.0,0.0,1976.0,0.0,2,460,0.0,0.0,0,298,0,0,0,0,0,,,,0,5,2007,0,0,181500
2,3,60,0,68.0,11250,0,,1,0,0,0,0,0,0,0,0,0,7,5,2001,2002,0,0,0,0,0.0,162.0,0,0,0,0.0,0.0,2.0,0.0,486,0.0,0,434,920,0,...,Y,0.0,920,866,0,1786,1,0,2,1,3,1,0,6,0,1,1.0,0.0,2001.0,0.0,2,608,0.0,0.0,0,0,42,0,0,0,0,,,,0,9,2008,0,0,223500
3,4,70,0,60.0,9550,0,,1,0,0,2,0,2,0,0,0,0,7,5,1915,1970,0,0,2,2,1.0,0.0,1,0,2,1.0,1.0,0.0,1.0,216,0.0,0,540,756,0,...,Y,0.0,961,756,0,1717,1,0,1,0,3,1,0,7,0,1,2.0,1.0,1998.0,1.0,3,642,0.0,0.0,0,0,35,272,0,0,0,,,,0,2,2006,0,1,140000
4,5,60,0,84.0,14260,0,,1,0,0,1,0,3,0,0,0,0,8,5,2000,2000,0,0,0,0,0.0,350.0,0,0,0,0.0,0.0,3.0,0.0,655,0.0,0,490,1145,0,...,Y,0.0,1145,1053,0,2198,1,0,2,1,4,1,0,9,0,1,1.0,0.0,2000.0,0.0,3,836,0.0,0.0,0,192,84,0,0,0,0,,,,0,12,2008,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,0,62.0,7917,0,,0,0,0,0,0,17,0,0,0,0,6,5,1999,2000,0,0,0,0,1.0,0.0,1,0,0,0.0,0.0,0.0,2.0,0,0.0,0,953,953,0,...,Y,0.0,953,694,0,1647,0,0,2,1,3,1,1,7,0,1,1.0,0.0,1999.0,0.0,2,460,0.0,0.0,0,0,40,0,0,0,0,,,,0,8,2007,0,0,175000
1456,1457,20,0,85.0,13175,0,,0,0,0,0,0,6,0,0,0,1,6,6,1978,1988,0,0,7,4,2.0,119.0,1,0,1,0.0,0.0,0.0,1.0,790,4.0,163,589,1542,0,...,Y,0.0,2073,0,0,2073,1,0,2,0,3,1,1,7,1,2,1.0,0.0,1978.0,1.0,2,500,0.0,0.0,0,349,0,0,0,0,0,,1.0,,0,2,2010,0,0,210000
1457,1458,70,0,66.0,9042,0,,0,0,0,0,0,2,0,0,0,0,7,9,1941,2006,0,0,6,6,1.0,0.0,2,1,5,1.0,1.0,0.0,0.0,275,0.0,0,877,1152,0,...,Y,0.0,1188,1152,0,2340,0,0,2,0,4,1,0,9,0,2,2.0,0.0,1941.0,0.0,1,252,0.0,0.0,0,0,60,0,0,0,0,,3.0,1.0,2500,5,2010,0,0,266500
1458,1459,20,0,68.0,9717,0,,0,0,0,0,0,11,0,0,0,1,5,6,1950,1996,1,0,1,1,1.0,0.0,1,0,1,1.0,0.0,2.0,0.0,49,4.0,1029,0,1078,0,...,Y,2.0,1078,0,0,1078,1,0,1,0,2,1,0,5,0,0,,0.0,1950.0,1.0,1,240,0.0,0.0,0,366,0,112,0,0,0,,,,0,4,2010,0,0,142125


In [None]:
df.describe()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460.0,1201.0,1460.0,1460.0,91.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1452.0,1460.0,1460.0,1460.0,1423.0,1423.0,1422.0,1423.0,1460.0,1422.0,1460.0,1460.0,1460.0,1460.0,1460.0,1459.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,770.0,1379.0,1379.0,1379.0,1460.0,1460.0,1379.0,1379.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,7.0,281.0,54.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,0.340411,70.049958,10516.828082,0.00411,1.450549,0.408219,0.195205,0.000685,0.596575,0.062329,9.478082,0.383562,0.034247,0.444521,1.144521,6.099315,5.575342,1971.267808,1984.865753,0.267808,0.065068,2.203425,2.532192,0.802342,103.685262,0.720548,0.14863,0.726712,0.724526,0.14617,0.720816,1.767393,443.639726,0.436006,46.549315,567.240411,1057.429452,0.039041,0.854795,0.156271,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,0.720548,6.517808,0.172603,0.613014,1.776623,0.522843,1978.506164,0.949239,1.767123,472.980137,0.074692,0.072516,0.10274,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,2.142857,1.729537,1.166667,43.489041,6.321918,2007.815753,0.234932,0.35,180921.19589
std,421.610009,42.300571,0.798309,24.284752,9981.264932,0.063996,0.500305,0.582296,0.64201,0.026171,1.010359,0.276232,5.837783,1.208607,0.374191,1.068108,1.324754,1.382997,1.112799,30.202904,20.645407,0.611903,0.539603,2.507095,2.662342,0.630202,181.066207,0.574763,0.440222,0.831177,0.815641,0.575885,1.137971,1.618287,456.098091,1.313514,161.319273,441.866955,438.705324,0.306179,0.959501,0.530529,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,0.706371,1.625393,0.717214,0.644666,0.871138,0.907168,24.689725,0.747733,0.747315,213.804841,0.398934,0.44326,0.365201,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,0.899735,0.924749,0.57462,496.123024,2.703626,1.328095,0.804553,0.88787,79442.502883
min,1.0,20.0,0.0,21.0,1300.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,2006.0,0.0,0.0,34900.0
25%,365.75,20.0,0.0,59.0,7553.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,223.0,795.75,0.0,0.0,0.0,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,0.0,5.0,0.0,0.0,1.0,0.0,1961.0,0.0,1.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,1.0,1.0,0.0,5.0,2007.0,0.0,0.0,129975.0
50%,730.5,50.0,0.0,69.0,9478.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,1.0,6.0,5.0,1973.0,1994.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,383.5,0.0,0.0,477.5,991.5,0.0,0.0,0.0,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,1.0,6.0,0.0,1.0,2.0,0.0,1980.0,1.0,2.0,480.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,6.0,2008.0,0.0,0.0,163000.0
75%,1095.25,70.0,0.0,80.0,11601.5,0.0,2.0,1.0,0.0,0.0,1.0,0.0,13.0,0.0,0.0,0.0,1.0,7.0,6.0,2000.0,2004.0,0.0,0.0,3.0,4.0,1.0,166.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,712.25,0.0,0.0,808.0,1298.25,0.0,2.0,0.0,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,1.0,7.0,0.0,1.0,2.0,1.0,2002.0,2.0,2.0,576.0,0.0,0.0,0.0,168.0,68.0,0.0,0.0,0.0,0.0,3.0,2.0,1.0,0.0,8.0,2009.0,0.0,0.0,214000.0
max,1460.0,190.0,4.0,313.0,215245.0,1.0,2.0,3.0,3.0,1.0,4.0,2.0,24.0,8.0,7.0,4.0,7.0,10.0,9.0,2010.0,2010.0,5.0,7.0,14.0,15.0,3.0,1600.0,3.0,4.0,5.0,4.0,4.0,3.0,6.0,5644.0,6.0,1474.0,2336.0,6110.0,5.0,4.0,4.0,4692.0,2065.0,572.0,5642.0,3.0,2.0,3.0,2.0,8.0,3.0,3.0,14.0,6.0,3.0,5.0,6.0,2010.0,2.0,4.0,1418.0,5.0,5.0,2.0,857.0,547.0,552.0,508.0,480.0,738.0,3.0,4.0,4.0,15500.0,12.0,2010.0,8.0,5.0,755000.0


# Step 2

In [None]:
# Step 2
# Choose a problem :
# - Price prediction based on caracteristic (Regression problem)


Comments 

# Step 3

In [None]:
# Step 3
# Try a naive way to first get a firstscore (the least with no real improvement) :
#   Use the simplest algorithm
#   Use raw data (no improvement, don't use if it's not fitting)
#   train and score using train test spliting

# measure the performances :
# Score of prediction (choose depending on your probleme, precision or F1)
# time to train
# time to predict

# We will uses theses metrics to track our improvement :

## test on raw data

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
df=df.apply(pd.to_numeric, errors='coerce')
df=df.fillna(0)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,0,65.0,8450,0,0.0,0,0,0,0,0,0,0,0,0,0,7,5,2003,2003,0,0,0,0,0.0,196.0,0,0,0,0.0,0.0,0.0,0.0,706,0.0,0,150,856,0,...,0.0,0.0,856,854,0,1710,1,0,2,1,3,1,0,8,0,0,0.0,0.0,2003.0,0.0,2,548,0.0,0.0,0,0,61,0,0,0,0,0.0,0.0,0.0,0,2,2008,0,0,208500
1,2,20,0,80.0,9600,0,0.0,0,0,0,1,0,1,1,0,0,1,6,8,1976,1976,0,0,1,1,1.0,0.0,1,0,1,0.0,0.0,1.0,1.0,978,0.0,0,284,1262,0,...,0.0,0.0,1262,0,0,1262,0,1,2,0,3,1,1,6,0,1,1.0,0.0,1976.0,0.0,2,460,0.0,0.0,0,298,0,0,0,0,0,0.0,0.0,0.0,0,5,2007,0,0,181500
2,3,60,0,68.0,11250,0,0.0,1,0,0,0,0,0,0,0,0,0,7,5,2001,2002,0,0,0,0,0.0,162.0,0,0,0,0.0,0.0,2.0,0.0,486,0.0,0,434,920,0,...,0.0,0.0,920,866,0,1786,1,0,2,1,3,1,0,6,0,1,1.0,0.0,2001.0,0.0,2,608,0.0,0.0,0,0,42,0,0,0,0,0.0,0.0,0.0,0,9,2008,0,0,223500
3,4,70,0,60.0,9550,0,0.0,1,0,0,2,0,2,0,0,0,0,7,5,1915,1970,0,0,2,2,1.0,0.0,1,0,2,1.0,1.0,0.0,1.0,216,0.0,0,540,756,0,...,0.0,0.0,961,756,0,1717,1,0,1,0,3,1,0,7,0,1,2.0,1.0,1998.0,1.0,3,642,0.0,0.0,0,0,35,272,0,0,0,0.0,0.0,0.0,0,2,2006,0,1,140000
4,5,60,0,84.0,14260,0,0.0,1,0,0,1,0,3,0,0,0,0,8,5,2000,2000,0,0,0,0,0.0,350.0,0,0,0,0.0,0.0,3.0,0.0,655,0.0,0,490,1145,0,...,0.0,0.0,1145,1053,0,2198,1,0,2,1,4,1,0,9,0,1,1.0,0.0,2000.0,0.0,3,836,0.0,0.0,0,192,84,0,0,0,0,0.0,0.0,0.0,0,12,2008,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,0,62.0,7917,0,0.0,0,0,0,0,0,17,0,0,0,0,6,5,1999,2000,0,0,0,0,1.0,0.0,1,0,0,0.0,0.0,0.0,2.0,0,0.0,0,953,953,0,...,0.0,0.0,953,694,0,1647,0,0,2,1,3,1,1,7,0,1,1.0,0.0,1999.0,0.0,2,460,0.0,0.0,0,0,40,0,0,0,0,0.0,0.0,0.0,0,8,2007,0,0,175000
1456,1457,20,0,85.0,13175,0,0.0,0,0,0,0,0,6,0,0,0,1,6,6,1978,1988,0,0,7,4,2.0,119.0,1,0,1,0.0,0.0,0.0,1.0,790,4.0,163,589,1542,0,...,0.0,0.0,2073,0,0,2073,1,0,2,0,3,1,1,7,1,2,1.0,0.0,1978.0,1.0,2,500,0.0,0.0,0,349,0,0,0,0,0,0.0,1.0,0.0,0,2,2010,0,0,210000
1457,1458,70,0,66.0,9042,0,0.0,0,0,0,0,0,2,0,0,0,0,7,9,1941,2006,0,0,6,6,1.0,0.0,2,1,5,1.0,1.0,0.0,0.0,275,0.0,0,877,1152,0,...,0.0,0.0,1188,1152,0,2340,0,0,2,0,4,1,0,9,0,2,2.0,0.0,1941.0,0.0,1,252,0.0,0.0,0,0,60,0,0,0,0,0.0,3.0,1.0,2500,5,2010,0,0,266500
1458,1459,20,0,68.0,9717,0,0.0,0,0,0,0,0,11,0,0,0,1,5,6,1950,1996,1,0,1,1,1.0,0.0,1,0,1,1.0,0.0,2.0,0.0,49,4.0,1029,0,1078,0,...,0.0,2.0,1078,0,0,1078,1,0,1,0,2,1,0,5,0,0,0.0,0.0,1950.0,1.0,1,240,0.0,0.0,0,366,0,112,0,0,0,0.0,0.0,0.0,0,4,2010,0,0,142125


In [None]:
df2 = df.copy()

In [None]:
size=len(df)
X_train=np.asarray(df2.drop(columns='SalePrice').iloc[:int(0.7*size),:])
Y_train=np.asarray(df2['SalePrice'].iloc[:int(0.7*size)])

X_test=np.asarray(df2.drop(columns='SalePrice').iloc[int(0.7*size):,:])
Y_test=np.asarray(df2['SalePrice'].iloc[int(0.7*size):])

In [None]:
reg = LinearRegression().fit(X_train, Y_train)
reg.score(X_test, Y_test)


0.585541768917838

# Step 4

In [None]:
# Step 4
# Try to reduced the dataset using a naive way (techniques seen in courses 2 - features selections)
# measure the naive algorithm performances on this REDUCED dataset:
# Score of prediction (choose depending on your probleme, precision or F1)
# time to train
# size of reduced dataset

# We will uses theses metrics to track our improvement :

Feature selction using SelectFromModel

In order to keep fetures which really matters for our prediction, we will select only certain fetures thanks to a true or false list. This list would be a naive way of choosing the right to keep. 

We use the SelectFromModel but we could have chosen others. This one is more mathematics. 

We drop first our target. And define it as the target. 

In [None]:
df3 = df.copy()

In [None]:
df3_features = df3.drop(columns='SalePrice')
df3_target = df3['SalePrice']


df3_columns = list(df.columns)
df3_columns2 = df3_columns.remove('SalePrice')
print(df3_columns)

['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC'

We apply again the Linear Regression as an estimator. 

In [None]:
# Choose and load the estimator / model
linear_regression_estimator = LinearRegression()

# Load the selector
select_from_model_selector = SelectFromModel(linear_regression_estimator, max_features=5, threshold=-np.inf)

# Fit the selector on data
select_from_model_selector.fit(df3_features, df3_target)



Let's create a new dataframe with the columns corresponding to all the features, their coefficient associated and if they are selected or not (with True or False). 

In [None]:
# Gather the features selected
df3_features_selected = pd.DataFrame(df3_columns,columns=["features"])

# display the coeff, the features and the selected status
df3_features_selected['coeff'] = select_from_model_selector.estimator_.coef_
df3_features_selected['is_selected'] = select_from_model_selector.get_support()


Now, we can see which features would be relevant to select. We observe that these features are the most correlate to the sale price : 
- KitchenAbvGr : kitchens abode grade
- OverallQual : rates the orverall material and finish of the house
- Utilities 
- PoolQC : Pool quality
- Street

In [None]:

df3_features_selected.sort_values('is_selected', ascending = False,inplace=True)
print(df3_features_selected)


         features          coeff  is_selected
52   KitchenAbvGr  -19167.092670         True
17    OverallQual   13776.473338         True
9       Utilities  -57393.123733         True
72         PoolQC -189346.928836         True
5          Street  -35547.452775         True
..            ...            ...          ...
76         MoSold     -25.640074        False
77         YrSold     -85.546849        False
78       SaleType    2500.578872        False
62     GarageArea       8.597381        False
79  SaleCondition     643.576122        False

[80 rows x 3 columns]


Let's try hte RFE on our dataset. The RFE is the recursive feature elimination. It removes each features based on their coef score provide by the estimator (ie LR or SVC) : the feature with the lowest coef is removed. Improvement is'nt based on the model score, but on the model coef calculation : The RFE choice the features depending on model, it's not always relevant. 

In [None]:
# Load the selector on a defined number of feature to select
rfe_selector = RFE(linear_regression_estimator, n_features_to_select=4)
# fit on train set
fited_rfe_selector = rfe_selector.fit(df3_features, df3_target)


In [None]:

df3_features_selected['ranking'] = fited_rfe_selector.ranking_
df3_features_selected.sort_values('ranking', inplace=True)

print(df3_features_selected)

         features         coeff  is_selected  ranking
16     HouseStyle    314.098266        False        1
7        LotShape   1480.345616        False        1
63     GarageQual   6557.564331        False        1
10      LotConfig   1011.916290        False        1
47   BsmtFullBath   6563.000064        False        2
..            ...           ...          ...      ...
76         MoSold    -25.640074        False       73
5          Street -35547.452775         True       74
68  EnclosedPorch     -0.157509        False       75
52   KitchenAbvGr -19167.092670         True       76
12   Neighborhood   -276.720543        False       77

[80 rows x 4 columns]


# Step 5

In [None]:
# Step 5
# based on you first discovery, clean and enhanced data :
#    - convert label into number
#    - add features (calculated from other, look on hide information )
#    -  ...

# Step 6

In [None]:
# Step 6
# Try to improve your SCORE on ENHANCED dataset :
# using your first algorithm
# using other algorithm

# for each model/algorithm, measure the performances :
# Score of prediction (choose depending on your probleme, precision or F1)
# time to train
# size of used dataset

In [None]:
df4 = df.copy()

# we check that there is no nan and inf
df4[~df4.isin([np.nan, np.inf, -np.inf]).any(1)]

# we put everything in float
df4= df4.astype(float)

print(df4)

          Id  MSSubClass  MSZoning  ...  SaleType  SaleCondition  SalePrice
0        1.0        60.0       0.0  ...       0.0            0.0   208500.0
1        2.0        20.0       0.0  ...       0.0            0.0   181500.0
2        3.0        60.0       0.0  ...       0.0            0.0   223500.0
3        4.0        70.0       0.0  ...       0.0            1.0   140000.0
4        5.0        60.0       0.0  ...       0.0            0.0   250000.0
...      ...         ...       ...  ...       ...            ...        ...
1455  1456.0        60.0       0.0  ...       0.0            0.0   175000.0
1456  1457.0        20.0       0.0  ...       0.0            0.0   210000.0
1457  1458.0        70.0       0.0  ...       0.0            0.0   266500.0
1458  1459.0        20.0       0.0  ...       0.0            0.0   142125.0
1459  1460.0        20.0       0.0  ...       0.0            0.0   147500.0

[1460 rows x 81 columns]


In [None]:
df4_features = df4.drop('SalePrice', axis=1)
df4_target = df4['SalePrice']
print(df4_features)

          Id  MSSubClass  MSZoning  ...  YrSold  SaleType  SaleCondition
0        1.0        60.0       0.0  ...  2008.0       0.0            0.0
1        2.0        20.0       0.0  ...  2007.0       0.0            0.0
2        3.0        60.0       0.0  ...  2008.0       0.0            0.0
3        4.0        70.0       0.0  ...  2006.0       0.0            1.0
4        5.0        60.0       0.0  ...  2008.0       0.0            0.0
...      ...         ...       ...  ...     ...       ...            ...
1455  1456.0        60.0       0.0  ...  2007.0       0.0            0.0
1456  1457.0        20.0       0.0  ...  2010.0       0.0            0.0
1457  1458.0        70.0       0.0  ...  2010.0       0.0            0.0
1458  1459.0        20.0       0.0  ...  2010.0       0.0            0.0
1459  1460.0        20.0       0.0  ...  2008.0       0.0            0.0

[1460 rows x 80 columns]


We drop the columns if the present the same values

In [None]:
#df4_features = df4.drop('MSZoning')
#, 'SaleType', 'Street', 'Alley', 'LandContour', 'Utilities','LandSlope'

KeyError: ignored

In [None]:
#using linalg from numpy, extract the eigen values from the avocado dataset

values = LA.eigvals(df4_features.corr())



LinAlgError: ignored

In [None]:
# using Matplotlib or Seaborn, draw a scree plot of the previous calculated eigen values

plt.scatter(range(1,values.size+1),values)
plt.plot(range(1,values.size+1),values)
plt.hlines(1, 0, values.size, colors='r')
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()


How many to keep if we want to reduce the dataset?

In [None]:
df4_features_columns = list(df4_features.columns)

# use the train_test_split function to create 4 dataset :
# avocado_features_train, avocado_features_test, avocado_target_train, avocado_target_test
df4_features_train, df4_features_test, df4_target_train, df4_target_test = train_test_split(df4_features, df4_target, test_size=0.20, random_state=42)

# initiate the LinearSVC model
linear_svc_model = LinearSVC()
# fit the data on the train data and target
linear_svc_model.fit(df4_features_train,df4_target_train)
#Score the prediction score on the test data and target
print(linear_svc_model.score(df4_features_test,df4_target_test))

df4_target_predicted = linear_svc_model.predict(df4_features_test)

print(f1_score(df4_target_test, df4_target_predicted, average='weighted'))


0.0
0.0




 In this contexte, the algorithm doesn't perform well
The linear SVC use distance calculation to perform its prediction.
we are going to normalize or data in order to reduce the value of the distance


In [None]:

# use the StandardScaler to remove the mean and scaling data to unit variance
scaler = StandardScaler()
#calculating the mean and variance
scaler.fit(df4_features)
#removing the mean and scaling to unit variance
scaled_df4_features = scaler.transform(df4_features)

# Do the procedure to create the 4 datasets, train the model and score its prediction

scaled_df4_features_train, scaled_df4_features_test, df4_target_train, df4_target_test = train_test_split(scaled_df4_features, df4_target, test_size=0.20, random_state=42)

linear_svc_model = LinearSVC()
linear_svc_model.fit(scaled_df4_features_train,df4_target_train)


print(linear_svc_model.score(scaled_df4_features_test,df4_target_test))


print(f1_score(df4_target_test,linear_svc_model.predict(scaled_df4_features_test),average='weighted'))


0.0
0.0




In [None]:
# Now that we have a score for comparaison, we are going to reduced the dataset using the factor
# and we are going to score the prediction using the reduced data set

# use the StandardScaler to remove the mean and scaling data to unit variance
scaler = StandardScaler()
#calculating the mean and variance
scaler.fit(df4_features)
#removing the mean and scaling to unit variance
scaled_df4_features = scaler.transform(df4_features)

# create the FactorAnalysis and fit it on the dataset in order to have a new dataset with only 6 columns
factor_analysis_df4 = FactorAnalysis(n_components = 6).fit(scaled_df4_features)
# create a transformed_scaled_avocado_features dataset using the FactorAnalysis previously created
transformed_scaled_df4_features = factor_analysis_df4.transform(scaled_df4_features)

# Do the procedure to create the 4 datasets, train the model and score its prediction
transformed_scaled_df4_features_train, transformed_scaled_df4_features_test, df4_target_train, df4_target_test = train_test_split(transformed_scaled_df4_features, df4_target, test_size=0.20, random_state=42)

linear_svc_model = LinearSVC()
linear_svc_model.fit(transformed_scaled_df4_features_train,df4_target_train)

#print(linear_svc_model.score(transformed_scaled_avocado_features_test,avocado_target_test))

print(f1_score(df4_target_test, linear_svc_model.predict(transformed_scaled_df4_features_test), average='weighted'))


# what do you think about this score compare to the 'optimal' one ?



0.0018873668188736682




# Step 7

In [None]:
# Step 7
# Try to reduced the dataset using a smarter technique (SVD, FA, PCA)
# Try to improve your SCORE, TIME TRAINED on ENHANCED dataset :
# using your first algorithm
# using other algorithm

# for each model/algorithm, measure the performances with your REDUCED dataset:
# Score of prediction (choose depending on your probleme, precision or F1)
# time to train
# size of reduced dataset

In [None]:
df=df.fillna(0)
#df=df.apply(float)
df['CentralAir']=df['CentralAir'].apply(clean)


In [None]:
def clean(x):
    if x == 'N':
        return 0
    else :
        return 1

In [None]:
from sklearn.decomposition import PCA
from numpy.linalg import svd
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

In [None]:
for i in df.columns:
    df[i]=df[i]-df[i].mean()
    df[i]=df[i]/df[i].var()

In [None]:
X=df.drop(columns='SalePrice')

In [None]:
#
# Scale the dataset; This is very important before you apply PCA
#
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X)

pca = PCA()
#
# Determine transformed features
#
X_train_pca = pca.fit_transform(X)
#
# Determine explained variance using explained_variance_ration_ attribute
#
exp_var_pca = pca.explained_variance_ratio_
#
# Cumulative sum of eigenvalues; This will be used to create step plot
# for visualizing the variance explained by each principal component.
#
cum_sum_eigenvalues = np.cumsum(exp_var_pca)
#
# Create the visualization plot
#
plt.bar(range(0,len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid',label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
pca = PCA(n_components=3)
pca.fit(X)
X_transformed = pca.transform(X)

In [None]:
X_transformed

# Step 8

In [None]:
# Step 8 
# Choose the best reduced algorithm based on metrics (score, time computing, size)
# Explain the business signification for any feature/Factor/Component of this reduced data set

# Step 9

In [None]:
# Step 9
# Go back to step 5 and improve yout metrics !

# Step 10

In [None]:
# Step 10

# find a way to present the work you did (video, notebook, powerpoint presentation, app, ...)