In [210]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [211]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [212]:
PATH = "Data/House/"

In [213]:
!ls {PATH}

data_description.txt
[34mhouse-prices-advanced-regression-techniques[m[m
house-prices-advanced-regression-techniques.zip
sample_submission.csv
test.csv
train.csv


### Look at the data

In [214]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False )

In [215]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [216]:
display_all(df_raw.tail().T)

Unnamed: 0,1455,1456,1457,1458,1459
Id,1456,1457,1458,1459,1460
MSSubClass,60,20,70,20,20
MSZoning,RL,RL,RL,RL,RL
LotFrontage,62,85,66,68,75
LotArea,7917,13175,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


In [217]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

1stFlrSF         0.000000
2ndFlrSF         0.000000
3SsnPorch        0.000000
Alley            0.937671
BedroomAbvGr     0.000000
BldgType         0.000000
BsmtCond         0.025342
BsmtExposure     0.026027
BsmtFinSF1       0.000000
BsmtFinSF2       0.000000
BsmtFinType1     0.025342
BsmtFinType2     0.026027
BsmtFullBath     0.000000
BsmtHalfBath     0.000000
BsmtQual         0.025342
BsmtUnfSF        0.000000
CentralAir       0.000000
Condition1       0.000000
Condition2       0.000000
Electrical       0.000685
EnclosedPorch    0.000000
ExterCond        0.000000
ExterQual        0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
Fence            0.807534
FireplaceQu      0.472603
Fireplaces       0.000000
Foundation       0.000000
FullBath         0.000000
Functional       0.000000
GarageArea       0.000000
GarageCars       0.000000
GarageCond       0.055479
GarageFinish     0.055479
GarageQual       0.055479
GarageType       0.055479
GarageYrBlt      0.055479
GrLivArea   

In [218]:
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/house-raw')

In [219]:
df_raw = pd.read_feather('tmp/house-raw')

In [220]:
df, y, nas = proc_df(df_raw, 'SalePrice')

In [221]:
display_all(df_raw.tail().T)

Unnamed: 0,1455,1456,1457,1458,1459
Id,1456,1457,1458,1459,1460
MSSubClass,60,20,70,20,20
MSZoning,RL,RL,RL,RL,RL
LotFrontage,62,85,66,68,75
LotArea,7917,13175,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


We'll replace categories with their numeric codes, handle missing continuous values, and split the dependent variable into a separate variable.

In [222]:
df, y, nas = proc_df(df_raw, 'SalePrice')

In [223]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,1,60,4,65.0,8450,2,0,4,4,1,...,0,0,0,2,2008,9,5,False,False,False
1,2,20,4,80.0,9600,2,0,4,4,1,...,0,0,0,5,2007,9,5,False,False,False
2,3,60,4,68.0,11250,2,0,1,4,1,...,0,0,0,9,2008,9,5,False,False,False
3,4,70,4,60.0,9550,2,0,1,4,1,...,0,0,0,2,2006,9,1,False,False,False
4,5,60,4,84.0,14260,2,0,1,4,1,...,0,0,0,12,2008,9,5,False,False,False


We now have something we can pass to our model, lets race :)

In [224]:
df_x=df.iloc[:,:-1]
df_y=np.log1p(df.iloc[:,-1:])
X_train, X_test, y_train, y_test = train_test_split(df_x.values, df_y.values, test_size=0.33, random_state=42)

## linear regression

In [225]:
reg = LinearRegression().fit(X_train, y_train)

In [226]:
mean_squared_error(y_train,reg.predict(X_train))

0.0024271839437552158

In [227]:
mean_squared_error(y_test,reg.predict(X_test))

0.0024211525061734636

In [228]:
reg.score(X_train, y_train)

0.9014697773244521

## SGD

In [229]:
sdg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000000, tol=1e-6))
sdg.fit(X_train, y_train.reshape((y_train.shape[0],)))

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor(max_iter=1000000, tol=1e-06))])

In [230]:
mean_squared_error(y_train,sdg.predict(X_train))


0.0024532918385968354

In [231]:
mean_squared_error(y_test,sdg.predict(X_test))

0.0024502097126856246

In [232]:
reg.score(X_train, y_train)

0.9014697773244521

## ridge

In [233]:
clf = Ridge(alpha= 0.01)
clf.fit(X_train, y_train)

Ridge(alpha=0.01)

In [234]:
mean_squared_error(y_train,clf.predict(X_train))

0.0024271845230683592

In [235]:
mean_squared_error(y_test,clf.predict(X_test))

0.0024208888392220963

## lasso

In [236]:
lf = Lasso(alpha=0.01)
lf.fit(X_train, y_train)

Lasso(alpha=0.01)

In [237]:
mean_squared_error(y_train,lf.predict(X_train))

0.0030683436859394848

In [238]:
mean_squared_error(y_test,lf.predict(X_test))

0.0025181934296786856

## SVR

In [239]:
sv = SVR(kernel='rbf', C=7, gamma=1e-8)
sv.fit(X_train,y_train.reshape(y_train.shape[0],))

SVR(C=7, gamma=1e-08)

In [240]:
mean_squared_error(y_train,sv.predict(X_train))

0.014387162642419543

In [241]:
mean_squared_error(y_test,sv.predict(X_test))

0.016827059100335835

## poly

In [242]:
svpoly = SVR(kernel='poly', C=1, degree=3)
svpoly.fit(X_train,y_train.reshape(y_train.shape[0],))


SVR(C=1, kernel='poly')

In [243]:
mean_squared_error(y_train,svpoly.predict(X_train))

0.02740683809201351

In [244]:
mean_squared_error(y_test,svpoly.predict(X_test))

0.028792244313770227

# RF

In [245]:
#rf = RandomForestRegressor(n_jobs=-1)i
#%time m.fit(X_train, y_train)
#print_score(m)

# Lets Compare result :)

In [246]:
models = pd.DataFrame({
    'Model': ['linear regression','SGD', 'ridge' ,'lasso','SVR','poly'],
    'Score': [mean_squared_error(y_test,reg.predict(X_test)),
              mean_squared_error(y_test,sdg.predict(X_test)),
              mean_squared_error(y_test,clf.predict(X_test)),
              mean_squared_error(y_test,lf.predict(X_test)),
              mean_squared_error(y_test,sv.predict(X_test)),
              mean_squared_error(y_test,svpoly.predict(X_test))
             ]})
models.sort_values(by='Score', ascending=True)

Unnamed: 0,Model,Score
2,ridge,0.002421
0,linear regression,0.002421
1,SGD,0.00245
3,lasso,0.002518
4,SVR,0.016827
5,poly,0.028792


## Waao  SDG is the best 
#### Hey Kaggler I'm comning :)


In [275]:
test_raw = pd.read_csv(f'{PATH}test.csv', low_memory=False )
tf_x=df.iloc[:,:-1]
tf_x
#df_y=np.log1p(df.iloc[:,-1:])
#X_train, X_test, y_train, y_test = train_test_split(df_x.values, df_y.values, test_size=0.33, random_state=42)
#test_y=np.log1p(test_raw)

#X_train, X_test, y_train, y_test = train_test_split(df_x.values, df_y.values, test_size=0.33, random_state=42)
#test_raw= proc_df(test_raw)

#mytest_data = test_raw.to_numpy()
#mytest_data

#display_all(test_raw.isnull().sum().sort_index()/len(test_raw))
pred = sdg.predict(tf_x)
pred

array([ 0.0109 , -0.01433,  0.01934, ...,  0.00892, -0.04713,  0.03875])

In [276]:
sub = pd.read_csv(f'{PATH}sample_submission.csv')
array_submit = np.concatenate((sub['Id'].to_numpy(dtype=np.int_).reshape((-1, 1)), np.array(sdg.predict(tf_x)).reshape((-1, 1))), axis=1)
array_submit
#sub["SalePrice"] = pred
#sub.to_csv("submission.csv", index=False)
#sub.head()

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1459 and the array at index 1 has size 1460