In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import skew
from scipy.stats.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training and test sets from the CSV files next to the notebook.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Summary statistics of the target. Printed explicitly: a notebook only
# renders the value of a cell's last expression, so a bare describe() call
# in the middle of the cell would be silently discarded.
print(train['SalePrice'].describe())
# Distribution of the target after a log1p transform.
sns.distplot(np.log1p(train['SalePrice']))

## Data preprocessing

In [None]:
# Drop the outliers: remove, in place, every training row whose Id is listed.
outliars = [1299, 582, 1191, 524]
outlier_rows = train.loc[train['Id'].isin(outliars)].index
train.drop(outlier_rows, inplace=True)

In [None]:
# Stack the predictor columns (MSSubClass through SaleCondition) of the
# training and test sets so that both receive identical preprocessing.
feature_span = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat((train.loc[:, feature_span], test.loc[:, feature_span]))

# Replace the target with log1p(SalePrice) to make it closer to normal.
# NOTE: this overwrites the column, so re-running the cell applies the
# transform a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Log-transform skewed numeric features: skewness is measured on the training
# rows only (NaNs ignored) and every column above 0.75 is log1p-transformed
# in the combined frame.
is_numeric = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[is_numeric].index
skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = skewness[skewness > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [None]:
# Columns removed from the feature set before encoding.
delete_features = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'LotFrontage', 'MasVnrArea', 'Street', 'Utilities',
]
# Drop them in one call, then expand the remaining non-numeric columns into
# indicator (dummy) columns.
all_data = pd.get_dummies(all_data.drop(delete_features, axis=1))

In [None]:
# Fill every remaining missing value with the mean of its column, computed
# over the combined (train + test) frame.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

In [None]:
# Split the combined frame back into model inputs: the first len(train) rows
# are the training features, the remainder are the test features.
n_train = train.shape[0]
X_train = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]
# Regression target (SalePrice column of the training frame).
y = train['SalePrice']

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under 10-fold cross-validation.

    The estimator is scored on the notebook-level ``X_train`` and ``y`` using
    the ``neg_mean_squared_error`` scorer; the negated scores are flipped back
    to positive MSE before the square root is taken.

    Returns an array holding one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=10
    )
    return np.sqrt(-neg_mse_per_fold)

## A single linear model with L2 regularization

In [None]:
# Candidate L2 penalty strengths, each scored by its mean cross-validated RMSE.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = []
for alpha in alphas:
    cv_ridge.append(rmse_cv(Ridge(alpha = alpha)).mean())
print(cv_ridge)
# The alpha with the smallest mean RMSE is the one to use for the prediction.

In [None]:
# Fit the final ridge model with the alpha that achieved the lowest mean
# cross-validated RMSE in `cv_ridge`, rather than a hard-coded value that can
# drift out of sync with the grid-search results.
best_alpha = alphas[int(np.argmin(cv_ridge))]
clf = Ridge(alpha = best_alpha)
clf.fit(X_train, y)
# Predictions are on the log1p scale of the target.
pred = clf.predict(X_test)

In [None]:
# Invert the log1p transform so the predictions read as sale prices.
predicted_prices = np.expm1(pred)
print(predicted_prices)

In [None]:
# Convert the log1p-scale predictions back to prices and pair them with the
# test Ids.
ridge_prices = np.expm1(pred)
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': ridge_prices})
# Any filename would do; 'submission.csv' is used here.
my_submission.to_csv('submission.csv', index=False)

## L1 regularization

In [None]:
# Candidate L1 penalty strengths; LassoCV chooses among them by
# cross-validation while fitting.
lasso_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01,
                0.03, 0.06, 0.1, 0.3, 0.6, 1]
clf = LassoCV(alphas = lasso_alphas)
clf.fit(X_train, y)
pred = clf.predict(X_test)
# Mean cross-validated RMSE of the lasso model (shown as the cell output).
rmse_cv(clf).mean()

In [None]:
# Pair the test Ids with the predictions mapped back from the log1p scale.
submission_columns = {'Id': test['Id'], 'SalePrice': np.expm1(pred)}
my_submission = pd.DataFrame(submission_columns)
# Any filename would do. This writes to the same 'submission.csv' path as the
# ridge submission cell, so that earlier file is replaced.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Label each fitted coefficient with the feature column it belongs to.
feature_names = X_train.columns
coef = pd.Series(data=clf.coef_, index=feature_names)

In [None]:
# Allow up to 300 rows to be rendered. The fully-qualified option name is used
# because abbreviated keys are resolved by pattern matching and raise
# OptionError when more than one registered option matches.
pd.set_option('display.max_rows', 300)
# The ten most negative coefficients.
coef.sort_values(ascending=True).iloc[:10]

In [None]:
# The ten largest coefficients, in ascending order.
coef.sort_values(ascending=True).iloc[-10:]