In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from feature_engine import outlier_removers as outr
%matplotlib inline
# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [None]:
dataset=pd.read_csv('train.csv')
dataset = dataset[['OverallQual','GrLivArea','Neighborhood',
                       'ExterQual','KitchenQual','BsmtQual','GarageCars','Street',
                       'TotalBsmtSF','YearBuilt','GarageFinish','GarageArea','MSSubClass','FullBath',
                       'TotRmsAbvGrd','FireplaceQu','YearRemodAdd','Foundation',
                       'Fireplaces','BsmtFinSF1','LotArea','OpenPorchSF','2ndFlrSF','OverallCond',
                       'BsmtExposure','MasVnrType','MSZoning','HalfBath','LotFrontage','SalePrice']]


features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>0 and dataset[feature].dtypes=='O']
## Replace missing value with a new label
def replace_cat_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data
dataset=replace_cat_feature(dataset,features_nan)

## Now lets check for numerical variables the contains missing values
numerical_with_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>0 and dataset[feature].dtypes!='O']
## Replacing the numerical Missing Values
for feature in numerical_with_nan:
## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
## create a new feature to capture nan values. If null replace with 1 else 0
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)

    # set up the capper
capper = outr.OutlierTrimmer(
    distribution='skewed', tail='right', fold=1.5, 
    variables=[feature for feature in dataset.columns if dataset[feature].dtypes!='O'])

# fit the capper
capper.fit(dataset)

# transform the data
dataset= capper.transform(dataset)

dataset = pd.get_dummies(dataset)
import numpy as np
num_features=['LotFrontage', 'LotArea', 'GrLivArea']

for feature in num_features:
    dataset[feature]=np.log(dataset[feature])

feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale]) 
scaler.transform(dataset[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
dataset = pd.concat([dataset[['SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)

y = dataset['SalePrice']
X = dataset.drop(axis=1,columns=['SalePrice'])
#Convert from dataframe to numpy array
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 150, max_depth=None, random_state = 0)
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb #doesn't care about missing values
regressor=xgb.XGBRegressor(n_estimators  = 750,learning_rate = 0.02,max_depth = 3)
xgb = xgb.XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)
from catboost import CatBoostRegressor
cb = CatBoostRegressor(verbose = False)
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, 
                                      min_samples_split=10, loss='huber', random_state =42)
stack_gen = StackingCVRegressor(regressors=(gbr,xgb),
                                meta_regressor=xgb,
                                use_features_in_secondary=True)


#regressor = xgb
regressor.fit(np.array(X_train), np.array(y_train))

y_pred = regressor.predict(np.array(X_test))
np.set_printoptions(precision=2)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('RMSLE(ROOT MEAN SQUEARED LOGARITHMIC ERROR):', np.sqrt(metrics.mean_squared_log_error(y_test, y_pred)))
    

In [23]:
dataset=pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
datasetcopy = test.copy()
dataset = dataset[['OverallQual','GrLivArea','Neighborhood',
                       'ExterQual','KitchenQual','BsmtQual','GarageCars','Street',
                       'TotalBsmtSF','YearBuilt','GarageFinish','GarageArea','MSSubClass','FullBath',
                       'TotRmsAbvGrd','FireplaceQu','YearRemodAdd','Foundation',
                       'Fireplaces','BsmtFinSF1','LotArea','OpenPorchSF','2ndFlrSF','OverallCond',
                       'BsmtExposure','MasVnrType','MSZoning','HalfBath','LotFrontage','SalePrice']]

test = test[['OverallQual','GrLivArea','Neighborhood',
                       'ExterQual','KitchenQual','BsmtQual','GarageCars','Street',
                       'TotalBsmtSF','YearBuilt','GarageFinish','GarageArea','MSSubClass','FullBath',
                       'TotRmsAbvGrd','FireplaceQu','YearRemodAdd','Foundation',
                       'Fireplaces','BsmtFinSF1','LotArea','OpenPorchSF','2ndFlrSF','OverallCond',
                       'BsmtExposure','MasVnrType','MSZoning','HalfBath','LotFrontage']]


features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>0 and dataset[feature].dtypes=='O']
features_nan_test=[feature for feature in test.columns if test[feature].isnull().sum()>0 and test[feature].dtypes=='O']
## Replace missing value with a new label
def replace_cat_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data
dataset=replace_cat_feature(dataset,features_nan)
test=replace_cat_feature(test,features_nan_test)

## Now lets check for numerical variables the contains missing values
numerical_with_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>0 and 
                    dataset[feature].dtypes!='O']
numerical_with_nan_test=[feature for feature in test.columns if test[feature].isnull().sum()>0 and 
                    test[feature].dtypes!='O']
## Replacing the numerical Missing Values
for feature in numerical_with_nan:
## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
## create a new feature to capture nan values. If null replace with 1 else 0
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)
## Replacing the numerical Missing Values
for feature in numerical_with_nan_test:
## We will replace by using median since there are outliers
    median_value=test[feature].median()
## create a new feature to capture nan values. If null replace with 1 else 0
    test[feature+'nan']=np.where(test[feature].isnull(),1,0)
    test[feature].fillna(median_value,inplace=True)

    

In [25]:
test.shape

(1459, 34)

In [26]:
# set up the capper
capper = outr.OutlierTrimmer(
    distribution='skewed', tail='right', fold=1.5, 
    variables=[feature for feature in dataset.columns if dataset[feature].dtypes!='O'])

# fit the capper
capper.fit(dataset)

# transform the data
dataset= capper.transform(dataset)



In [31]:
dataset.shape

(858, 97)

In [32]:
full_df=pd.concat([dataset,test],axis=0)
full_df = pd.get_dummies(full_df)
full_df =full_df.loc[:,~full_df.columns.duplicated()]

dataset=full_df.iloc[:857,:]
test=full_df.iloc[857:,:]



In [33]:
test.shape

(1459, 97)

In [34]:
import numpy as np
num_features=['LotFrontage', 'LotArea', 'GrLivArea']

for feature in num_features:
    dataset[feature]=np.log(dataset[feature])
    test[feature]=np.log(test[feature])

feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale]) 
scaler.transform(dataset[feature_scale])
scaler.transform(test[feature_scale])

# transform the train and test set, and add on the Id and SalePrice variables
dataset = pd.concat([dataset[['SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)

test = pd.concat([pd.DataFrame(scaler.transform(test[feature_scale]), columns=feature_scale)],
                    axis=1)


y = dataset['SalePrice']
X = dataset.drop(axis=1,columns=['SalePrice'])


import xgboost as xgb #doesn't care about missing values
regressor=xgb.XGBRegressor(n_estimators  = 750,learning_rate = 0.02,max_depth = 3)

regressor.fit(np.array(X), np.array(y))

y_pred = regressor.predict(np.array(test))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [35]:
len(y_pred)

1459

In [36]:
datasetcopy['SalePrice'] = y_pred.tolist()
datasetcopy = datasetcopy[['Id','SalePrice']]
datasetcopy.to_csv('XGBoostSubmission.csv', index=False)