Fits the model to the full training data, then predicts on the test data and writes the predictions to a submission directory.

**Imports**

In [1]:

import os
import pathlib

import pandas as pd

import sklearn as sk
import sklearn.pipeline

import project_path
import model_wrappers as modelWrapHelp
import preproc_pipes as preProcPipeHelp
import train_pipes as trainPipeHelp



**Configuration variables**

In [2]:
# Input CSV locations and the output folder for this submission.
# The relative ".." segments are resolved against the current working directory.
_RAW_DATA_DIR = os.path.join("..", "..", "raw_data")
TRAIN_PATH = os.path.abspath(os.path.join(_RAW_DATA_DIR, "train.csv"))
TEST_PATH = os.path.abspath(os.path.join(_RAW_DATA_DIR, "test.csv"))

SAVE_FOLDER = os.path.abspath(os.path.join("..", "..", "submissions", "submission_xgboost_a"))

In [3]:
# Keyword arguments for xgboost; handed to the model wrapper through its xgbKwargs argument
BOOST_KWARGS = dict(max_depth=4, n_estimators=140, learning_rate=0.1)

In [4]:
# Columns handed to the k-means clustering step; the same columns are used for PCA.
# NOTE(review): the "_m10" names look like target-encoded columns produced with
# TARG_ENC_MVAL = 10 -- confirm against the target-encoding helper before changing that value.
FEATS_TO_CLUSTER = [
    "OverallQual", "GrLivArea", "TotalBsmtSF", "Neighborhood_m10",
    "BsmtFinSF1", "GarageFinish", "KitchenQual", "GarageArea",
    "SaleCondition_m10", "LotArea", "MoSold", "LotShape",
    "FireplaceQu", "BsmtExposure", "TotRmsAbvGrd", "YearRemodAdd",
    "MSSubClass", "WoodDeckSF", "MasVnrArea",
]
N_KMEANS_CLUSTERS = 2

# Engineered features requested from the feature adder in the training-stage pipeline
EXTRA_FEATS = [
    "BsmtFractUnfurnished",
    "GarageAreaTimesFinish",
    "NumbBsmtBath",
    "NumbBath",
    "OverallQualTimesCond",
    "PoolQualTimesCond",
    "PorchLikeArea",
    "Spaciousness",
    "TotalSFLiv",
    "TotalSFLivOverLotArea",
    "YearSold_Fract",
]


# Columns passed to the target encoder, together with the m-value it is configured with
TARG_ENC_FEATS = [
    "Neighborhood", "Exterior1st", "Condition1", "HouseStyle",
    "GarageType", "Foundation", "LotConfig", "SaleType",
    "RoofMatl", "MSZoning", "SaleCondition", "RoofStyle",
]
TARG_ENC_MVAL = 10

# Base feature list for the model; FEATS extends it with the cluster and PCA columns
TOP_CMI = [
    "TotalSFLiv", "YearBuilt", "OverallQualTimesCond", "BsmtUnfSF",
    "LotArea", "GarageAreaTimesFinish", "BsmtQual", "Neighborhood",
    "Neighborhood_m10", "TotRmsAbvGrd", "PorchLikeArea", "SaleCondition",
    "BsmtFinSF1", "GrLivArea", "KitchenQual", "BsmtExposure",
    "BsmtFinType1", "FireplaceQu", "LandContour", "HeatingQC",
    "MasVnrArea",
]


**Define the features to use**

In [5]:
# Model inputs: the TOP_CMI columns followed by cluster-related and principal-component columns.
# NOTE(review): these extra names are presumably the columns emitted by the k-means / PCA
# pipeline steps (two clusters, one component) -- confirm against train_pipes.
_CLUSTER_FEATS = ["clusterIdx", "cDist_0", "cDist_1", "clusterMinDist"]
_PCA_FEATS = ["pc_0"]
FEATS = [*TOP_CMI, *_CLUSTER_FEATS, *_PCA_FEATS]

**Create our save folder**

In [6]:
# Create the submission folder together with any missing parents;
# an already-existing folder is accepted silently.
_saveDir = pathlib.Path(SAVE_FOLDER)
_saveDir.mkdir(parents=True, exist_ok=True)

**Import data**

In [7]:
# Read the raw training and test tables (in that order) from the configured CSV paths
RAW_TRAIN, RAW_TEST = (pd.read_csv(_csvPath) for _csvPath in (TRAIN_PATH, TEST_PATH))

**Create our pipeline for all processing**

No need to keep the pre-processing and training pipelines separate here, since we're fitting to ALL of the training data.

In [8]:
# Ids of training-set rows known to be outliers [from results in other notebooks]
_outlierIDs = [524, 945, 1299]

# Features that have to be added before any factorisation/cleaning takes place
_preFactorFeats = ["FenceQual", "NumbStoreys_fromHouseStyle"]

# Ordinal encoder configured with the project's standard encoding keys
_ordinalKeys = preProcPipeHelp.getStandardOrdinalEncodeKeys()
_ordEncoder = preProcPipeHelp.OrdinalEncoder(_ordinalKeys)

# Remaining pre-processing transformers, instantiated up front so the step list reads cleanly
_preFactorAdder = preProcPipeHelp.EngFeatureAdder(_preFactorFeats)
_outlierRemover = preProcPipeHelp.RemoveOutliersById(_outlierIDs)
_cateFactorizer = preProcPipeHelp.FactorizeRemainingCateGroups()

# (step name, transformer) pairs for the pre-processing stage, in execution order
_preProcComps = [
    ("Add some pre-factorisation features", _preFactorAdder),
    ("Remove outliers", _outlierRemover),
    ("Replace text values with basic ordinal encoding", _ordEncoder),
    ("Factorise any remaining categorical groups", _cateFactorizer),
]


In [9]:
# Transformers for the training stage: target encoding, PCA and k-means clustering.
# PCA and k-means are both given the FEATS_TO_CLUSTER columns.
_targEncodeTrain = trainPipeHelp.MEncodeMultiple(TARG_ENC_FEATS, mVal=TARG_ENC_MVAL)
_pcaPipe = trainPipeHelp.AddPCA(FEATS_TO_CLUSTER, nComponents=1)
_clusterPipe = trainPipeHelp.AddKMeansClusters(
    FEATS_TO_CLUSTER,
    N_KMEANS_CLUSTERS,
    useMinDist=True,
)
_nanImputer = trainPipeHelp.TransformNumericalNaN()
_extraFeatAdder = preProcPipeHelp.EngFeatureAdder(EXTRA_FEATS)

# (step name, transformer) pairs for the training stage, in execution order
_trainPipeComps = [
    ("Impute NaN values for numerical fields", _nanImputer),
    ("Add remaining engineered features", _extraFeatAdder),
    ("Add target encoded features", _targEncodeTrain),
    ("Add PCA component(s)", _pcaPipe),
    ("Add kmeans cluster information", _clusterPipe),
]


In [10]:
# Chain the pre-processing steps and the training-stage steps into one sklearn pipeline
PROC_PIPELINE = sk.pipeline.Pipeline(steps=[*_preProcComps, *_trainPipeComps])

**Process train and test data**

In [11]:
# Fit every pipeline step on the raw training data and keep the transformed training frame.
PROC_TRAIN = PROC_PIPELINE.fit_transform(RAW_TRAIN)
# Apply the already-fitted steps to the raw test data; nothing is re-fitted here.
# NOTE(review): the recorded output of this cell shows pandas SettingWithCopyWarning for a
# chained `inpFrame[inpField].loc[...] = ...` assignment inside a helper. Such an assignment
# may not modify inpFrame at all -- verify the helper really fills in the values it intends to.
PROC_TEST = PROC_PIPELINE.transform(RAW_TEST)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inpFrame[inpField].loc[ nanFrame.index ] = inpFrame[replField].loc[ nanFrame.index ]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inpFrame[inpField].loc[ nanFrame.index ] = inpFrame[replField].loc[ nanFrame.index ]


**Create a model**

In [12]:
# Build the XGBoost wrapper from the chosen feature list and the configured keyword arguments.
# NOTE(review): logTarget=True presumably makes the wrapper work with log(SalePrice) --
# confirm in model_wrappers.
MODEL = modelWrapHelp.XGBoostWrapper(
    FEATS,
    logTarget=True,
    xgbKwargs=BOOST_KWARGS,
)
MODEL

**Fit the training data**

In [13]:
# Fit on the whole processed training frame (no hold-out split in this notebook).
# NOTE(review): the wrapper is given the full frame, so it presumably picks out the FEATS
# columns and the target column itself -- confirm in model_wrappers.
MODEL.fit(PROC_TRAIN)

**Check we get a sensible error for the training data**

In [14]:
# Scored on the same frame the model was fitted to, so this is only a sanity check of the
# fit and not an estimate of generalisation error.
# NOTE(review): the metric is whatever XGBoostWrapper.score implements -- confirm its definition.
MODEL.score(PROC_TRAIN)

0.06551185895059905

**Generate predictions for test set**

In [15]:
# Predict on the processed test frame, then store the result in a new "SalePrice" column
# of that same frame (the submission cell reads it from there).
_testPredictions = MODEL.predict(PROC_TEST)
PROC_TEST["SalePrice"] = _testPredictions

**Write the predictions to a submission file**

In [16]:
# Build the submission table (Id as index, SalePrice as the only column).
outPath = os.path.join(SAVE_FOLDER, "predictions.csv")
outFrame = PROC_TEST[["Id","SalePrice"]].set_index("Id")

# Validate before writing. The processing pipeline contains a "Remove outliers" step that is
# also applied to the test data, so check that no test rows were lost, that every Id appears
# exactly once, and that every row actually received a prediction.
assert len(outFrame) == len(RAW_TEST), (
    f"submission has {len(outFrame)} rows but the raw test set has {len(RAW_TEST)}"
)
assert outFrame.index.is_unique, "duplicate Id values in the submission"
assert outFrame["SalePrice"].notna().all(), "missing SalePrice predictions in the submission"

# Write the validated table; the Id index becomes the first CSV column.
outFrame.to_csv(outPath)

In [17]:
# Display the submission table as a final visual check of the written predictions
outFrame

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,125745.328125
1462,168103.687500
1463,177132.078125
1464,207296.250000
1465,185117.671875
...,...
2915,90346.320312
2916,77511.054688
2917,169946.453125
2918,124391.210938


**Kaggle Score = 0.13099**