<a href="https://colab.research.google.com/github/Torney32/Fall-2022-Junior-Clinic/blob/main/Fall_Junior_Clinic_Final_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#### importing libraries needed ####
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder 
from zlib import crc32
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import date
from google.colab import drive
# Mount Google Drive (Colab only) so the dataset directory used below is reachable
drive.mount('/content/gdrive')

%cd /content/gdrive/MyDrive/Fall\ 2022\ Semester/Machine\ Learning/Zillow-Data

Mounted at /content/gdrive
/content/gdrive/MyDrive/Fall 2022 Semester/Machine Learning/Zillow-Data


In [None]:
#### obtain the data ####
# Transaction files for each year; transactiondate is parsed to datetime on read
train16 = pd.read_csv('Data/train_2016_v2.csv', parse_dates=["transactiondate"])
train17 = pd.read_csv('Data/train_2017.csv', parse_dates=["transactiondate"])

# Property files for each year
properties16 = pd.read_csv('Data/properties_2016.csv')
properties17 = pd.read_csv('Data/properties_2017.csv')

# Left join from the transaction rows: only properties that have a logerror
# (target variable) associated with them end up in the training frames
train16 = train16.merge(properties16, how='left', on='parcelid')
train17 = train17.merge(properties17, how='left', on='parcelid')

# Stack both years into single frames with a fresh 0..n-1 index
train = pd.concat([train16, train17], ignore_index=True)
properties = pd.concat([properties16, properties17], ignore_index=True)

# Rename ParcelId -> parcelid so later merges can use the same key column
sample = pd.read_csv('Data/sample_submission.csv').rename(columns={'ParcelId': 'parcelid'})

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#### analyzing the data ####
# Shapes are (rows/homes, columns/data points). Values from the recorded run:
#   Training Size:(167888, 60)
#   Property Size:(2985217 * 2 = 5970434, 58)
#   Sample Size:(2985217, 7)
print(f"Training Size:{train.shape}")
print(f"Property Size:{properties.shape}")
print(f"Sample Size:{sample.shape}")

Training Size:(167888, 60)
Property Size:(5970434, 58)
Sample Size:(2985217, 7)


In [None]:
# Drop rows in which 75% or more of the columns are NaN
drop_per = 75
# Despite its name, count_of_nan is the minimum number of NON-NaN values a row
# must contain to be kept (that is what `thresh` means in DataFrame.dropna).
count_of_nan =  int(((100-drop_per)/100)*train.shape[1] + 1)
train = train.dropna(axis=0, thresh=count_of_nan)
train.shape # (167854, 60) in the recorded run: only 34 rows met the drop criterion

(167854, 60)

In [None]:
# Target (logerror) for every remaining row, kept for whole-dataset evaluation later
y_all = train["logerror"]

In [None]:
# Decide test-set membership from a stable hash of the identifier
def error_checking(id, split):
    """Return True when `id` hashes into the lowest `split` fraction of the CRC32 range.

    The identifier is converted to a 64-bit integer, hashed with CRC32, and the
    unsigned 32-bit checksum is compared against `split * 2**32`. The same
    identifier therefore always gets the same answer.
    """
    checksum = crc32(np.int64(id)) & 0xffffffff
    cutoff = split * 2**32
    return checksum < cutoff

#Split the dataset by parcelid
def data_split(train, split, parcelid, target="logerror", n_std=2.5):
    """Split `train` into train/test partitions keyed on a hashed identifier.

    Rows whose identifier satisfies `error_checking(value, split)` form the test
    partition; all other rows form the training partition. Training rows whose
    target lies outside mean +/- `n_std` standard deviations are then removed as
    outliers. The test partition is never filtered.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the identifier column and the target column.
    split : float
        Fraction of the hash range assigned to the test partition.
    parcelid : str
        Name of the identifier column that is hashed.
    target : str, default "logerror"
        Name of the target column.
    n_std : float, default 2.5
        Half-width of the accepted target range, in standard deviations.

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test), where the x frames have the target
        column removed.
    """
    # astype(bool) keeps the mask usable with `~` even when `train` is empty
    # (apply on an empty Series yields an object-dtype result).
    for_testing = train[parcelid].apply(lambda id_: error_checking(id_, split)).astype(bool)

    x_train = train.loc[~for_testing]
    x_test = train.loc[for_testing]

    # Outlier bounds are computed from the target of ALL rows passed in (both
    # partitions), as before.
    # NOTE(review): computing them from x_train only would avoid using
    # test-row statistics; left unchanged to keep results reproducible.
    y = train[target]
    centre, spread = y.mean(), y.std()
    log_error_min = centre - n_std * spread
    log_error_max = centre + n_std * spread

    # Strict inequalities on both sides; rows with a NaN target are dropped too.
    train_target = x_train[target]
    x_train = x_train.loc[(train_target > log_error_min) & (train_target < log_error_max)]

    # Create outputs ready to be used
    x_train_op = x_train.drop(columns=target)
    y_train_op = x_train[target]
    x_test_op = x_test.drop(columns=target)
    y_test_op = x_test[target]
    return x_train_op, x_test_op, y_train_op, y_test_op

# Hold out roughly 20% of the rows (chosen by hashing parcelid) as the test set
x_train, x_test, y_train, y_test = data_split(train, 0.2, "parcelid")

In [None]:
# Roughly 80% of instances land in training and 20% in test
print("Training Dataset Shape: " + str(x_train.shape))
print("Test Dataset Shape: " + str(x_test.shape))

Training Dataset Shape: (131617, 59)
Test Dataset Shape: (33607, 59)


In [None]:
# Transformer used in the pipeline to drop unwanted features
class FeatureDropping(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from a DataFrame.

    Parameters
    ----------
    labels : list
        Column labels to drop in `transform`.
    """

    def __init__(self, labels):
        self.labels = labels

    def fit(self, input, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, input):
        """Return a copy of `input` without the configured columns."""
        return input.drop(columns=self.labels)

In [None]:
# Class to create feature of the year and month
class CreateMonthAndYearFeature(BaseEstimator, TransformerMixin):
    """Replace `transactiondate` with a categorical month index.

    The new `transaction_year_month` column counts months starting from January
    of `base_year` (January of the base year -> 1). With the default base year
    of 2016, January 2017 -> 13.

    Parameters
    ----------
    base_year : int, default 2016
        Year whose January maps to month index 1.
    """

    def __init__(self, base_year=2016):
        self.base_year = base_year

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, input):
        """Return a new frame with `transactiondate` replaced by the month index.

        The caller's DataFrame is left unmodified.
        """
        transdate = pd.to_datetime(input['transactiondate']).dt  # Grab transactiondate feature
        year_month = ((transdate.year - self.base_year) * 12 + transdate.month).astype('category')

        # Build a new frame instead of assigning into `input`: writing the column
        # in place would mutate the caller's data (and can raise
        # SettingWithCopyWarning when `input` is a slice of another frame).
        # Dropping first and then assigning keeps the new column last, as before.
        return input.drop(columns=['transactiondate']).assign(transaction_year_month=year_month)

In [None]:
# Calculate the mean absolute error of each model on the input dataset
def evaluate(algorithm, input, y_true):
    """Print the MAE of every fitted model in `algorithm` on (`input`, `y_true`).

    `algorithm` is an iterable of fitted estimators exposing `predict`.
    Nothing is returned; the results are only printed.
    """
    for model in algorithm:
        mae = mean_absolute_error(y_true, model.predict(input))
        print(f"Algorithm: {model}\nMAE: {mae}")

In [None]:
# Display the statistics from cross validation
def stats(algorithm, mae):
    """Print the per-fold MAE values together with their mean and standard deviation.

    `mae` must expose `.mean()` and `.std()` (e.g. a NumPy array of fold scores).
    """
    report = (
        ("Algorithm:", algorithm),
        ("\nMAE:", mae),
        ("\nAverage:", mae.mean()),
        ("\nStandard deviation:", mae.std()),
    )
    for label, value in report:
        print(label, value)
    
# k-fold cross validation with MAE calculation
def cross_validation(algorithm, input, y, cv=10, fit_params=None):
    """Cross-validate every model in `algorithm` and print its MAE statistics.

    Scoring uses "neg_mean_absolute_error", so the scores are negated to report
    positive MAE values. `fit_params` is forwarded unchanged to `cross_val_score`.
    NOTE(review): recent scikit-learn releases rename that argument to `params`;
    confirm against the installed version before upgrading.
    """
    for model in algorithm:
        neg_scores = cross_val_score(
            model,
            input,
            y,
            scoring="neg_mean_absolute_error",
            cv=cv,
            fit_params=fit_params,
        )
        stats(model, -neg_scores)

In [None]:
# Working copies that will be pushed through the pipeline; the originals stay untouched
pipe_train, pipe_test = x_train.copy(), x_test.copy()
# Every row (train and test partitions together) without the target column
all_parcels = train.drop(columns="logerror")

In [None]:
# Data Preparation
# Features to drop: duplicated information or over 75% of the data missing
drop_features = [
    "finishedsquarefeet13",
    "finishedsquarefeet15",
    "finishedfloor1squarefeet",
    "finishedsquarefeet50",
    "storytypeid",
    "architecturalstyletypeid",
    "buildingclasstypeid",
    "typeconstructiontypeid",
    "finishedsquarefeet6",
    "pooltypeid10",
    "pooltypeid7",
    "hashottuborspa",
    "fireplaceflag",
    "threequarterbathnbr",
    "calculatedbathnbr",
    "fullbathcnt",
    "numberofstories",
    "rawcensustractandblock",
    "censustractandblock",
    "finishedsquarefeet12",
    "taxvaluedollarcnt",
    "taxamount",
    "assessmentyear",
    "roomcnt",
    "propertyzoningdesc",
    "regionidneighborhood",
    "regionidzip",
    "taxdelinquencyyear",
    "propertycountylandusecode",
    "regionidcity",
    "parcelid",
    "basementsqft",
    "yardbuildingsqft26",
]

feature_dropping = FeatureDropping(labels=drop_features)

# Creating year and month feature
year_month_create = CreateMonthAndYearFeature()

# Columns treated as categorical and one hot encoded
encoding_features = [
    'transaction_year_month',
    'airconditioningtypeid',
    'buildingqualitytypeid',
    'decktypeid',
    'fips',
    'heatingorsystemtypeid',
    'pooltypeid2',
    'propertylandusetypeid',
    'regionidcounty',
    'taxdelinquencyflag',
]

# Categories not seen during fit are ignored at transform time; every column
# not listed above is passed through unchanged after the encoded columns.
feature_encoder = ColumnTransformer(
    transformers=[("ohe_cats", OneHotEncoder(handle_unknown='ignore'), encoding_features)],
    remainder='passthrough',
)

In [None]:
# Assemble the preparation steps: drop columns -> month index -> one hot encode
data_processor = Pipeline(steps=[
    ('feature_dropping', feature_dropping),
    ('year_month_creator', year_month_create),
    ('feature_encoder', feature_encoder),
])

# Fit on the training partition only, then apply the fitted pipeline to every frame.
# The three names are rebound from DataFrames to the transformed matrices, so
# re-running this cell requires re-running the cell that creates them first.
data_pipeline = data_processor.fit(pipe_train)
pipe_train, pipe_test, all_parcels = (
    data_processor.transform(frame) for frame in (pipe_train, pipe_test, all_parcels)
)

In [None]:
# Initialize XGBoost and AdaBoost Models with the same hyper-parameters
params = dict(
    learning_rate=0.3,    # shrinks the contribution of each boosting round
    n_estimators=10000,   # maximum number of boosting rounds
    random_state=42,      # seeds the estimator's own randomness for repeatable runs (does not control CV splits)
)

xgb_base = xgb.XGBRegressor(**params)
ada_base = AdaBoostRegressor(**params)

In [None]:
# XGBoost fit with early stopping monitored on a validation split of the TRAINING data.
# Using the test set as eval_set would let it choose the stopping round, which
# leaks test information into the model and biases the later test-set MAE.
# The model is therefore fitted on 90% of the training rows; the remaining 10%
# only drive early stopping.
x_fit, x_val, y_fit, y_val = train_test_split(pipe_train, y_train, test_size=0.1, random_state=42)

fit_params={'early_stopping_rounds': 10, 
            'eval_metric': 'mae',
            'eval_set': [[x_val, y_val]]}

xgb_base.fit(x_fit, y_fit, **fit_params)

[0]	validation_0-mae:0.352861
Will train until validation_0-mae hasn't improved in 10 rounds.
[1]	validation_0-mae:0.254539
[2]	validation_0-mae:0.187748
[3]	validation_0-mae:0.142998
[4]	validation_0-mae:0.113697
[5]	validation_0-mae:0.095101
[6]	validation_0-mae:0.083761
[7]	validation_0-mae:0.077169
[8]	validation_0-mae:0.073505
[9]	validation_0-mae:0.0715
[10]	validation_0-mae:0.070416
[11]	validation_0-mae:0.069811
[12]	validation_0-mae:0.069494
[13]	validation_0-mae:0.069292
[14]	validation_0-mae:0.069173
[15]	validation_0-mae:0.069108
[16]	validation_0-mae:0.069061
[17]	validation_0-mae:0.069028
[18]	validation_0-mae:0.069007
[19]	validation_0-mae:0.068982
[20]	validation_0-mae:0.068978
[21]	validation_0-mae:0.068975
[22]	validation_0-mae:0.068971
[23]	validation_0-mae:0.068967
[24]	validation_0-mae:0.068958
[25]	validation_0-mae:0.068951
[26]	validation_0-mae:0.068926
[27]	validation_0-mae:0.068925
[28]	validation_0-mae:0.068921
[29]	validation_0-mae:0.068917
[30]	validation_0-

XGBRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)

In [None]:
# Locate NaNs in the transformed training matrix. One-hot encoded columns only
# hold 0/1, so any NaN comes from the columns passed through unchanged.

# Number of stored cells that are NaN (printed so the value is not silently discarded)
nan_cell_count = int(np.isnan(pipe_train.data).sum())
print(f"NaN cells in training matrix: {nan_cell_count}")

# Column indices whose sum is NaN, i.e. columns containing at least one NaN
np.where(np.isnan(pipe_train.sum(axis=0)))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([81, 82, 83, 84, 87, 88, 89, 90, 91, 92, 93, 94]))

In [None]:
# Dense versions of the data for AdaBoost, restricted to the leading columns so
# that no NaN-containing column is included.
# `.toarray()` returns a plain ndarray; `.todense()` returns np.matrix, which
# newer scikit-learn versions refuse as estimator input.
# NOTE(review): in the recorded run the NaN columns were 81-84 and 87-94, so this
# cut-off also discards the NaN-free columns 80, 85 and 86 - confirm that is intended.
ADA_FEATURE_COUNT = 80

x_train_ada = pipe_train.toarray()[:, :ADA_FEATURE_COUNT]
x_test_ada = pipe_test.toarray()[:, :ADA_FEATURE_COUNT]
x_all_ada = all_parcels.toarray()[:, :ADA_FEATURE_COUNT]

In [None]:
# Fit AdaBoost on the dense, column-restricted training matrix

ada_base.fit(x_train_ada, y_train)



AdaBoostRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)

In [None]:
# XGBoost cross val using all data
# NOTE(review): eval_set below is the full dataset, so the early-stopping monitor in
# every fold includes that fold's held-out rows; the reported MAE is likely optimistic.

fit_params = dict(
    early_stopping_rounds=10,
    eval_metric='mae',
    verbose=False,
    eval_set=[[all_parcels, y_all]],
)

cross_validation([xgb_base], all_parcels, y_all, cv=10, fit_params=fit_params)

Algorithm: XGBRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)

MAE: [0.0741531  0.07209707 0.06789544 0.06539838 0.06698872 0.07131061
 0.07040414 0.06955762 0.06622741 0.07319263]

Average: 0.06972251219275746

Standard deviation: 0.002864436272882789


In [None]:
# XGBoost cross val on the dense, column-restricted data that AdaBoost uses,
# so both models can be compared on identical inputs
# NOTE(review): eval_set below is the full dataset, so the early-stopping monitor in
# every fold includes that fold's held-out rows; the reported MAE is likely optimistic.

fit_params = dict(
    early_stopping_rounds=10,
    eval_metric='mae',
    verbose=False,
    eval_set=[[x_all_ada, y_all]],
)

cross_validation([xgb_base], x_all_ada, y_all, cv=10, fit_params=fit_params)

Algorithm: XGBRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)

MAE: [0.0738554  0.07075051 0.06710109 0.06514223 0.06661816 0.07123644
 0.07020288 0.06999901 0.06644837 0.07782619]

Average: 0.06991802902481456

Standard deviation: 0.0036637676855216027


In [None]:
### AdaBoost Cross Val of the entire dataset
# 10-fold CV on the dense matrix; no fit_params are passed to AdaBoost

cross_validation([ada_base], x_all_ada, y_all, cv=10)



Algorithm: AdaBoostRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)

MAE: [0.08959906 0.07928853 0.08062522 0.08232769 0.08222798 0.13087609
 0.12893187 0.07930143 0.08504402 0.07865351]

Average: 0.09168753875524668

Standard deviation: 0.019359448945422467




In [None]:
# XGBoost assessment of the MAE on the held-out test set, using the already fitted model

evaluate([xgb_base], pipe_test, y_test)

Algorithm: XGBRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)
MAE: 0.06887246418322239


In [None]:
# AdaBoost assessment of the MAE on the held-out test set (dense, column-restricted inputs)

evaluate([ada_base], x_test_ada, y_test)



Algorithm: AdaBoostRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)
MAE: 0.07079247081225327


In [None]:
# XGBoost assessment of the MAE for the entire dataset
# (all_parcels contains the training rows as well, so this figure is partly in-sample)

evaluate([xgb_base], all_parcels, y_all)

Algorithm: XGBRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)
MAE: 0.06811895784601586


In [None]:
# AdaBoost assessment of the MAE for the entire dataset
# (x_all_ada contains the training rows as well, so this figure is partly in-sample)

evaluate([ada_base], x_all_ada, y_all)



Algorithm: AdaBoostRegressor(learning_rate=0.3, n_estimators=10000, random_state=42)
MAE: 0.07019324500598069
