# Main Notebook

In [1]:
# Standard Imports
from utils import *

%matplotlib inline

## Import and Explore Data

In [2]:
# Import data (important: y_train has to be a Series object)
X_train_full = pd.read_csv("data/X_train.csv", index_col=0)
y_train_full = pd.read_csv("data/y_train.csv", index_col=0)['Price']
X_test = pd.read_csv("data/X_test.csv", index_col=0)



In [3]:
# Merge new features
def merge_features(X,f,colname):
    if colname in X:
        return X
    df = pd.read_csv(f)
    if len(df.columns) == 2:
        df.columns = ['Id',colname]
    df.drop(columns=["Id"],inplace=True)
    return X.merge(df,left_index=True,right_index=True,how="left")

for f in glob.glob("features/*_train.csv"):
    colname = os.path.basename(f).split("_")[0]
    X_train_full = merge_features(X_train_full,f,colname)

for f in glob.glob("features/*_test.csv"):
    colname = os.path.basename(f).split("_")[0]
    X_test = merge_features(X_test,f,colname)

# reorder test columns in same order as train columns
X_test = X_test[X_train_full.columns]

#convert Date to numerical

X_train_full["Date"] = pd.to_datetime(X_train_full['Date'])
X_train_full["Date"] = X_train_full["Date"].map(datetime.datetime.toordinal)
X_test["Date"] = pd.to_datetime(X_test['Date'])
X_test["Date"] = X_test["Date"].map(datetime.datetime.toordinal)


In [4]:
# Build a validation set
X_train, X_validation, y_train, y_validation = train_test_split(X_train_full, y_train_full, test_size=0.20, random_state=34243)

In [5]:
# X_validation

In [6]:
# Cut data for simplicity
#X_train = X_train.iloc[:1000,:]
#y_train = y_train[:1000]

In [7]:
# X_train.head()

### select_columns

In [8]:
X_train.columns

Index(['Date', 'Lat', 'long', 'PropertyTy', 'Area', 'postCode', 'City',
       'Neighborhood ', 'PropertyTy.1', 'transYear', 'transMonth', 'transDays',
       'transQtr', 'Rooms', 'Baths', 'Parking', 'HasFirepla', 'HasPool',
       'HasGarage', 'HasAirCond', 'cbc', 'trams', 'waterfront',
       'ANDERSON LINK ROAD', 'BALLARAT ROAD', 'BASS HIGHWAY',
       'BAXTER-TOORADIN ROAD', 'CALDER FREEWAY', 'DEER PARK BYPASS',
       'EASTERN FREEWAY', 'EASTLINK', 'GEELONG RING ROAD', 'GEELONG ROAD',
       'GRAHAM STREET', 'HUME FREEWAY', 'MALMSBURY BYPASS', 'MCCARTIN STREET',
       'MCKENZIE STREET', 'METROPOLITAN RING ROAD', 'MONASH FREEWAY',
       'MORNINGTON PENINSULA ROUTE', 'PHILLIP ISLAND ROAD', 'PRINCES FREEWAY',
       'PRINCES FREEWAY.1', 'RACECOURSE ROAD', 'SMITHFIELD ROAD',
       'OUTH GIPPSLAND HIGHWAY', 'SOUTH GIPPSLAND HIGHWAY', 'SPRINGVALE ROAD',
       'STRZELECKI HIGHWAY', 'TULLAMARINE FREEWAY', 'WEST GATE FREEWAY',
       'WESTERN FREEWAY', 'WESTERN LINK TOLLWAY', 'WESTERN 

In [9]:
X_train.columns[10:].append(X_train.columns[[0,4]])

Index(['transMonth', 'transDays', 'transQtr', 'Rooms', 'Baths', 'Parking',
       'HasFirepla', 'HasPool', 'HasGarage', 'HasAirCond', 'cbc', 'trams',
       'waterfront', 'ANDERSON LINK ROAD', 'BALLARAT ROAD', 'BASS HIGHWAY',
       'BAXTER-TOORADIN ROAD', 'CALDER FREEWAY', 'DEER PARK BYPASS',
       'EASTERN FREEWAY', 'EASTLINK', 'GEELONG RING ROAD', 'GEELONG ROAD',
       'GRAHAM STREET', 'HUME FREEWAY', 'MALMSBURY BYPASS', 'MCCARTIN STREET',
       'MCKENZIE STREET', 'METROPOLITAN RING ROAD', 'MONASH FREEWAY',
       'MORNINGTON PENINSULA ROUTE', 'PHILLIP ISLAND ROAD', 'PRINCES FREEWAY',
       'PRINCES FREEWAY.1', 'RACECOURSE ROAD', 'SMITHFIELD ROAD',
       'OUTH GIPPSLAND HIGHWAY', 'SOUTH GIPPSLAND HIGHWAY', 'SPRINGVALE ROAD',
       'STRZELECKI HIGHWAY', 'TULLAMARINE FREEWAY', 'WEST GATE FREEWAY',
       'WESTERN FREEWAY', 'WESTERN LINK TOLLWAY', 'WESTERN RING ROAD',
       'WHITE ROAD', 'WONTHAGGI ROAD', 'BURNLEY TUNNEL',
       'SOUTHERN LINK TOLLWAY', 'busses', 'Date', 'Area'

In [10]:
# Save numerical and categorical columns
def select_columns(df):
    #num_cols = ["Area", "transDays", "Rooms", "Baths", "Parking", 
    #"HasFirepla", "HasPool", "HasGarage", "HasAirCond"]
    # numerical_columns = df.columns[10:]
    numerical_columns = df.columns[10:].append(df.columns[[0,4]])
    categorical_columns = df.columns[[8]]
    return numerical_columns, categorical_columns

In [11]:
numerical_columns, categorical_columns = select_columns(X_train)

In [12]:
# Print input data
print(select_columns(X_train))

(Index(['transMonth', 'transDays', 'transQtr', 'Rooms', 'Baths', 'Parking',
       'HasFirepla', 'HasPool', 'HasGarage', 'HasAirCond', 'cbc', 'trams',
       'waterfront', 'ANDERSON LINK ROAD', 'BALLARAT ROAD', 'BASS HIGHWAY',
       'BAXTER-TOORADIN ROAD', 'CALDER FREEWAY', 'DEER PARK BYPASS',
       'EASTERN FREEWAY', 'EASTLINK', 'GEELONG RING ROAD', 'GEELONG ROAD',
       'GRAHAM STREET', 'HUME FREEWAY', 'MALMSBURY BYPASS', 'MCCARTIN STREET',
       'MCKENZIE STREET', 'METROPOLITAN RING ROAD', 'MONASH FREEWAY',
       'MORNINGTON PENINSULA ROUTE', 'PHILLIP ISLAND ROAD', 'PRINCES FREEWAY',
       'PRINCES FREEWAY.1', 'RACECOURSE ROAD', 'SMITHFIELD ROAD',
       'OUTH GIPPSLAND HIGHWAY', 'SOUTH GIPPSLAND HIGHWAY', 'SPRINGVALE ROAD',
       'STRZELECKI HIGHWAY', 'TULLAMARINE FREEWAY', 'WEST GATE FREEWAY',
       'WESTERN FREEWAY', 'WESTERN LINK TOLLWAY', 'WESTERN RING ROAD',
       'WHITE ROAD', 'WONTHAGGI ROAD', 'BURNLEY TUNNEL',
       'SOUTHERN LINK TOLLWAY', 'busses', 'Date', 'Area

### plot_numerical_columns

In [13]:
# Plot distribution  numerical columns 
def plot_numerical_columns(df, numerical_columns):
    fig = plt.figure(figsize = (20,10))
    ax = fig.gca()
    df[numerical_columns].hist(ax = ax)
    plt.show()

In [14]:
#debug
# X_train

In [15]:
#plot_numerical_columns(X_train, numerical_columns)

## Preprocessing

### check_missing_values

In [16]:
def check_missing_values(df):
    print(df.isnull().sum() * 100 / len(df))

In [17]:
# Check missing values
check_missing_values(X_train)

Date                           0.000000
Lat                            0.000000
long                           0.000000
PropertyTy                     0.000000
Area                           0.000000
postCode                       0.000000
City                           0.000000
Neighborhood                   0.000000
PropertyTy.1                   0.000000
transYear                      0.000000
transMonth                     0.000000
transDays                      0.000000
transQtr                       0.000000
Rooms                         39.994679
Baths                         40.043456
Parking                       44.986254
HasFirepla                    40.010938
HasPool                       40.219351
HasGarage                     39.852780
HasAirCond                    39.829131
cbc                            0.000000
trams                          0.000000
waterfront                     0.000000
ANDERSON LINK ROAD             0.000000
BALLARAT ROAD                  0.000000


In [18]:
# Inputer for numerical variables
num = Pipeline(steps=[
    ('imp', IterativeImputer(estimator=BayesianRidge())),
    ('ss', StandardScaler())
    ])

In [19]:
# One Hot Encoder for categorical data
cat = Pipeline(steps=[
    ('ohe', OneHotEncoder())
    ])

In [20]:
# Preprocess column transformer for preprocessing data
preprocess = ColumnTransformer(
                    transformers=[
                        ('num', num, numerical_columns),
                        ('cat', cat, categorical_columns),
                    ])

## Model

In [21]:
# Set model
model = LGBMRegressor(n_jobs = -1)

In [22]:
# Select parameters to explore
param_grid = {'model__learning_rate': [0.1,0.2],
        'model__num_leaves': [20, 31, 51],
        'model__reg_lambda': [1.0, 2.0],
        'model__max_depth': [31, 51], 
        'model__n_jobs': [-1], 
        'model__n_estimators': [100, 200],
        'model__boosting_type': ["dart"]}

## Fit Predict

In [23]:
# Set cross-validation
cv = KFold()

In [24]:
# Save pipeline
final_pipeline = GridSearchCV(Pipeline(steps=[
        ('preprocess', preprocess),
        ('model', model)
    ]), param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=3)

In [25]:
# Fit pipeline
final_pipeline.fit(X_train, y_train)

# Test pipeline
y_train_hat = final_pipeline.best_estimator_.predict(X_train)
train_rmse = mean_squared_error(y_train, y_train_hat, squared=False)
print('RMSE on training data :', train_rmse)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  3.0min finished


RMSE on training data : 129933.2620616373


In [26]:
# Compute alternative score
alternative_score = np.mean(abs((y_train_hat - y_train) / y_train))
print(alternative_score)

0.12891400340889167


In [27]:
# Validation set
y_validation_hat = final_pipeline.best_estimator_.predict(X_validation)
validation_rmse = mean_squared_error(y_validation, y_validation_hat, squared=False)
print('RMSE on validation data :', validation_rmse)

RMSE on validation data : 167538.1190553986


Validation score: 167538.11

Validation score with neighbourhoods: 167842

Validation score with Long/Lat: 167863.954 


In [28]:
# Predict test values
y_test_hat = final_pipeline.best_estimator_.predict(X_test)

In [29]:
y_test = pd.DataFrame(y_test_hat, columns = ["Predicted"])
y_test['Id'] = y_test.index
y_test = y_test[["Id", "Predicted"]]
y_test.to_csv("predictions/y_test_hat.csv", index = False)

In [30]:
final_pipeline.best_estimator_

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imp',
                                                                   IterativeImputer(estimator=BayesianRidge())),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  Index(['transMonth', 'transDays', 'transQtr', 'Rooms', 'Baths', 'Parking',
       'HasFirepla', 'HasPool', 'HasGarage', 'HasAirCond', 'cbc', 'trams',
       'waterfront', 'ANDERSON LINK ROAD', 'BALLARAT ROAD', 'BASS HIGH...
       'WESTERN FREEWAY', 'WESTERN LINK TOLLWAY', 'WESTERN RING ROAD',
       'WHITE ROAD', 'WONTHAGGI ROAD', 'BURNLEY TUNNEL',
       'SOUTHERN LINK TOLLWAY', 'busses', 'Date', 'Area'],
      dtype='object')),
                                                 ('cat',
                      

In [31]:
final_pipeline.best_estimator_["model"].feature_importances_

array([  37,  164,    1,  465,  605,  369,  144,  107,  134,   59,  152,
        263,  751,   48,   66,   55,   48,   96,   76,  271,  115,  138,
         40,   38,  155,   37,   30,   23,  243,  359,  209,   50,  104,
         75,   76,   47,   79,   83,  236,   25,   59,  131,   51,   71,
        139,   20,   31,  204,  699,  255,  155, 1693,  410,    9],
      dtype=int32)

### Save model

In [34]:
dump(final_pipeline.best_estimator_, "out/models/simple_model.joblib")

['out/models/simple_model.joblib']

## Local Model

In [32]:
class LocalModel():
    """
    Transforms global model to local

    Attributes
    ----------
    model : float
        model to make local
    preprocess : float
        pipeline for preprocessing
    pcent_keep : int
        percentage of observations to keep (eg. pcent_keep=10 keep closest 10%)
    weight_fun : str
        weighting function to apply on the weights, default is 'quadratic'

    Properties
    ----------
    fit_predict : fits and predicts
    """

    def __init__(self, model, preprocess, pcent_keep=10, weight_fun='quadratic'):
        """Initialize"""
        self.model = model
        self.preprocess = preprocess
        self.pcent_keep = pcent_keep
        if weight_fun == 'quadratic':
            self.weight_fun = lambda x: x**2
        elif weight_fun == 'cubic':
            self.weight_fun = lambda x: x**3

    def get_distances(self, X_train, centroid):
        """Compute distances between coordinates of database and point"""
        compute_dist = lambda house : geopy.distance.distance((house.Lat,house.long),(centroid.Lat[0],centroid.long[0])).m
        distances = X_train[['Lat', 'long']].apply(compute_dist, axis=1)
        return pd.DataFrame(distances)

    def compute_centroid(self, dfx):
        """Compute centroid"""
        c = shapely.geometry.MultiPoint(list(dfx.apply(lambda h : Point((h.long,h.Lat)),axis=1))).centroid
        return pd.DataFrame({'Lat':[c.y],'long':[c.x]})

    def sample_selection(self, X_train_clean, y_train, weights, distances):
        """Sample selection"""
        percentile = np.percentile(distances, self.pcent_keep)
        selected = ((distances<percentile) & (distances>0)).to_numpy().flatten()
        X = X_train_clean[selected,:]
        y = y_train[selected]
        weights = sample_weight=weights[selected]
        return X, y, weights.flatten()

    def local_prediction(self, X_train, y_train, X_test, idx, X_train_clean, X_test_clean):
        """Get Local Prediction"""
        centroid = self.compute_centroid(X_test.loc[idx, ['Lat', 'long']])
        distances = self.get_distances(X_train, centroid)
        linear_weights = np.maximum(1 - MinMaxScaler().fit_transform(distances), 0)
        non_linear_weights = np.array([self.weight_fun(w) for w in linear_weights])
        X, y, weights = self.sample_selection(X_train_clean, y_train, non_linear_weights, distances)
        # Fit predict
        self.model.fit(X, y, sample_weight=weights)
        prediction = self.model.predict(X_test_clean[idx, :])
        return prediction

    def fit_predict(self, X_train, y_train, X_test):
        """Fit Predict"""
        # Preprocess
        X_train_clean = self.preprocess.fit_transform(X_train)
        X_test_clean = self.preprocess.fit_transform(X_test)
        y_test_hat = np.zeros(len(X_test))
        # For each point in the test set, get weights, select sample
        hoods = X_test['Neighborhood '].unique()
        for i, hood in enumerate(hoods):
            idx = (X_test['Neighborhood ']==hood).to_numpy()
            y_test_hat[idx] = self.local_prediction(X_train, y_train, X_test, idx, X_train_clean, X_test_clean)
            # Print progress
            sys.stdout.write("\rProgress: {0}%".format((float(i)/len(hoods))*100))
            sys.stdout.flush()
        return y_test_hat

In [None]:
# X_train = X_train_full
# y_train = y_train_full

# X_train.shape

In [35]:
local_model = LocalModel(final_pipeline.best_estimator_["model"], preprocess)
y_test_hat = local_model.fit_predict(X_train_full, y_train_full, X_test)

Progress: 98.75%99999999999%%

In [36]:
# Validation set
#y_validation_hat = final_pipeline.best_estimator_.predict(X_validation)
#validation_rmse = mean_squared_error(y_validation, y_validation_hat, squared=False)
#print('RMSE on validation data :', validation_rmse)

In [33]:
y_test = pd.DataFrame(y_test_hat, columns = ["Predicted"])
y_test['Id'] = y_test.index
y_test = y_test[["Id", "Predicted"]]
y_test.to_csv("predictions/y_test_hat_local.csv", index = False)

### Save model

In [38]:
dump(local_model, "out/models/full_model.joblib")

PicklingError: Can't pickle <function LocalModel.__init__.<locals>.<lambda> at 0x7fc15d71c280>: it's not found as __main__.LocalModel.__init__.<locals>.<lambda>