<a href="https://colab.research.google.com/github/SHeidema/AMES-TRIDSA/blob/main/SamplingAndPerformance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Make sure the required modules are installed before running the code cells below; the next cell installs `vegafusion`.


In [None]:
pip install vegafusion

Collecting vegafusion
  Downloading vegafusion-1.3.0-py3-none-any.whl (23 kB)
Installing collected packages: vegafusion
Successfully installed vegafusion-1.3.0


In [None]:
#@title Code adjusted from https://github.com/EAISI/machine-learning-with-python-explainers/blob/main/example-solutions/ames-housing-daniel.ipynb
#pip install vegafusion

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import vegafusion as vf
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils import estimator_html_repr
from xgboost import XGBRegressor

# check versions
#for library in [pd, sklearn, vf]:
#    print(f"using {library.__name__} version {library.__version__}")


# dataset: CSV file with the Ames housing data, fetched directly from GitHub
URL = "https://github.com/jads-nl/discover-projects/blob/main/ames-housing/AmesHousing.csv?raw=true"

# fill-value for missings in categorical variables
# MISSING is the constant the categorical imputer fills in (and the extra
# category added for the one-hot encoder); NONE replaces NaN in the columns
# listed in na_means_none().
MISSING = "missing"
NONE = "not present"

def standardize_column_names(s):
    """Return the column name *s* with every space character removed."""
    # Splitting on the literal space and re-joining drops spaces only;
    # other whitespace (tabs, newlines) is left untouched.
    return "".join(s.split(" "))


def na_means_none(df):
    """Replace NaN with the NONE marker in a fixed set of columns.

    The listed columns are filled with the module-level ``NONE`` value
    ("not present"); all other columns are left as they are. The frame is
    modified in place and also returned so the function can be used with
    ``DataFrame.pipe``.
    """
    # Must stay a list: a tuple would be read by .loc as a multi-axis key.
    absent_feature_cols = [
        "Alley",
        "BsmtQual",
        "BsmtCond",
        "BsmtFinType1",
        "BsmtFinType2",
        "FireplaceQu",
        "GarageType",
        "GarageFinish",
        "GarageQual",
        "GarageCond",
        "PoolQC",
        "Fence",
        "MiscFeature",
    ]

    filled = df.loc[:, absent_feature_cols].fillna(value=NONE)
    df.loc[:, absent_feature_cols] = filled
    return df


def optimize_memory(df):
    """Shrink the memory footprint of *df* by narrowing column dtypes.

    Object columns become ``category``; integer columns are downcast to the
    smallest unsigned integer type that fits and float columns to the
    smallest float type. The frame is modified in place and returned.
    """
    # objects to categorical
    object_frame = df.select_dtypes(include="object")
    df[object_frame.columns] = object_frame.astype("category")

    # dtype family selected -> `downcast` argument handed to pd.to_numeric
    downcast_for = {"integer": "unsigned", "float": "float"}
    for family, target in downcast_for.items():
        for name in df.select_dtypes(include=family).columns:
            df[name] = pd.to_numeric(df[name], downcast=target)

    return df


# Load the CSV, strip spaces from the column names, then fill the
# "NaN means not present" columns and narrow the dtypes.
df = pd.read_csv(URL).rename(columns=standardize_column_names)
df = optimize_memory(na_means_none(df))

# Rows with GrLivArea of 4000 or more are left out of the modelling data.
df_no_outliers = df.loc[df["GrLivArea"] < 4000]


def na_per_columns(df):
    """Count nulls per column, keeping only columns that have at least one.

    Returns a Series indexed by column name, sorted from most to fewest
    nulls.
    """
    missing_counts = df.isna().sum()
    # Counts are never negative, so "> 0" selects exactly the non-zero ones.
    has_missing = missing_counts > 0
    return missing_counts[has_missing].sort_values(ascending=False)


# Null counts for every column of the full dataset that has missing values.
cols_with_nulls = na_per_columns(df)

# Columns in which more than 20% of the rows are null are excluded from the
# feature set below.
cols_to_drop = cols_with_nulls.loc[cols_with_nulls / len(df) > 0.2].index
# Bare expression: shows the dropped columns' null counts when evaluated
# interactively, and has no effect otherwise.
cols_with_nulls.loc[cols_to_drop]

# Model training and evaluation
# NOTE(review): no random_state is passed, so the split differs on every run.
df_train, df_test = train_test_split(df_no_outliers, test_size=0.3)

# Features: every column that is neither dropped nor the target.
# Target: SalePrice as-is (no transformation is applied).
X = df_train.loc[:, df_train.columns.difference(cols_to_drop).drop("SalePrice")]
y = df_train["SalePrice"]

# same for test set
X_test = df_test.loc[:, df_test.columns.difference(cols_to_drop).drop("SalePrice")]
y_test = df_test["SalePrice"]

cat_cols = X.select_dtypes(include="category").columns
num_cols = X.select_dtypes(include="number").columns

# The encoder needs the category list of each column spelled out, taken from
# the full dataset, plus the MISSING value that the imputer may insert.
categories = [df[col].cat.categories.to_list() + [MISSING] for col in cat_cols]

# categorical columns: constant-fill the gaps, then one-hot encode
preprocess_cat_cols = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=MISSING),
    OneHotEncoder(categories=categories),
)

# numerical columns: median-fill the gaps, then standardize
preprocess_num_cols = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy="median"),
    StandardScaler(),
)

# Route each column group through its own pipeline; anything else is dropped.
prepare_linear = make_column_transformer(
    (preprocess_num_cols, num_cols),
    (preprocess_cat_cols, cat_cols),
    remainder="drop",
)

ols = make_pipeline(prepare_linear, LinearRegression())

# Keep the fitted estimators and the training scores: get_best_model() reads
# both from the result.
ols_scores = cross_validate(
    ols,
    X,
    y,
    scoring=["neg_mean_squared_error"],
    cv=5,
    return_estimator=True,
    return_train_score=True,
)

def get_best_model(cv_scores, X_eval=None, y_eval=None):
    """
    Return the most conservative model from a cross_validate result.

    "Most conservative" is the fold estimator with the largest training
    RMSE, located with np.argmax. That estimator is then scored on a
    hold-out set.

    The score is the plain root mean squared error of the predictions: no
    log transform is applied here, so it is on the same scale as the target.

    Args:
        cv_scores: Mapping returned by cross_validate. It must contain the
            keys "train_neg_mean_squared_error" and "estimator".
        X_eval: Hold-out features. Defaults to the module-level X_test,
            looked up when the function is called.
        y_eval: Hold-out targets. Defaults to the module-level y_test,
            looked up when the function is called.

    Returns:
        Tuple (model, rmse) with the selected estimator and its hold-out RMSE.
    """
    # Fall back to the global hold-out set so existing one-argument calls
    # behave exactly as before.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    # Scores are stored negated (higher is better), hence the sign flip.
    train_rmse = np.sqrt(-cv_scores["train_neg_mean_squared_error"])
    index = np.argmax(train_rmse)
    model = cv_scores["estimator"][index]
    rmse = np.sqrt(mean_squared_error(y_eval, model.predict(X_eval)))
    return (model, rmse)


# The score returned by get_best_model is the square root of the mean squared
# error on the untransformed SalePrice, i.e. RMSE, so it is labelled as such
# (the variable name is kept unchanged for any later code that reads it).
ols_best_model, ols_rmsle = get_best_model(ols_scores)
print(f"RMSE found after running first block: {ols_rmsle:0.5f}")

### Stratified sampling of the train and test data

In [None]:
# train_test_split stratifies on class labels, so the continuous SalePrice is
# binned first. The bin edges are its 0th, 10th, ..., 90th percentiles, which
# makes np.digitize assign each row a decile label from 1 to 10.
sale_prices = df_no_outliers["SalePrice"].values
y_binned = np.digitize(sale_prices, np.percentile(sale_prices, range(0, 100, 10)))

# Stratified sampling by using the stratify= argument
# NOTE(review): no random_state is passed, so the split differs on every run.
df_train, df_test = train_test_split(df_no_outliers, test_size=0.3, stratify=y_binned)

# Rebuild features and target from the new split; these overwrite the
# module-level X, y, X_test and y_test that get_best_model() reads.
X = df_train[df_train.columns.difference(cols_to_drop).drop("SalePrice")]
y = df_train.SalePrice

X_test = df_test[df_test.columns.difference(cols_to_drop).drop("SalePrice")]
y_test = df_test.SalePrice

# Same pipeline and cross-validation settings as for the unstratified split.
ols_scores = cross_validate(
    ols,
    X,
    y,
    cv=5,
    scoring=["neg_mean_squared_error"],
    return_train_score=True,
    return_estimator=True,
)

# The reported score is RMSE on the untransformed SalePrice, so it is labelled
# as such (the variable name is kept unchanged).
ols_best_model, ols_rmsle = get_best_model(ols_scores)
print(f"RMSE after running second block: {ols_rmsle:0.5f}")