In [178]:
import pandas as pd
from typing import List
from sklearn.model_selection import train_test_split

In [179]:
trainData = 'trainFilt.csv'
testData = 'test.csv'
index_col = 'Id'
target_col = 'SalePrice'

In [180]:
def readData(path: str, index_col: str = None, target_col: str = None, exclude: List = []):
    X = pd.read_csv(path, index_col=index_col)
    if exclude:
        X = X.select_dtypes(exclude=exclude)
    if target_col is None:
        return X
    else:
        # Drop rows with missing target
        X = X.dropna(axis=0, subset=[target_col])
        y = X[target_col]
        X = X.drop([target_col], axis=1)
        return X.copy(), y.copy()

In [181]:
def getMissingColumns(df):
    """ Return columns containing missing data """
    return df.columns[df.isna().any()]

In [182]:
def summariseMissing(df):
    
    NcolsNA = df.isna().any(axis=0).sum()
    propColsNA = NcolsNA / len(df.columns)
    NrowsNA = df.isna().any(axis=1).sum()
    propRowsNA = NrowsNA / len(df)
    totalNA = df.isna().to_numpy().sum()
    propTotalNA = totalNA / df.to_numpy().size
    # Get largest integer width for formatting
    w = max([len(str(x)) for x in [NcolsNA, NrowsNA, totalNA]])
    print(f'{"Columns with missing data:":26}{NcolsNA:{w}}\t{propColsNA:6.1%}')
    print(f'{"Rows with missing data:":26}{NrowsNA:{w}}\t{propRowsNA:6.1%}')
    print(f'{"Total missing data:":26}{totalNA:{w}}\t{propTotalNA:6.1%}')
    
    # Return number of missing datapoints per column
    missingCount = df.isna().sum()
    return missingCount[missingCount > 0]

In [183]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config
from xgboost import XGBRegressor

In [184]:
set_config(display='diagram')

In [185]:
def getCategorical(df, maxUnique=None):
    
    catDf = df.select_dtypes(include=['object', 'category'])
    if maxUnique is not None:
        catCols = catDf.columns[catDf.nunique() <= maxUnique]
    else:
        catCols = catDf.columns
    return list(catCols)

In [186]:
X, y = readData(trainData, index_col=index_col, 
                target_col=target_col)

In [187]:
X_test = readData(testData, index_col=index_col)

In [188]:
split = train_test_split(X, y, random_state=0, train_size=0.8, test_size=0.2)
X_train, X_valid, y_train, y_valid = map(lambda x: x.copy(), split)

In [189]:
catCols = getCategorical(X_train, maxUnique=9)

In [190]:
numberCols = list(X_train.select_dtypes(include=['number']).columns)

In [191]:
X_train = X_train[catCols + numberCols]
X_valid = X_valid[catCols + numberCols]
X_test = X_test[catCols + numberCols]

In [192]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())])

In [193]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))])

In [194]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_include='number')),
    ('cat', categorical_transformer, selector(dtype_include=['category', 'object']))])

In [195]:
#model = RandomForestRegressor(n_estimators=100, random_state=0)
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4, random_state=0)

In [196]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',   model)])

In [197]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__n_estimators': [10, 50, 100, 500, 1000],
    'classifier__learning_rate': [0.01, 0.05, 0.1]
}

In [198]:
grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

In [199]:
grid_search.fit(X_train, y_train)
print(f'Best params:', grid_search.best_params_)

Best params: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 1000, 'preprocessor__num__imputer__strategy': 'mean'}


In [200]:
preds = grid_search.predict(X_valid)

In [103]:
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 17355.469258347603


In [201]:
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 16515.39451787243


In [106]:
import matplotlib.pyplot as plt

In [121]:
plt.line(x=list(y_valid),y=preds, marker='', color='blue', linewidth=1, label="Actual")
#plt.plot(preds.sort_values(), marker='', color='red', linewidth=1, label="Predicted")

AttributeError: module 'matplotlib.pyplot' has no attribute 'line'

In [118]:
y_valid

Id
530     200624
492     133000
460     110000
280     192000
656      88000
         ...  
327     324000
441     555000
1388    136000
1324     82500
62      101000
Name: SalePrice, Length: 292, dtype: int64

In [153]:
X['Garage']

Id
1       True
2       True
3       True
4       True
5       True
        ... 
1456    True
1457    True
1458    True
1459    True
1460    True
Name: Garage, Length: 1460, dtype: bool