# Predict Housing Prices - Expensive or not

## Import Packages

In [35]:
import pandas as pd
import seaborn as sns
import numpy as np
# Helper
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


## Import Data

In [3]:
housing = pd.read_csv("../data/iter-6/housing.csv")
housing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

## Duplicated rows

In [4]:
housing.duplicated().sum()

0

## Split data function

In [5]:
def split_data(df):
    X = df.drop(columns="Expensive")
    y = df["Expensive"]
    numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.to_list()
    numerical_features.remove("Expensive")
    categorial_features = df.select_dtypes(exclude=["int64", "float64"]).columns.to_list()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=50)
    
    return X_train, X_test, y_train, y_test, numerical_features, categorial_features

## First Iteration
Only numerical Features

### Split Data

In [6]:
numerical_features = housing.select_dtypes(include=["int64", "float64"]).columns.to_list()
X_train, X_test, y_train, y_test, numerical_features, categorical_features = split_data(housing)

### Missing values

#### Culumns with missing values

In [7]:
na_columns = X_train.columns[X_train.isnull().any()].to_list()
na_columns

['LotFrontage',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'FireplaceQu',
 'MasVnrArea',
 'GarageYrBlt',
 'Alley',
 'MasVnrType',
 'BsmtFinType2',
 'Electrical',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

## Train Model

### Preprocessing

In [8]:
numeric_pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler()
) 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse="False", handle_unknown="ignore")
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, numerical_features),
        ("cat_pipe", categoric_pipe, categorical_features)
    ]
)

In [117]:
full_pipeline = Pipeline(steps= [
    ("preprocessor", preprocessor), 
    ("model", RandomForestClassifier())
    ])
param_grid = [
    # {   # Preprocessor
    #     "preprocessor__num_pipe__simpleimputer__strategy": ["median"],
    #     # Used Model
    #     "model": [RandomForestClassifier()],
    #     # Optimized
    #     'model__n_estimators': [1150],
    #     #'model__criterion':['gini'],
    #     'model__max_depth': [25],
    #     #'model__max_features': ["sqrt", "log2", "auto"],  
    #     'model__bootstrap': [False],
    #     'model__min_samples_leaf': [2],
    #     # Still to tune
    #     'model__min_samples_split': [3],

    # },
    {
        "preprocessor__num_pipe__simpleimputer__strategy": ["median", "mean"],
        "model": [LogisticRegression()],
        "model__solver": ["liblinear"],
        #"model__class_weight": [None, "balanced"],
        
    },
    # {
    #     "preprocessor__num_pipe__simpleimputer__strategy": ["median", "mean"],
    #     "model": [LogisticRegression()],
    # },
    # {
    #     "preprocessor__num_pipe__simpleimputer__strategy": ["median"],
    #     "model": [SVC()],
    # },
]
search = GridSearchCV(
    full_pipeline,              # you have defined this beforehand
    param_grid,                 # your parameter grid
    cv=10,                       # the value for K in K-fold Cross Validation
    scoring='balanced_accuracy',         # the performance metric to use, 
    verbose=1,                  # we want informative outputs during the training process
)


In [118]:
search_result = search.fit(X_train, y_train)
search_result.best_params_

Fitting 10 folds for each of 4 candidates, totalling 40 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'model': LogisticRegression(solver='liblinear'),
 'model__solver': 'liblinear',
 'preprocessor__num_pipe__simpleimputer__strategy': 'median'}

In [122]:
columns_to_drop = [
    "split0_test_score", 
    "split1_test_score", 
    "split2_test_score", 
    "split3_test_score", 
    "split4_test_score", 
    "split5_test_score",
    "split6_test_score",
    "split7_test_score",
    "split8_test_score",
    "split9_test_score",
    "std_fit_time",
    "std_score_time",
    ]

pd.DataFrame(search_result.cv_results_).sort_values("rank_test_score").drop(columns=columns_to_drop)

Unnamed: 0,mean_fit_time,mean_score_time,param_model,param_model__solver,param_preprocessor__num_pipe__simpleimputer__strategy,params,mean_test_score,std_test_score,rank_test_score
0,0.074026,0.015601,LogisticRegression(solver='liblinear'),liblinear,median,{'model': LogisticRegression(solver='liblinear...,0.876277,0.049087,1
1,0.06254,0.013661,LogisticRegression(solver='liblinear'),liblinear,mean,{'model': LogisticRegression(solver='liblinear...,0.876277,0.049087,1
2,0.11908,0.014244,LogisticRegression(solver='liblinear'),lbfgs,median,{'model': LogisticRegression(solver='liblinear...,0.873152,0.047411,3
3,0.108415,0.013404,LogisticRegression(solver='liblinear'),lbfgs,mean,{'model': LogisticRegression(solver='liblinear...,0.873152,0.047411,3


In [123]:
pred = search_result.predict(X_test)
balanced_accuracy_score(pred, y_test)



0.9346740273396424

### Run Pipeline with all data

In [103]:
X = housing.drop(columns="Expensive")
y = housing["Expensive"]
complete_search_result = search.fit(X,y)


Fitting 10 folds for each of 2 candidates, totalling 20 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [95]:
final_data = pd.read_csv("../data/test.csv")
final_data["Expensive"] = complete_search_result.predict(final_data)

solution = final_data[["Id", "Expensive"]]
solution.to_csv("../data/predictions/solution.csv",index=False)




In [93]:
solution.sum()

Id             3195210
final_preds        180
dtype: int64

In [92]:
housing.Expensive.sum()

217