## Data Analysis

I use the data from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data) to explore the potential problems I have mentioned


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

# Show every column when a DataFrame is rendered (no column truncation)
pd.set_option("display.max_columns", None)

# Load the training set and discard the "Id" column in the same step
data = pd.read_csv("train.csv").drop(columns="Id")

In [7]:
# Look up the prediction target column (the result is neither stored nor
# displayed here; the lookup only confirms the column exists)
data['SalePrice']

# Categorical features: every object-dtype column, plus MSSubClass, which is
# added by hand (presumably because it is stored as numeric codes — confirm).
object_cols = [col for col in data.columns if data[col].dtype == 'O']
car_feas = object_cols + ['MSSubClass']

# Make sure every categorical feature is held with object dtype
data[car_feas] = data[car_feas].astype('O')

# Numerical features: every remaining column except the target
excluded_cols = set(car_feas) | {"SalePrice"}
num_feas = [col for col in data.columns if col not in excluded_cols]


## Feature Engineering


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
import joblib


# Hold out 10% of the rows for testing.
# "Id" has already been dropped from `data` when it was loaded, so asking
# drop() for it again would raise KeyError on a fresh run; errors="ignore"
# makes the call work whether or not the column is still present.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["Id", "SalePrice"], errors="ignore"),
    data["SalePrice"],
    test_size=0.1,
    random_state=0,  # fixed seed so the split is reproducible
)

# Handle the skewed distribution of the prediction target by working with
# its natural logarithm
y_train = np.log(y_train)
y_test = np.log(y_test)

### processing categorical variables

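Replace missing values with the string "Missing" (features where more than 10% of the values are missing) or with the most frequent category (features where fewer values are missing).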

In [13]:
# find features with missing values
# Collect the categorical features that contain at least one missing value
# in the training split
car_feas_with_na = []
for col in car_feas:
    if X_train[col].isnull().any():
        car_feas_with_na.append(col)

In [20]:
# find the features with high ratio with missing values -- replace with missing
fea_str_missing = [
    fea for fea in car_feas_with_na if X_train[fea].isnull().mean() > 0.1
]

# find the features with low ratio with missing values --- replace with most frequent category
fea_str_category = [
    fea for fea in car_feas_with_na if X_train[fea].isnull().mean() < 0.1
]

In [21]:
# replace missing values with string "Missing"
X_train[fea_str_missing] = X_train[fea_str_missing].fillna("Missing")
X_test[fea_str_missing] = X_test[fea_str_missing].fillna("Missing")

In [22]:
for fea in fea_str_category:
    # The mode of a set of values is the value that appears most often.
    # It is taken from the training split and reused for the test split.
    mode = X_train[fea].mode()[0]
    # Assign the filled column back rather than calling
    # fillna(..., inplace=True) on X_train[fea]: that call works on a column
    # selection, and under pandas Copy-on-Write the parent frame is never
    # updated, so the missing values would silently remain.
    X_train[fea] = X_train[fea].fillna(mode)
    X_test[fea] = X_test[fea].fillna(mode)

### processing numeric variable

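Replace missing values with the mean, and add a binary indicator column (`<feature>_na`) for each numeric feature that marks which values were missing.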

In [None]:
for fea in num_feas:
    # Learn the imputation value from the training split only. Taking the mean
    # of the full `data` frame would include the held-out test rows and leak
    # test information into preprocessing.
    mean_val = X_train[fea].mean()

    # add binary missing indicator (must happen before the NaNs are filled)
    X_train[fea + '_na'] = np.where(X_train[fea].isnull(), 1, 0)
    X_test[fea + '_na'] = np.where(X_test[fea].isnull(), 1, 0)

    # replace missing values by the training mean; assign the result back
    # instead of fillna(..., inplace=True) on a column selection, which does
    # not update the parent frame under pandas Copy-on-Write
    X_train[fea] = X_train[fea].fillna(mean_val)
    X_test[fea] = X_test[fea].fillna(mean_val)
    

Numerical variable transformation: apply a logarithmic transformation to selected numeric features.

In [None]:
# logarithmic transformation ---- work only on positive numerical
# Features to replace by their natural logarithm; the log is only defined
# for positive values, so this list must contain strictly positive columns
log_vars = ["LotFrontage", "1stFlrSF", "GrLivArea"]

for var in log_vars:
    # same transformation on both splits, training first
    for split in (X_train, X_test):
        split[var] = np.log(split[var])

# 

#### which preprocessing method

*Scaling the values to the range [0, 1] works particularly well if you are dealing with a sparse matrix and most of your values are zero.*

## Feature Selection


## Model training