In [1]:
import sys
# Add ../src to the module search path; presumably this is where the local `util` module imported below lives — TODO confirm
sys.path.append("../src")

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import util as util
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn_features.transformers import DataFrameSelector
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

## sklearn -- metrics
from sklearn.metrics import mean_squared_error, r2_score

## sklearn -- Models
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

## Xgboost
from xgboost import XGBRegressor

### Load Config

In [3]:
# Load the project configuration via the local util helper; later cells read the dataset pickle paths from it
config_data = util.load_config()

In [4]:
def load_dataset(config_data: dict) -> tuple:
    """Load the pickled train and test splits referenced by the configuration.

    Parameters
    ----------
    config_data : dict
        Configuration mapping. It must provide the keys ``"train_set_path"``
        and ``"test_set_path"``; for each, index 0 is loaded as the features
        (``x_*``) and index 1 as the target (``y_*``).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. Note the order: both feature
        sets come first, then both target sets.

    Raises
    ------
    KeyError
        If either path key is missing from ``config_data``.
    """
    # NOTE(review): util.pickle_load presumably unpickles the file; only point
    # the config at trusted files, since unpickling can execute arbitrary code.
    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    return x_train, x_test, y_train, y_test

In [5]:
# Unpack in the order load_dataset returns: features (train, test) first, then targets (train, test)
x_train, x_test, y_train, y_test = load_dataset(config_data)


## Dealing with Nulls

In [6]:
## Separate the columns according to type (numerical or categorical).
## select_dtypes(include='number') matches every numeric dtype (int8/int16, unsigned ints,
## float16, nullable ints, ...) rather than a hard-coded list of four, so a numeric column
## with a less common dtype is not silently routed to the one-hot-encoding branch.
num_cols = x_train.select_dtypes(include='number').columns.tolist()
## Everything that is not numeric is treated as categorical, keeping the original column order.
categ_cols = [col for col in x_train.columns if col not in num_cols]


print('Numerical Columns : \n', num_cols)
print('**'*40)
print('Categorical Columns : \n', categ_cols)

Numerical Columns : 
 ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedroms_per_rooms', 'population_per_household']
********************************************************************************
Categorical Columns : 
 ['ocean_proximity']


## Numerical Pipeline

In [7]:
## Numerical branch: keep only the numeric columns, impute with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_cols)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


## Categorical Pipeline

In [8]:
## scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and
## 1.4 removed the old name. Try the new keyword first and fall back for older installs
## (an unknown keyword raises TypeError at construction time).
## handle_unknown='ignore' makes a category that was never seen during fit -- including the
## imputer's 'missing' placeholder when the training split has no nulls -- encode as an
## all-zero row instead of raising at transform time.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

categ_pipeline = Pipeline(steps=[
            ('selector', DataFrameSelector(categ_cols)),    ## select only these columns
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('OHE', one_hot_encoder)
])



## Combine the Two Pipelines

In [9]:
## Feed the same input to both branches; FeatureUnion concatenates their outputs,
## numerical features first, then the one-hot encoded categorical ones.
total_pipeline = FeatureUnion([
    ('num_pipe', num_pipeline),
    ('categ_pipe', categ_pipeline),
])

In [10]:
## Display the raw training features (before the pipeline is applied)
x_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedroms_per_rooms,population_per_household,ocean_proximity
4602,-118.27,34.05,12.0,535.0,328.0,1194.0,365.0,1.2012,1.465753,0.613084,3.271233,1H OCEAN
6978,-118.03,33.97,32.0,2468.0,552.0,1190.0,479.0,3.8275,5.152401,0.223663,2.484342,1H OCEAN
16415,-121.26,37.88,42.0,465.0,93.0,256.0,93.0,3.1719,5.000000,0.200000,2.752688,INLAND
2549,-124.17,40.79,43.0,2285.0,479.0,1169.0,482.0,1.9688,4.740664,0.209628,2.425311,NEAR OCEAN
11025,-117.82,33.79,26.0,2641.0,633.0,3657.0,617.0,4.1339,4.280389,0.239682,5.927066,1H OCEAN
...,...,...,...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,6.129032,0.151128,3.032258,1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,6.868597,0.184825,3.904232,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,3.986717,0.270823,3.332068,1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,6.395349,0.166993,3.178891,1H OCEAN


In [12]:
## Fit the combined pipeline on the training features only, then reuse the already-fitted
## pipeline on the test features so that nothing is learned from the test split.
X_train_feng = total_pipeline.fit_transform(x_train)
X_test_feng = total_pipeline.transform(x_test)                 ### Both splits are now transformed

In [14]:
## Persist the transformed features together with their targets, in the same order as before:
## train features, train target, test features, test target.
for payload, target_path in (
    (X_train_feng, "../data/processed/x_train_feng.pkl"),
    (y_train, "../data/processed/y_train_feng.pkl"),
    (X_test_feng, "../data/processed/x_test_feng.pkl"),
    (y_test, "../data/processed/y_test_feng.pkl"),
):
    util.pickle_dump(payload, target_path)

In [15]:
## Display the transformed test features produced by total_pipeline.transform
X_test_feng

array([[ 0.28503676,  0.1950594 , -0.28418942, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06065351, -0.23550335,  0.1123855 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.42526222,  1.0093846 ,  1.85731512, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.20026975, -0.67074613,  1.38142522, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.08284386, -0.66138607, -0.91870928, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.83851543, -0.70818637,  0.66759038, ...,  0.        ,
         0.        ,  0.        ]])

In [26]:
## Display the raw (untransformed) test features
x_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedroms_per_rooms,population_per_household,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,4.192201,,3.877437,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,5.039384,,2.679795,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,3.977155,,1.360332,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,6.163636,,3.444444,1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,5.492991,,2.483645,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...,...,...
19053,-121.69,38.16,33.0,1808.0,363.0,824.0,340.0,3.2937,5.317647,0.200774,2.423529,INLAND
9444,-119.82,37.57,13.0,1713.0,340.0,643.0,241.0,2.6620,7.107884,0.198482,2.668050,INLAND
20266,-119.18,34.21,46.0,2062.0,484.0,1522.0,469.0,3.0870,4.396588,0.234724,3.245203,NEAR OCEAN
13522,-117.41,34.23,17.0,889.0,131.0,439.0,141.0,6.1426,6.304965,0.147357,3.113475,INLAND
