## Data Modeling
``` data modeling with advanced regression techniques ```

``` Procedure ```

``` 1. Loading Libraries ```

``` 2. Data Loading (Different CSV Files) ```

``` 3. Defining models ```

``` 4. Cross validation ```

``` 5. Saving Model ``` 

``` 6. Train/Test Split ```

``` 7. Generating report for model performance ```

### 1. Loading Libraries

In [46]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score

import pickle
import joblib
from os import listdir
from os.path import isfile, join


### 2. Data Loading

In [47]:
# Result table: one row of metrics is added per (model, data file) run.
model_info = pd.DataFrame(
    columns=[
        'Model',
        'Data File',
        'Training Score',
        'Testing Score',
        'R2 Score',
        'Mean Square Error',
        'Mean Absolute Error',
        'Cross Val Score',
        'Score Deviation',
    ]
)

# Where fitted models are written and where the training CSVs are read from.
train_path = './Train/'
model_path = './saved models/'

# Keep regular files only; sub-directories inside train_path are skipped.
train_files = [
    entry
    for entry in listdir(train_path)
    if isfile(join(train_path, entry))
]
print(train_files)

['X_scaled_selected_train.csv', 'X_Scaled_train.csv', 'X_selected_train.csv', 'X_train.csv']


### 3. Defining Models

``` 1) Linear Regression ```

``` 2) Ridge Regression ```

``` 3) Lasso Regression ```

``` 4) Elastic Regression ```

``` 5) Decision Tree Regressor ```

``` 6) Random Forest Regressor ```

In [48]:
def linear_regression(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score a ``LinearRegression`` model.

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = LinearRegression()
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Linear_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Linear Regression',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

In [49]:
def ridge_regression(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score a ``Ridge`` model (alpha=0.0005).

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = Ridge(alpha=0.0005, random_state=42)
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Ridge_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Ridge Regression',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

In [50]:
def lasso_regression(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score a ``Lasso`` model (alpha=0.0005).

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = Lasso(alpha=0.0005, random_state=42)
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Lasso_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Lasso Regression',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

In [51]:
def elastic_regression(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score an ``ElasticNet`` model.

    The estimator is built with alpha=0.0005 and l1_ratio=0.9.

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=42)
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Elastic_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Elastic Regression',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

In [52]:
def decison_tree(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score a ``DecisionTreeRegressor``.

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = DecisionTreeRegressor(random_state=42)
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Decision_Tree_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Decision Tree',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

In [53]:
def random_forest(X, y, X_train, X_test, y_train, y_test, filename, model_info):
    """Cross-validate, fit, save and score a ``RandomForestRegressor``.

    The forest is built with 1000 trees, so both the cross-validation and
    the final fit are considerably heavier than for the other models.

    Parameters
    ----------
    X, y : full feature matrix and target, handed to ``cross_validation``.
    X_train, y_train : split used to fit the model that is saved to disk.
    X_test, y_test : held-out split used for the reported test metrics.
    filename : name of the source CSV; its stem is embedded in the saved
        model's file name and it is recorded in the 'Data File' column.
    model_info : DataFrame holding the results collected so far.

    Returns
    -------
    A new DataFrame containing the rows of ``model_info`` plus one row for
    this run. The frame passed in is not modified.
    """
    model = RandomForestRegressor(n_estimators=1000, random_state=42)
    # Cross-validation runs on the full data set before the final fit.
    scores = cross_validation(model, X, y)
    model.fit(X_train, y_train)

    save_path = model_path + 'Random_Forest_' + filename.split('.csv')[0] + '.pkl'
    save_model(model, save_path)

    y_pred = model.predict(X_test)
    row = {
        'Model': 'Random Forest',
        'Data File': filename,
        'Training Score': model.score(X_train, y_train),
        'Testing Score': model.score(X_test, y_test),
        'R2 Score': r2_score(y_test, y_pred),
        'Mean Square Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Cross Val Score': scores.mean(),
        # Twice the standard deviation of the fold scores.
        'Score Deviation': scores.std() * 2,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to add a row and works on older versions as well.
    return pd.concat([model_info, pd.DataFrame([row])], ignore_index=True)

### 4. Cross Validation

In [54]:
def cross_validation(model, X, y, cv=5):
    """Run k-fold cross-validation and print a short summary.

    Parameters
    ----------
    model : estimator passed straight to ``cross_val_score``.
    X, y : features and target to cross-validate on.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (default 5).

    Returns
    -------
    The per-fold scores as returned by ``cross_val_score``.

    Notes
    -----
    No ``scoring`` argument is given, so the printed "Accuracy" is whatever
    the estimator's default scorer reports (presumably R^2 for the
    regressors used in this notebook -- confirm against scikit-learn docs).
    """
    # Forward the caller's cv; it was previously hard-coded to 5, which
    # silently ignored the parameter.
    scores = cross_val_score(model, X, y, cv=cv)
    print('cross_val_scores: {}'.format(scores))
    print('Accuracy: {}, 2 sigma: +/-{}, with confidence interval of 95%'
          .format(round(scores.mean(), 2), round(scores.std() * 2, 2)))
    return scores

### 5. Saving Model

In [55]:
def save_model(model, save_path):
    """Serialize ``model`` with joblib into the file at ``save_path``.

    The target file is opened in binary write mode, so an existing file at
    that path is overwritten.
    """
    with open(save_path, 'wb') as handle:
        joblib.dump(model, handle)

### 6. Train/Test Split

``` 1. looping through all files ```

``` 2. Loading Data ```

``` 3. Train/Test Split ```

In [56]:
# Each entry pairs the label printed before training with the function that
# trains, saves and scores that model. Labels are runtime output and are kept
# exactly as before.
trainers = [
    ('Linear Regression', linear_regression),
    ('Ridge Regression', ridge_regression),
    ('Lasso Regression', lasso_regression),
    ('Elastic Regression', elastic_regression),
    ('Decison Tree', decison_tree),
    ('Random Forest Regressor', random_forest),
]

# NOTE: re-running this cell without re-creating model_info appends a second
# copy of every row to the results table.
for filename in train_files:
    print(filename)
    # train_files holds bare file names listed from train_path, so the
    # directory has to be joined back on; reading the bare name would look
    # in the current working directory instead.
    data = pd.read_csv(join(train_path, filename))
    X = data.drop(['Id', 'SalePrice'], axis=1)
    # Not used by the training below; retained in case other cells read it.
    Ids = data['Id']
    y = data['SalePrice']
    # 80/20 split with a fixed seed so every model sees the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    for label, train_fn in trainers:
        print(label)
        model_info = train_fn(X, y, X_train, X_test, y_train, y_test,
                              filename, model_info)
print('Finish Training')

X_scaled_selected_train.csv
Linear Regression
cross_val_scores: [0.89062113 0.86841613 0.87236647 0.87351624 0.83315247]
Accuracy: 0.87, 2 sigma: +/-0.04, with confidence interval of 95%
Ridge Regression
cross_val_scores: [0.89062215 0.86841534 0.87236565 0.87351636 0.83315359]
Accuracy: 0.87, 2 sigma: +/-0.04, with confidence interval of 95%
Lasso Regression
cross_val_scores: [0.89059463 0.86762119 0.87041368 0.87304591 0.83526926]
Accuracy: 0.87, 2 sigma: +/-0.04, with confidence interval of 95%
Elastic Regression
cross_val_scores: [0.89071639 0.86761044 0.8705155  0.87309824 0.83518023]
Accuracy: 0.87, 2 sigma: +/-0.04, with confidence interval of 95%
Decison Tree
cross_val_scores: [0.71300872 0.7254259  0.75247465 0.77911564 0.74749318]
Accuracy: 0.74, 2 sigma: +/-0.05, with confidence interval of 95%
Random Forest Regressor
cross_val_scores: [0.87107603 0.8464133  0.8730272  0.88443705 0.84538921]
Accuracy: 0.86, 2 sigma: +/-0.03, with confidence interval of 95%
X_Scaled_train.csv

### 7. Generating Report for Model Performance

In [57]:
# Write the collected metrics table to CSV in the working directory;
# index=False leaves the DataFrame's row index out of the file.
model_info.to_csv('Model_Performance_split_80_20.csv', index=False)

In [58]:
# Display the metrics table: one row per (model, data file) training run.
model_info

Unnamed: 0,Model,Data File,Training Score,Testing Score,R2 Score,Mean Square Error,Mean Absolute Error,Cross Val Score,Score Deviation
0,Linear Regression,X_scaled_selected_train.csv,0.870213,0.879353,0.879353,0.022514,0.11118,0.867614,0.037683
1,Ridge Regression,X_scaled_selected_train.csv,0.870213,0.879352,0.879352,0.022514,0.11118,0.867615,0.037682
2,Lasso Regression,X_scaled_selected_train.csv,0.869934,0.877309,0.877309,0.022896,0.11137,0.867389,0.035904
3,Elastic Regression,X_scaled_selected_train.csv,0.86996,0.877456,0.877456,0.022868,0.111331,0.867424,0.036044
4,Decision Tree,X_scaled_selected_train.csv,1.0,0.759666,0.759666,0.04485,0.153943,0.743504,0.045794
5,Random Forest,X_scaled_selected_train.csv,0.979757,0.879405,0.879405,0.022505,0.103731,0.864069,0.031047
6,Linear Regression,X_Scaled_train.csv,0.895573,0.899534,0.899534,0.018748,0.097927,0.880104,0.058853
7,Ridge Regression,X_Scaled_train.csv,0.906475,0.901613,0.901613,0.01836,0.093684,0.879994,0.059458
8,Lasso Regression,X_Scaled_train.csv,0.897363,0.894114,0.894114,0.01976,0.09757,0.880435,0.058842
9,Elastic Regression,X_Scaled_train.csv,0.897906,0.89443,0.89443,0.019701,0.097261,0.880484,0.059918


### Best Model By Testing Score

In [59]:
# Highest held-out 'Testing Score' reached by each model type over all data files.
model_info.groupby('Model')['Testing Score'].agg('max')

Model
Decision Tree         0.818718
Elastic Regression    0.898171
Lasso Regression      0.898199
Linear Regression     0.901631
Random Forest         0.886053
Ridge Regression      0.901626
Name: Testing Score, dtype: float64

### Best Model By Standard Deviation

``` 95 % confidence interval ```

In [60]:
# Smallest 'Score Deviation' (twice the std of the cross-validation fold
# scores) reached by each model type over all data files.
model_info.groupby('Model')['Score Deviation'].agg('min')

Model
Decision Tree         0.045794
Elastic Regression    0.036044
Lasso Regression      0.035904
Linear Regression     0.037683
Random Forest         0.019376
Ridge Regression      0.037682
Name: Score Deviation, dtype: float64