### Import Package

In [52]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
# Define Some Useful Functions

def decomposition(dataset, x_columns, y_columns=[]):
    X = dataset.iloc[:, x_columns]
    Y = dataset.iloc[:, y_columns]

    if len(y_columns) > 0:
        return X, Y
    else:
        return X

from sklearn.model_selection import train_test_split
import time

def split_train_test(x_ary, y_ary, train_size=0.75, random_state=int(time.time())):
    return train_test_split(x_ary, y_ary, test_size=(1-train_size), random_state=random_state)

# One Hot Encoding
def onehot_encoder(ary, columns=[], remove_trap=False):
    df_results = pd.DataFrame()

    # Iterate each column in DataFrame ary
    for i in range(ary.shape[1]):
        # if this column (i) is dummy column
        if i in columns:
            base_name = ary.columns[i]
            this_column = pd.get_dummies(ary.iloc[:, i])
            this_column = this_column.rename(columns={n:"{}_{}".format(base_name, n) for n in this_column.columns})
            # Remove Dummy Variable Trap if needed
            if remove_trap:
                this_column = this_column.drop(this_column.columns[0], axis=1)
        # else this column is normal column
        else:
            this_column = ary.iloc[:, i]
        # Append this column to the Result DataFrame
        df_results = pd.concat([df_results, this_column], axis=1)

    return df_results

# Feature Scaling
from sklearn.preprocessing import StandardScaler

def feature_scaling(fit_ary, transform_arys=None):
    scaler = StandardScaler()
    scaler.fit(fit_ary.astype("float64"))

    if type(transform_arys) is tuple:
        return (pd.DataFrame(scaler.transform(ary.astype("float64")), index=ary.index, columns=ary.columns) for ary in transform_arys)
    else:
        return pd.DataFrame(scaler.transform(transform_arys.astype("float64")), index=transform_arys.index, columns=transform_arys.columns)

import statsmodels.regression.linear_model as sm
import copy

class MultipleRegressor:
    __regressor = None
    
    def __init__(self) :
        self.__regressor = None
    
    @property
    def regressor(self):
        return self.__regressor
    
    def fit(self, x_train, y_train):
        self.__regressor = sm.OLS(exog=x_train, endog=y_train).fit()
        return self

    def predict(self, x_test):
        return self.__regressor.predict(exog=x_test)
    
    def backward_elimination(self, x_train, y_train, significance=0.05, verbose=False):
        # Initialize variables
        final_features = [i for i in range(x_train.shape[1])]
        p_values = [1.0 for i in range(x_train.shape[1])]
        this_features = copy.copy(final_features)
        prev_adj_rsquared = float("-inf")
        this_adj_rsquared = 0
        
        while(True):
            # Show final features first (if verbose)
            if verbose:
                feature_names = [x_train.columns[pos] for pos in final_features]
                print("CUR: {} Adj-RSquared={:.4f}".format(dict(zip(feature_names, ["{:.4f}".format(i) for i in p_values])), prev_adj_rsquared))
                
            # Load the current chosen columns
            x_opt = x_train.values[:, this_features]
            
            # Fit the model with chosen columns
            self.fit(x_train=x_opt, y_train=y_train)
            this_adj_rsquared = self.__regressor.rsquared_adj
            p_values = self.__regressor.pvalues.tolist()
            
            # Show trial features (if verbose)
            if verbose:
                feature_names = [x_train.columns[pos] for pos in this_features]
                print("TRY: {} Adj-RSquared={:.4f}".format(dict(zip(feature_names, ["{:.4f}".format(i) for i in p_values])), this_adj_rsquared))
            
            # If Adjust R-Squared reduced, stop the procedure
            if this_adj_rsquared < prev_adj_rsquared:
                if verbose: print("!!! STOP (Adj RSquared getting lower)\n")
                break
            else:
                final_features = this_features
            
            # Prepare for next round, get the maximum p-value and compare to significance
            this_features = copy.copy(final_features)
            max_pvalue = max(p_values)
            if max_pvalue > significance:            
                max_pvalue_index = p_values.index(max_pvalue)
                del this_features[max_pvalue_index]
                prev_adj_rsquared = this_adj_rsquared
                if verbose: print(">>> GO NEXT (Higher Adj RSquared & has p-value>{})\n".format(significance))
            else:
                if verbose: print("!!! STOP (No more p-value>{})\n".format(significance))
                break
        
        if verbose:
            feature_names = [x_train.columns[pos] for pos in final_features]
            print("*** FINAL FEATURES: {}".format(feature_names))
        return final_features
    
    def r_score(self):
        return self.__regressor.rsquared_adj

import statsmodels.tools.tools as smtools

def add_constant(ary):
    return smtools.add_constant(ary)

### Read data

In [94]:
df = pd.read_csv(r'/content/drive/My Drive/Graduate/AI/HW1/train.csv')
print('viwe the data shape', df.shape)
df.head()

viwe the data shape (957, 15)


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [95]:
# Check Duplicate Values
df[df.duplicated()]

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity


### Check Missing Value

In [96]:
# Check Missing Value
df.isna().sum()

date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      396
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
actual_productivity        0
dtype: int64

In [97]:
# Fillna by mean in feature wip
df['wip'] = df['wip'].fillna((df['wip'].mean()))

In [98]:
# Remove the first column
df = df.drop("date",axis=1)

In [99]:
# Trim the spaces
df.replace('\s+','',regex=True,inplace=True) 

In [100]:
df.head()

Unnamed: 0,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,Quarter1,finishing,Thursday,1,0.75,3.94,1249.989305,960,0,0.0,0,0,8.0,0.8865
2,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


### Input Test Data

In [101]:
# Input Test data
df_test = pd.read_csv(r'/content/drive/My Drive/Graduate/AI/HW1/test.csv')
print(df_test.shape)
df_test.head()

(240, 14)


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,2/26/2015,Quarter4,sweing,Thursday,4,0.8,30.1,437.0,7080,32,0.0,0,2,59
1,2/26/2015,Quarter4,sweing,Thursday,5,0.35,27.48,413.0,6840,38,0.0,0,1,57
2,2/26/2015,Quarter4,finishing,Thursday,10,0.7,2.9,,3360,0,0.0,0,0,8
3,2/26/2015,Quarter4,finishing,Thursday,9,0.75,2.9,,960,0,0.0,0,0,8
4,2/26/2015,Quarter4,sweing,Thursday,1,0.35,26.66,1164.0,6600,23,0.0,0,2,55


In [102]:
# Check Missing Value
df_test.isna().sum()

date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      110
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
dtype: int64

In [103]:
df_test['wip'] = df_test['wip'].fillna((df_test['wip'].mean()))

In [104]:
#Remove the first column
df_test = df_test.drop("date",axis=1)
df_test.replace('\s+','',regex=True,inplace=True)

In [105]:
df_test.head()

Unnamed: 0,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,Quarter4,sweing,Thursday,4,0.8,30.1,437.0,7080,32,0.0,0,2,59
1,Quarter4,sweing,Thursday,5,0.35,27.48,413.0,6840,38,0.0,0,1,57
2,Quarter4,finishing,Thursday,10,0.7,2.9,933.6,3360,0,0.0,0,0,8
3,Quarter4,finishing,Thursday,9,0.75,2.9,933.6,960,0,0.0,0,0,8
4,Quarter4,sweing,Thursday,1,0.35,26.66,1164.0,6600,23,0.0,0,2,55


In [106]:
X = df[df.columns[0:-1]]
y = df[df.columns[-1]]

In [107]:
print(X.shape)
print(df_test.shape)

(957, 13)
(240, 13)


In [108]:
# Row Bind
df_bind = pd.concat([X,df_test])

In [109]:
# One Hot Encoder
df_bind_OHE = onehot_encoder(df_bind, columns = [0,1,2], remove_trap = True)

In [113]:
df_bind_OHE.head()

Unnamed: 0,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5,department_sweing,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,0,0,0,0,1,0,0,1,0,0,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0
1,0,0,0,0,0,0,0,1,0,0,1,0.75,3.94,1249.989305,960,0,0.0,0,0,8.0
2,0,0,0,0,1,0,0,1,0,0,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5
3,0,0,0,0,1,0,0,1,0,0,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5
4,0,0,0,0,1,0,0,1,0,0,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0


In [111]:
# Seperate Train and Test data
X = df_bind_OHE[0:957]
df_test = df_bind_OHE[957:]

In [114]:
# Feature Scaling
X_scaled = feature_scaling(X, transform_arys=(X))
X_test_scaled = feature_scaling(X, transform_arys=(df_test))

In [115]:
# Add the Constant
X_scaled = add_constant(X_scaled)

  x = pd.concat(x[::order], 1)


In [131]:
const = np.ones((240,))

In [134]:
const = pd.DataFrame(const)

In [135]:
# Add the Constant
X_test_scaled = pd.concat([const,X_test_scaled], axis = 1)

In [139]:
X_test_scaled.rename(columns={0:'const'}, inplace = True)

In [140]:
X_test_scaled.head()

Unnamed: 0,const,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5,department_sweing,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,...,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,-0.708696,0.681457,1.347886,-0.524408,0.664992,0.166474,-0.062988,-0.110607,4.511029,1.076501
1,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,-0.418734,-3.898281,1.109934,-0.539889,0.596443,0.356172,-0.062988,-0.110607,2.092511,0.986369
2,1.0,-0.584992,-0.530212,1.830301,-0.219529,-1.190238,-0.426006,-0.449735,2.128801,-0.441312,...,1.031079,-0.336262,-1.122448,-0.204083,-0.397512,-0.845252,-0.062988,-0.110607,-0.326007,-1.221871
3,1.0,-0.584992,-0.530212,1.830301,-0.219529,-1.190238,-0.426006,-0.449735,2.128801,-0.441312,...,0.741116,0.172598,-1.122448,-0.204083,-1.082999,-0.845252,-0.062988,-0.110607,-0.326007,-1.221871
4,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,-1.578583,-3.898281,1.035461,-0.055466,0.527895,-0.118074,-0.062988,-0.110607,4.511029,0.896237


In [141]:
X_scaled.tail()

Unnamed: 0,const,quarter_Quarter2,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5,department_sweing,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,...,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
952,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,0.451154,0.172598,1.284311,-0.215436,0.630718,0.925268,-0.062988,-0.110607,4.511029,1.031435
953,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,1.031079,-0.336262,0.595887,0.219965,-0.431787,-0.845252,-0.062988,-0.110607,2.092511,0.761038
954,1.0,-0.584992,-0.530212,1.830301,-0.219529,0.840168,-0.426006,-0.449735,2.128801,-0.441312,...,-1.288621,-0.336262,1.368775,-0.549565,0.630718,-0.845252,-0.062988,-0.110607,2.092511,1.031435
955,1.0,-0.584992,-0.530212,1.830301,-0.219529,-1.190238,-0.426006,-0.449735,2.128801,-0.441312,...,1.611003,0.681457,-0.968052,0.0,-0.277552,-0.845252,-0.062988,-0.110607,-0.326007,-1.176805
956,1.0,-0.584992,-0.530212,1.830301,-0.219529,-1.190238,-0.426006,-0.449735,2.128801,-0.441312,...,-1.288621,-0.336262,-1.031627,0.0,-1.082999,-0.845252,-0.062988,-0.110607,-0.326007,-1.221871


In [149]:
# Feature Selection
X_scaled = X_scaled.iloc[:,[0,2,3,4,5,8,11,12,13,15,16,17,18,20]]
X_test_scaled = X_test_scaled.iloc[:,[0,2,3,4,5,8,11,12,13,15,16,17,18,20]]

In [150]:
X_scaled.head()

Unnamed: 0,const,quarter_Quarter3,quarter_Quarter4,quarter_Quarter5,department_sweing,day_Thursday,team,targeted_productivity,smv,over_time,incentive,idle_time,idle_men,no_of_workers
0,1.0,-0.530212,-0.546358,-0.219529,0.840168,2.128801,0.451154,0.681457,0.990051,0.664992,2.253158,-0.062988,-0.110607,1.076501
1,1.0,-0.530212,-0.546358,-0.219529,-1.190238,2.128801,-1.578583,0.172598,-1.027994,-1.082999,-0.845252,-0.062988,-0.110607,-1.221871
2,1.0,-0.530212,-0.546358,-0.219529,0.840168,2.128801,1.321041,0.681457,-0.349561,-0.311827,0.735569,-0.062988,-0.110607,-0.207883
3,1.0,-0.530212,-0.546358,-0.219529,0.840168,2.128801,1.611003,0.681457,-0.349561,-0.311827,0.735569,-0.062988,-0.110607,-0.207883
4,1.0,-0.530212,-0.546358,-0.219529,0.840168,2.128801,-0.128771,0.681457,0.966437,-0.808804,0.735569,-0.062988,-0.110607,0.941303


In [142]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from pylab import *

In [None]:
# Grid Search 1

param_grid = {"n_estimators":[100,200,500,1000,2000],
        "max_depth":[10,20,30]}
grid_search = GridSearchCV(RandomForestRegressor(),param_grid,cv = 3)
    
grid_search.fit(X_scaled,y)
grid_search.best_params_

{'max_depth': 10, 'n_estimators': 200}

In [None]:
# Grid Search 2

param_grid2 = {'bootstrap': [True, False],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10]}

grid_search2 = GridSearchCV(RandomForestRegressor(
   max_depth = 10, 
   n_estimators = 1000),param_grid2,cv = 10)
    
grid_search2.fit(X_scaled,y)
grid_search2.best_params_

{'bootstrap': False,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5}

In [None]:
# Grid Search 3

param_grid3 = {'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10],
        'min_samples_split': [1,2,3,4,5,6,7,8,9,10]}

grid_search3 = GridSearchCV(RandomForestRegressor(max_depth = 10, 
                 n_estimators = 200,
                 random_state = 27),param_grid3,cv = 3)

grid_search3.fit(X_scaled,y)
grid_search3.best_params_

30 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/

{'min_samples_leaf': 1, 'min_samples_split': 9}

### RF Model


In [155]:
model = RandomForestRegressor(max_depth = 10, 
                 n_estimators = 100,
                 random_state = 27)

### Cross validation
- usually we will do cross validation to evaluate the generalization of model
- or you can just use ```sklearn.model_selection.train_test_split``` to evaluate your model

In [156]:
from sklearn.model_selection import cross_validate
# directly use cross_val_score
# scores = cross_val_score(model, X_scaled, y, cv=10)
# print("R2: %0.2f (std: %0.2f)" % (scores.mean(), scores.std()))
# print(scores)
# need other scoring
scores = cross_validate(model, X_scaled, y, cv=10,
                       scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_percentage_error'))
print("====================================")
print("R2: {} (std: {})".format(scores['test_r2'].mean(), scores['test_r2'].std()))
print("MSE: {} (std: {})".format(-scores['test_neg_mean_squared_error'].mean(), scores['test_neg_mean_squared_error'].std()))
print("MAPE: {} (std: {})".format(-scores['test_neg_mean_absolute_percentage_error'].mean(), scores['test_neg_mean_absolute_percentage_error'].std()))

R2: 0.3791168606826799 (std: 0.14813705274505917)
MSE: 0.01767786629617494 (std: 0.004562326632186594)
MAPE: 0.1463837775877355 (std: 0.041231539032951785)


'\nR2: 0.3791168606826799 (std: 0.14813705274505917)\nMSE: 0.01767786629617494 (std: 0.004562326632186594)\nMAPE: 0.1463837775877355 (std: 0.041231539032951785\n'

### Prediction

In [153]:
y_pred = model.fit(X_scaled, y).predict(X_test_scaled)

### Output .csv for submission

In [154]:
df_ans = pd.DataFrame(y_pred.astype(float), columns=['actual_productivity'])
df_ans.to_csv(r'/content/drive/My Drive/Graduate/AI/HW1/mySubmission_RF_8.csv',index_label='Id')