<a href="https://colab.research.google.com/github/Nov05/DS-Unit-2-Sprint-4-Project/blob/master/notebooks/03_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# created by nov05 on 2019-08-04
# Nov05/DS-Unit-2-Sprint-4-Project/
# notebooks/

In [0]:
!pip install category_encoders

In [0]:
import pandas as pd
import numpy as np
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, \
     mean_squared_error, mean_squared_log_error

In [0]:
def rmsle(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared log error.

    Both arguments are passed unchanged to ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)
  
def rmse(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared error.

    Both arguments are passed unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [12]:
###############################################
# Data Loading
###############################################
url_tasks = "https://raw.githubusercontent.com/Derek-Jones/SiP_dataset/master/Sip-task-info.csv"
url_dates = "https://raw.githubusercontent.com/Derek-Jones/SiP_dataset/master/est-act-dates.csv"
tasks  = pd.read_csv(url_tasks, encoding='iso-8859-1')
dates  = pd.read_csv(url_dates, encoding='iso-8859-1')

# Merge the two datasets on TaskNumber.
# Author's notes: there are duplicate TaskNumbers in both datasets, and there
# are fully duplicated rows in "dates". Those duplicated rows are dropped
# (first occurrence kept) before the inner join; drop_duplicates() selects the
# same rows as the previous `dates[-dates.duplicated()]` mask without relying
# on arithmetic negation of a boolean mask.
alldata = tasks.merge(right=dates.drop_duplicates(), on='TaskNumber', how='inner')

###############################################
# Data Wrangling
###############################################
# One row per ProjectCode with the number of distinct ProjectBreakdownCode
# values seen for that project (columns: ProjectCode, ProjectBreakdownCode).
# The column is selected *before* nunique() so the result does not depend on
# whether the installed pandas includes the grouping column in the
# DataFrameGroupBy.nunique() output; dropping that column afterwards raises
# KeyError on versions that leave it out.
breakdown = (alldata
             .groupby('ProjectCode')['ProjectBreakdownCode']
             .nunique()
             .reset_index())

def wrangler(X, project_breakdown=None):
  """Convert date columns and add engineered features to ``X`` in place.

  Parameters
  ----------
  X : pandas.DataFrame
      Must contain 'EstimateOn', 'StartedOn', 'CompletedOn' (parsed with the
      '%d-%b-%y' format), 'HoursEstimate' and 'ProjectCode'. The frame is
      modified in place.
  project_breakdown : pandas.DataFrame, optional
      Lookup table with one row per 'ProjectCode' and a
      'ProjectBreakdownCode' column. Defaults to the module-level
      ``breakdown`` table.

  Returns
  -------
  None
      Added columns: 'hoursestimatelog', 'daysactual', 'estimateonsameday',
      'breakdown'.
  """
  if project_breakdown is None:
    project_breakdown = breakdown

  ## data type conversion
  for date_col in ('EstimateOn', 'StartedOn', 'CompletedOn'):
    X[date_col] = pd.to_datetime(X[date_col], format='%d-%b-%y')

  ## feature engineering
  X['hoursestimatelog'] = np.log1p(X['HoursEstimate'])
  X['daysactual'] = (X['CompletedOn']-X['StartedOn']).dt.days.abs()
  X['estimateonsameday'] = X['EstimateOn']==X['StartedOn']
  ## Look the per-project value up by key instead of merging and assigning
  ## the merged column back: a merge result carries a fresh 0..n-1 index, and
  ## column assignment aligns on index labels, so that approach only gave
  ## correct values when X itself had a default RangeIndex.
  per_project = project_breakdown.set_index('ProjectCode')['ProjectBreakdownCode']
  X['breakdown'] = X['ProjectCode'].map(per_project)

  ## missing values
  X.replace([np.inf, -np.inf, pd.NaT], np.nan, inplace=True)
# wrangler() modifies the frame it is given, so alldata gains the engineered
# columns here.
wrangler(alldata)

# log1p-transformed copies of the actual and the estimated hours
hoursactual_log, hoursestimate_log = (
    np.log1p(alldata[hours_col]) for hours_col in ('HoursActual', 'HoursEstimate')
)

###############################################
# feature selection
###############################################
target = 'HoursActual'

# Columns that must not be used as model inputs.
fs = [
    target,
    # merge key between the two source tables
    'TaskNumber',
    # presumably free text (the original note read "test column") - TODO confirm
    'Summary',
    # person identifiers
    'RaisedByID', 'AssignedToID', 'AuthorisedByID', 'DeveloperID',
    # author's note: this is real-time information
    'StatusCode',
    # the engineered 'breakdown' column is derived from these two
    'ProjectCode', 'ProjectBreakdownCode',
    # author's note: highly correlated with 'HoursActual'
    'DeveloperHoursActual', 'TaskPerformance', 'DeveloperPerformance',
    # author's note: the model should not depend on dates
    'EstimateOn', 'StartedOn', 'CompletedOn',
    # engineered in wrangler() but left out of the model
    'daysactual', 'estimateonsameday',
    # the log-transformed 'hoursestimatelog' is kept instead
    'HoursEstimate',
]

# Start from every column and strike the excluded ones; list.remove raises
# ValueError if a name in fs is not present.
features = alldata.columns.to_list()
for f in fs:
  features.remove(f)
print("total features:", len(features), features)

###############################################
# Data Splitting
###############################################
# Hold out 10% of the rows as the test set. The split is seeded so that
# re-running the notebook reproduces the same partition and the same scores
# (5 is the same seed value the random forest is given further down).
trainval, test = train_test_split(alldata, test_size=0.1, random_state=5)
print("trainval size:", trainval.shape, "test size:", test.shape)
assert alldata.shape[0]==trainval.shape[0]+test.shape[0]

# Model inputs and targets; the *_log targets are log1p of HoursActual.
X_trainval = trainval[features]
y_trainval = trainval[target]
y_trainval_log = np.log1p(y_trainval)
X_test = test[features]
y_test = test[target]
y_test_log = np.log1p(y_test)

total features: 5 ['Priority', 'Category', 'SubCategory', 'hoursestimatelog', 'breakdown']
trainval size: (11069, 24) test size: (1230, 24)


In [21]:
###############################################
# feature categories
###############################################
# Split the selected columns by dtype: numeric vs. everything else.
numeric_features = (X_trainval[features]
                    .select_dtypes(include=np.number)
                    .columns.to_list())
categorical_features = (X_trainval[features]
                        .select_dtypes(exclude=np.number)
                        .columns.to_list())
print(len(numeric_features), 'numeric features:', numeric_features)
print(len(categorical_features), 'categorical features:', categorical_features)

# Bucket the categorical columns by their number of distinct non-null values:
#   >= 10 -> high cardinality (candidates for ordinal encoding)
#   2..9  -> low cardinality  (candidates for one-hot encoding)
# Columns with fewer than 2 distinct values land in neither list.
highcardi_features, lowcardi_features = [], []
for col in categorical_features:
  n_levels = X_trainval[col].nunique()
  if n_levels >= 10:
    highcardi_features.append(col)
  elif n_levels >= 2:
    lowcardi_features.append(col)
print(len(highcardi_features), 'high cardinality features:', highcardi_features)
print(len(lowcardi_features), 'low cardinality features:', lowcardi_features)

###############################################
# Pipeline preprocessor
###############################################
# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns, one-hot variant: fill gaps with the literal 'missing',
# then one-hot encode.
onehot_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', ce.OneHotEncoder(drop_invariant=True, use_cat_names=True)),
    ]
)

# Categorical columns, ordinal variant: same imputation, then ordinal encoding.
# This transformer is built but not registered in the ColumnTransformer below.
ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ordinal', ce.OrdinalEncoder()),
    ]
)

# Every categorical feature currently goes through the one-hot branch. The
# alternative layout would send lowcardi_features to onehot_transformer and
# highcardi_features to ordinal_transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('one', onehot_transformer, categorical_features),
    ]
)
###############################################
# Pipeline fitting
###############################################
random_state = 5
n_jobs = -1

# Preprocessing followed by a random forest regressor.
forest = RandomForestRegressor(
    n_estimators=29,
    max_depth=6,
    random_state=random_state,
    n_jobs=n_jobs,
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', forest),
])

# The model is trained on the log1p-transformed target.
pipeline.fit(X_trainval, y_trainval_log);

3 numeric features: ['Priority', 'hoursestimatelog', 'breakdown']
2 categorical features: ['Category', 'SubCategory']
1 high cardinality features: ['SubCategory']
1 low cardinality features: ['Category']


In [27]:
# Score the fitted pipeline on the log1p scale, i.e. the scale of the target
# it was trained on.
pred_trainval = pipeline.predict(X_trainval)
print("trainval data RMSE score: %.3f" % rmse(y_trainval_log, pred_trainval))

pred_test = pipeline.predict(X_test)
print("test data RMSE score: %.3f" % rmse(y_test_log, pred_test))

# Stack trainval and test rows (in that order) to score on everything.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# row-wise concatenation.
X_all = pd.concat([X_trainval, X_test])
y_all = list(y_trainval) + list(y_test)
y_all_log = np.log1p(y_all)
pred_all = pipeline.predict(X_all)
print("all data RMSE score: %.3f" % rmse(y_all_log, pred_all))

trainval data RMSE score: 0.640
test data RMSE score: 0.642
all data RMSE score: 0.640


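Reference: the scikit-learn [model persistence guide](https://scikit-learn.org/stable/modules/model_persistence.html), which covers saving and reloading fitted estimators.  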

In [28]:
###############################################
# Save Pipeline
###############################################
from joblib import dump, load

# Write the fitted pipeline to 'pipeline.joblib' (relative path). dump()
# returns the list of file names written, which is shown as the cell output.
dump(value=pipeline, filename='pipeline.joblib')

['pipeline.joblib']

In [34]:
###############################################
# Load Pipeline
###############################################
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files you created yourself or otherwise trust.
rf = load('pipeline.joblib') 

# Re-score with the reloaded pipeline; the numbers should match the scores of
# the in-memory pipeline it was saved from.
pred_trainval = rf.predict(X_trainval)
print("trainval data RMSE score: %.3f" % rmse(y_trainval_log, pred_trainval))

pred_test = rf.predict(X_test)
print("test data RMSE score: %.3f" % rmse(y_test_log, pred_test))

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# row-wise concatenation (trainval rows first, then test rows).
X_all = pd.concat([X_trainval, X_test])
y_all = list(y_trainval) + list(y_test)
y_all_log = np.log1p(y_all)
pred_all = rf.predict(X_all)
print("all data RMSE score: %.3f" % rmse(y_all_log, pred_all))

trainval data RMSE score: 0.640
test data RMSE score: 0.642
all data RMSE score: 0.640
