## Libraries

In [0]:
import os
import time
from google.colab import drive
import tensorflow as tf
import pickle
import math
import warnings
import pandas as pd
import numpy as np
import zipfile

In [0]:
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

### Mounting Drive

In [3]:
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [4]:
!ls 'gdrive/My Drive/DrivenData/Data'

final_max_values.csv	 test_max_values.csv	   train_labels.csv
final_mean_values.csv	 test_mean_values.csv	   train_meta_data.csv
final_median_values.csv  test_median_values.csv    train_univariate_data.pkl
final_std_values.csv	 test_meta_data.csv	   train_values.csv
mean_values.csv		 test_std_values.csv	   train_values.zip
recipe_metadata.csv	 test_univariate_data.pkl  univariate_data.pkl
submission_format.csv	 test_values.csv
test_lengths.csv	 test_values.zip


## Functions

In [0]:
def read_csv_from_zip(zip_path, data_file, is_tsv = False):

  archive = zipfile.ZipFile(zip_path, 'r')
  if( is_tsv ):
    data = pd.read_csv(archive.open(data_file),delimiter='\t')#,header=None)
  else:
    data = pd.read_csv(archive.open(data_file))#,header=None)
  if(commentary):  
      print("\n")
      print('\033[1m'+"Null values in Data: "+'\033[0m', data.isnull().values.any())
      print("\n")
      print('\033[1m'+"Raw_Data"+'\033[0m'+"\n")
      print(data.head()) 
  return data


def read_csv(file, is_tsv = False):
  if( is_tsv ):
    data = pd.read_csv(file,delimiter='\t')#,header=None)
  else:
    data = pd.read_csv(file)#,header=None)
  if(commentary):  
      print("\n")
      print('\033[1m'+"Null values in Data: "+'\033[0m', data.isnull().values.any())
      print("\n")
      print('\033[1m'+"Raw_Data"+'\033[0m'+"\n")
      print(data.head())
  return data


def data_preparation (zip_path,data_file, features_list, test_split = 0.5, random_state = 37):
  df = read_csv(zip_path+data_file)
  x = df.iloc[:,features_list[0]:features_list[1]]
  y = df.iloc[:,features_list[1]]
  if(commentary):
      print("\n")
      print('\033[1m'+"X - Features"+'\033[0m'+"\n")
      print(x.head())
      print("\n")
      print('\033[1m'+"Y - Target Variable"+'\033[0m'+"\n")
      print(y.head())
  return train_test_split(x, y, test_size = test_split, random_state=random_state)
  

In [0]:
def run_models(X_train, Y_train,Kfold_splits = 10, scoring  = 'r2', random_state = 37):
  
  pipelines = []
  pipelines.append(('\033[1m'+'ScaledLR'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
  pipelines.append(('\033[1m'+'ScaledLASSO'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso(tol = 10))])))
  pipelines.append(('\033[1m'+'ScaledEN'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
  pipelines.append(('\033[1m'+'ScaledKNN'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
  pipelines.append(('\033[1m'+'ScaledCART'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
  pipelines.append(('\033[1m'+'ScaledGBM'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))
  pipelines.append(('\033[1m'+'ScaledExtraTrees'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('ET', ExtraTreesRegressor())])))
  pipelines.append(('\033[1m'+'ScaledRandomForest'+'\033[0m', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestRegressor())])))
  
  results = []
  names = []
  print("\n"+'\033[1m'+"Scoring: "+'\033[0m'+scoring+"\n")
  for name, model in pipelines:
      kfold = KFold(n_splits= Kfold_splits, random_state= random_state)
      cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
      results.append(cv_results)
      names.append(name)
      msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
      print(msg)


In [0]:
def run(zip_path,data_file, features_list):
  X_train, X_test, Y_train, Y_test = data_preparation(zip_path,data_file, features_list)
  run_models(X_train,Y_train)
  

## Data Loading

In [0]:
zip_path  = "gdrive/My Drive/DrivenData/Data/"
data_file_1  = "mean_values.csv"
data_file_2  = "final_mean_values.csv"
data_file_3  = "final_median_values.csv"
data_file_4  = "final_max_values.csv"
data_file_5  = "final_std_values.csv"
data_file_6 = "train_meta_data.csv"
data_file_7 = "test_meta_data.csv"

features_list = [1,-1]
commentary = False

X_train, X_test, Y_train, Y_test = data_preparation(zip_path,data_file_6, features_list)

#run(zip_path,data_file_1, features_list)
#run(zip_path,data_file_2, features_list)
#run(zip_path,data_file_3, features_list)
run(zip_path,data_file_6, features_list)
#run(zip_path,data_file_5, features_list)


[1mScoring: [0mr2

[1mScaledLR[0m: 0.174367 (0.086748)
[1mScaledLASSO[0m: 0.158695 (0.103120)
[1mScaledEN[0m: 0.169318 (0.073826)
[1mScaledKNN[0m: 0.618737 (0.112801)
[1mScaledCART[0m: 0.341885 (0.314181)
[1mScaledGBM[0m: 0.523242 (0.243586)
[1mScaledExtraTrees[0m: 0.542287 (0.161582)
[1mScaledRandomForest[0m: 0.566834 (0.149802)


In [0]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error

model = ExtraTreesRegressor(random_state=38)
model.fit(X_train,Y_train)
print(model.score(X_train,Y_train))
#ytrue = Y_train.values
#ypred = model.predict(X_train)
ytrue = Y_test
ypred = np.round(model.predict(X_test))
print(mean_absolute_error(ytrue,ypred))
# cmp = np.full(ytrue.shape[0],290000,dtype=type(ytrue[0]))
# print(np.mean((np.abs(ytrue-ypred) / np.maximum(np.abs(ytrue),cmp))))

# model = RandomForestRegressor(random_state =37)
# model.fit(X_train,Y_train)
# print(model.score(X_train,Y_train))
# #ytrue = Y_train.values
# #ypred = model.predict(X_train)
# ytrue = Y_test
# ypred = model.predict(X_test)
# print(mean_absolute_error(ytrue,ypred))
# # cmp = np.full(ytrue.shape[0],290000,dtype=type(ytrue[0]))
# # print(np.mean((np.abs(ytrue-ypred) / np.maximum(np.abs(ytrue),cmp))))

0.9436350802796405
26.568697729988052


In [0]:
test_data = pd.read_csv("gdrive/My Drive/DrivenData/Data/test_meta_data.csv")
procs, x_data = test_data.values[:,0].reshape((-1,1)).astype(int), test_data.iloc[:,1:]
pred = np.round(model.predict(x_data)).reshape((-1,1)).astype(int)
p = pd.DataFrame(np.concatenate((procs,pred),axis=1),columns=['process_id','lengths'])
p.to_csv("gdrive/My Drive/DrivenData/Data/test_lengths.csv",sep=',',index=None)

In [0]:
compare = pd.DataFrame({'Prediction': ypred, 'Test Data' : Y_test})
compare.head(20)

Unnamed: 0,Prediction,Test Data
504,125.0,127
1179,74.0,71
2664,171.0,173
1843,124.0,150
2257,96.0,96
1411,127.0,128
4782,72.0,67
4924,111.0,143
803,96.0,102
949,305.0,306


In [0]:
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([50,100,200,300,400]))
model = RandomForestRegressor(random_state=21)
kfold = KFold(n_splits=10, random_state=21)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))  

-0.038913 (0.437407) with: {'n_estimators': 50}
-0.046943 (0.479795) with: {'n_estimators': 100}
-0.034958 (0.455838) with: {'n_estimators': 200}
-0.021989 (0.446097) with: {'n_estimators': 300}
-0.013676 (0.431160) with: {'n_estimators': 400}
Best: -0.013676 using {'n_estimators': 400}


## Test Data

In [0]:
zip_path  = "gdrive/My Drive/DrivenData/Data/"
test_data = "test_max_values.csv"
df = read_csv(zip_path+test_data)
df.head()



[1mNull values in Data: [0m False


[1mRaw_Data[0m

   process_id          0         1          2         3          4          5  \
0       20000  35069.445  2.577257  25.705296  4.514562  87.832756  38765.914   
1       20006  20039.426  0.593533  17.437067  0.437619   2.289497  24553.312   
2       20007  20191.334  0.402127  23.766638  4.504332   4.734520  24600.332   
3       20009  35177.953  2.726346  27.079716  6.152334  86.783860  48144.530   
4       20010  50057.867  0.490017  29.166666  3.064505  30.653212  48162.617   

     6    7    8 ...         106       107       108        109        110  \
0  1.0  1.0  1.0 ...    0.000000   0.00000   0.00000   0.000000   0.000000   
1  1.0  1.0  1.0 ...    0.000000   0.00000   0.00000   0.000000   0.000000   
2  1.0  1.0  1.0 ...    0.000000   0.00000   0.00000   0.000000   0.000000   
3  1.0  1.0  1.0 ...    0.000000   0.00000   0.00000   0.000000   0.000000   
4  1.0  1.0  1.0 ...   49.568146  31.86849  82.68952  72.764755  

Unnamed: 0,process_id,0,1,2,3,4,5,6,7,8,...,106,107,108,109,110,111,112,113,114,115
0,20000,35069.445,2.577257,25.705296,4.514562,87.832756,38765.914,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20006,20039.426,0.593533,17.437067,0.437619,2.289497,24553.312,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20007,20191.334,0.402127,23.766638,4.504332,4.73452,24600.332,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20009,35177.953,2.726346,27.079716,6.152334,86.78386,48144.53,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20010,50057.867,0.490017,29.166666,3.064505,30.653212,48162.617,1.0,1.0,1.0,...,49.568146,31.86849,82.68952,72.764755,46.265644,44.777725,0.0,0.0,0.0,0.0


In [0]:
Test = df.iloc[:,1:61]
Test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,35069.445,2.577257,25.705296,4.514562,87.832756,38765.914,1.0,1.0,1.0,0.0,...,83.01143,72.92028,46.73428,45.163597,0.0,0.0,0.0,0.0,0.0,0.0
1,20039.426,0.593533,17.437067,0.437619,2.289497,24553.312,1.0,1.0,1.0,0.0,...,82.89931,73.44112,46.812943,47.87146,0.0,0.0,0.0,0.0,0.0,0.0
2,20191.334,0.402127,23.766638,4.504332,4.73452,24600.332,1.0,1.0,1.0,0.0,...,83.27184,72.7322,45.813564,44.796196,0.0,0.0,0.0,0.0,20540.363,0.417535
3,35177.953,2.726346,27.079716,6.152334,86.78386,48144.53,1.0,1.0,1.0,0.0,...,83.15249,72.4754,46.639267,45.068607,1.0,0.0,0.0,0.0,103150.32,2.668186
4,50057.867,0.490017,29.166666,3.064505,30.653212,48162.617,1.0,1.0,1.0,0.0,...,83.203125,73.38686,47.82051,46.03322,0.0,0.0,0.0,0.0,45882.164,0.363064


In [0]:
scaler = StandardScaler().fit(X_train)
rescaled_X_train = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=21, n_estimators=400)
model.fit(rescaled_X_train, Y_train)

# transform the validation dataset
rescaled_X_test = scaler.transform(Test)
predictions = model.predict(rescaled_X_test)

In [0]:
len(predictions)

2967

In [0]:
Y_test.head()

504      0.000000
1179     0.000000
2664    82.208480
1843    81.025750
2257    77.969475
Name: 60, dtype: float64

In [0]:
pred = pd.DataFrame(predictions)
pred.columns = ['final_rinse_total_turbidity_liter']
pred['process_id'] = df['process_id'].astype(int)
columns=['process_id','final_rinse_total_turbidity_liter']
pred = pred.loc[:,['process_id','final_rinse_total_turbidity_liter']]#set_index('process_id')
pred.head()

Unnamed: 0,process_id,final_rinse_total_turbidity_liter
0,20000,-0.074921
1,20006,-0.059573
2,20007,73.038016
3,20009,78.619065
4,20010,82.017012


In [0]:
path = "gdrive/My Drive/DrivenData/"
pred.to_csv(os.path.join(path,"Submissions_GBM.csv"),sep=',',index=None)

In [0]:
param_grid = dict(n_estimators=np.array([50,100,200,300,400]))

def GridSearch(model, param_grid):
  
  scaler = StandardScaler().fit(X_train)
  rescaledX = scaler.transform(X_train)
  model = GradientBoostingRegressor(random_state=21)
  kfold = KFold(n_splits=10, random_state=21)
  grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=kfold)
  grid_result = grid.fit(rescaledX, Y_train)

  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
      print("%f (%f) with: %r" % (mean, stdev, param))

  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))