In [0]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow.keras.backend as K
from google.colab import drive
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [2]:
# Mount Google Drive inside the Colab VM (prompts for authorization).
MOUNT_POINT = "/content/gdrive"
drive.mount(MOUNT_POINT)

# Data directory, given relative to the current working directory.
path = "gdrive/My Drive/DrivenData/Data/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# gRPC address of the TPU worker, or None when COLAB_TPU_ADDR is not set.
_tpu_addr = os.environ.get('COLAB_TPU_ADDR')
TPU_WORKER = None if _tpu_addr is None else 'grpc://' + _tpu_addr

# Model Code

In [0]:
# Label table: one target value per process_id.
tl = pd.read_csv(os.path.join(path, "train_labels.csv"))

# Columns to read from the time-series value files.
features = [
    'process_id',
    'phase',
    'return_turbidity',
    'return_flow',
    'target_time_period',
]

In [0]:
class generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding zero-padded, per-process batches.

    One sample is the set of rows of ``data`` that belong to a single
    ``process_id``, with the first and last columns removed and the row axis
    zero-padded at the end up to ``max_steps``.

    Args:
        data: DataFrame containing a ``process_id`` column. ``padding`` drops
            the first and the last column, so the caller is responsible for
            the column order (NOTE(review): presumably ``process_id`` is the
            first column -- confirm which column ends up last at each call
            site).
        p_set: ordered sequence of process ids to iterate over.
        labels: DataFrame with ``process_id`` and
            ``final_rinse_total_turbidity_liter`` columns. May be ``None`` or
            empty, in which case batches carry features only and ``y`` is an
            empty array (inference use).
        batch_size: number of process ids per batch.
        max_steps: number of rows every sample is padded to.
    """

    LABEL_COLUMN = 'final_rinse_total_turbidity_liter'

    def __init__(self, data, p_set, labels, batch_size, max_steps):
        self.data = data
        self.labels = labels
        self.p_set = p_set
        self.batch_size = batch_size
        self.max_steps = max_steps
        # Without any label rows the generator can only serve features.
        self.has_labels = labels is not None and len(labels) > 0

    def __len__(self):
        """Number of batches needed to cover ``p_set``."""
        return int(np.ceil(len(self.p_set)/self.batch_size))

    def padding(self,f):
        """Drop the first/last columns of ``f`` and zero-pad rows to ``max_steps``.

        Args:
            f: 2-D array of the rows of one process.

        Returns:
            float32 array of shape ``(max_steps, f.shape[1] - 2)``.

        Raises:
            ValueError: if ``f`` has more than ``max_steps`` rows, or holds
                values that cannot be converted to float.
        """
        n_missing = self.max_steps - f.shape[0]
        if n_missing < 0:
            raise ValueError(
                'sequence of %d rows exceeds max_steps=%d' % (f.shape[0], self.max_steps))
        # DataFrame.values on mixed dtypes is an object array; Keras needs a
        # numeric array, so convert explicitly.
        body = np.asarray(f[:,1:-1], dtype=np.float32)
        pad = np.zeros((n_missing, body.shape[1]), dtype=np.float32)
        return np.concatenate((body,pad),axis=0)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for batch ``idx``.

        Processes with more than ``max_steps`` rows are skipped. When labels
        are available, processes without a label row are skipped as well, so
        ``x`` and ``y`` always stay aligned.
        """
        start = idx*self.batch_size
        x,y = [],[]
        for p in self.p_set[start:start+self.batch_size]:
            rows = self.data[self.data['process_id']==p].values
            if rows.shape[0] > self.max_steps:
                continue
            sample = self.padding(rows)
            if self.has_labels:
                target = self.labels.loc[self.labels['process_id']==p, self.LABEL_COLUMN].values
                if target.size == 0:
                    continue
                y.append(target[0])
            x.append(sample)
        return np.array(x), np.array(y)

In [0]:
# Prefix of the weight files written by the training loop.
model_name = 'lim_rec_model_'


def make_model(inp_shape):
    """Build the stacked-LSTM regressor and print its summary.

    Args:
        inp_shape: shape of a single input sample, forwarded to
            ``Input(shape=...)``.

    Returns:
        The uncompiled ``tf.keras`` model: LSTM(32, sequences) -> LSTM(16) ->
        Dense(32, relu) -> Dense(8, relu) -> Dense(1).
    """
    layers = tf.keras.layers
    inp = layers.Input(shape=inp_shape)

    # Disabled convolutional front-end, kept for reference:
    #     x = layers.Conv1D(64,3,2)(inp)
    #     x = layers.MaxPool1D(2)(x)
    x = layers.LSTM(32, return_sequences=True)(inp)
    x = layers.LSTM(16)(x)

    for units in (32, 8):
        x = layers.Dense(units, activation='relu')(x)
    out = layers.Dense(1)(x)

    model = tf.keras.models.Model(inputs=[inp], outputs=[out])
    model.summary()
    return model

def ape(ytrue, ypred):
    """Element-wise absolute percentage error with a floored denominator.

    Computes ``|ytrue - ypred| / max(|ytrue|, 290000)``.

    Args:
        ytrue: tensor of target values.
        ypred: tensor of predictions, broadcastable against ``ytrue``.

    Returns:
        Tensor of per-element errors.
    """
    # A scalar constant in the targets' own dtype broadcasts against any batch
    # size; a fixed-length array would only fit batches of exactly that size.
    floor = K.constant(290000, dtype=K.dtype(ytrue))
    return K.abs(ytrue-ypred) / K.maximum(K.abs(ytrue),floor)


def mape(ytrue, ypred):
    """Mean of ``ape`` over all elements (scalar tensor)."""
    return K.mean(ape(ytrue, ypred))

# Training

In [0]:
# Load the selected columns and keep only rows outside the target time period.
tv = pd.read_csv(os.path.join(path,"train_values.csv"),usecols=features)
# .copy() detaches the filtered frame from the original, so the column added
# below does not raise pandas' SettingWithCopyWarning.
tv = tv[tv['target_time_period']==False].copy()
# Row-wise product of flow and turbidity.
tv['final_turbidity'] = tv['return_flow']*tv['return_turbidity']

In [0]:
# Columns to integer-encode. Further candidates, currently disabled:
#   'supply_pump','supply_pre_rinse','supply_caustic','return_caustic',
#   'supply_acid','return_acid','supply_clean_water','return_recovery_water',
#   'return_drain','object_low_level','tank_lsh_caustic','tank_lsh_clean_water',
#   'target_time_period'
cols_enc = ['phase']

# Columns to rescale. Full candidate list, currently disabled:
#   'supply_flow','supply_pressure','return_temperature','return_conductivity',
#   'return_turbidity','return_flow','tank_level_caustic','tank_level_acid',
#   'tank_level_clean_water','tank_temperature_pre_rinse',
#   'tank_temperature_caustic','tank_temperature_acid',
#   'tank_concentration_caustic','tank_concentration_acid'
cols_norm = ['return_turbidity', 'return_flow', 'final_turbidity']

le = LabelEncoder()
for column in cols_enc:
    tv[column] = le.fit_transform(tv[column])

# Min-max scale every numeric column onto [0, 10].
col_min = tv[cols_norm].min()
col_range = tv[cols_norm].max() - col_min
tv[cols_norm] = (tv[cols_norm] - col_min) * (10) / col_range

In [0]:
# Bucket process ids by their number of rows: p1 holds processes with at most
# 1000 rows, p2 those with 1001-2000 rows, ... p7 those with 6001-7000 rows.
p_set = tv['process_id'].unique()

# Count rows per process in a single pass instead of scanning the whole frame
# once per process id.
rows_per_process = tv['process_id'].value_counts()
plens = [(i, rows_per_process.loc[i]) for i in p_set]

BIN_WIDTH = 1000
N_BINS = 7
length_bins = [[] for _ in range(N_BINS)]
for i, l in plens:
    bin_index = max(l - 1, 0) // BIN_WIDTH
    # Processes longer than N_BINS * BIN_WIDTH rows are left out of every bin.
    if bin_index < N_BINS:
        length_bins[bin_index].append(i)

p1,p2,p3,p4,p5,p6,p7 = length_bins

In [0]:
# Per-bin training configuration, keyed 1..7. Bin k uses the id list pk, a
# batch size of 8 and sequences padded to 1000 * k steps.
params = {
    bin_id: {
        'pset' : pset,
        'batch_size' : 8,
        'max_steps' : 1000 * bin_id,
    }
    for bin_id, pset in enumerate((p1, p2, p3, p4, p5, p6, p7), start=1)
}

In [0]:
def data_and_model(values, labels, bin_id, params, mname):
    """Train, evaluate and save the sequence model of one length bin.

    The bin's process ids are split in their given order: the first 70% are
    used for training, the rest for evaluation.

    Args:
        values: DataFrame of time-series rows with a ``process_id`` column.
        labels: DataFrame of targets with a ``process_id`` column.
        bin_id: key into ``params`` selecting the bin.
        params: mapping ``bin_id -> {'pset', 'batch_size', 'max_steps'}``.
        mname: weight-file prefix; weights go to ``./<mname><bin_id>.h5``.

    Returns:
        None. Prints the evaluation result and writes the weight file. Bins
        too small to yield both a training and an evaluation set are skipped.
    """
    cfg = params[bin_id]
    pset = cfg['pset']
    split = int(0.7*len(pset))
    train_pset, test_pset = pset[:split], pset[split:]
    if len(train_pset) == 0 or len(test_pset) == 0:
        print('bin %s: not enough processes to train/evaluate, skipping' % bin_id)
        return

    train_data = values[values['process_id'].isin(train_pset)]
    train_labels = labels[labels['process_id'].isin(train_pset)]
    test_data = values[values['process_id'].isin(test_pset)]
    test_labels = labels[labels['process_id'].isin(test_pset)]

    train_gen = generator(train_data,train_pset,train_labels,cfg['batch_size'],cfg['max_steps'])
    test_gen = generator(test_data,test_pset,test_labels,cfg['batch_size'],cfg['max_steps'])

    use_tpu = TPU_WORKER is not None
    if use_tpu:
        # The session must be reset BEFORE the model is built: clearing it
        # afterwards throws away the graph the freshly built model lives in.
        tf.keras.backend.clear_session()
    # generator.padding drops two columns, hence the "- 2".
    keras_model = make_model((cfg['max_steps'],train_data.shape[1]-2))
    if use_tpu:
        model = tf.contrib.tpu.keras_to_tpu_model(
            keras_model,
            strategy=tf.contrib.tpu.TPUDistributionStrategy(
                tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))
    else:
        model = keras_model

    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),loss=ape,metrics=['mae',mape])
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss',min_delta=0.05,patience=5,mode='min')
    model.fit_generator(train_gen,epochs=50,callbacks=[early_stop])
    print(model.evaluate_generator(test_gen))
    model.save_weights('./'+mname+str(bin_id)+'.h5', overwrite=True)

In [0]:
# Train and save one model per length bin (bins 1..7).
for bin_id in range(1, 8):
    data_and_model(tv, tl, bin_id, params, model_name)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000, 4)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 1000, 32)          4736      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 8,689
Trainable params: 8,689
Non-trainable params: 0
_________________________________________________________________
INFO:ten

ValueError: ignored

In [0]:
# Copy every .h5 file in the working directory to the Drive models folder.
! cp *.h5 gdrive/My\ Drive/DrivenData/Models/

# Testing

In [0]:
def complete_submission_format(tl,features,mname,sname):
    """Predict the test set with the saved per-bin models and write a submission.

    Args:
        tl: labels DataFrame passed on to ``generator`` (NOTE(review): test
            processes presumably have no rows in it -- confirm).
        features: columns to read from ``test_values.csv``.
        mname: weight-file prefix; bin ``k`` is loaded from ``./<mname>_<k>.h5``.
        sname: submission file name without the ``.csv`` extension.

    Returns:
        DataFrame with columns ``process_id`` and
        ``final_rinse_total_turbidity_liter``, sorted by ``process_id``. The
        same frame is written to the Drive submissions folder.

    NOTE(review): the preprocessing here (z-score * 10, encoders fitted on the
    test data, no ``final_turbidity`` column, no ``target_time_period`` filter)
    differs from the training cells; weights trained there will only fit if
    both pipelines are aligned -- confirm before relying on the output.
    """
    bin_width, n_bins, batch_size = 1000, 7, 10

    # open and normalize data
    ttv = pd.read_csv(os.path.join(path,"test_values.csv"),usecols=features)
    cols_enc = ['phase','supply_pump','supply_pre_rinse','supply_caustic','return_caustic','supply_acid','return_acid','supply_clean_water','return_recovery_water','return_drain','object_low_level','tank_lsh_caustic','tank_lsh_clean_water','target_time_period']
    cols_norm = ['supply_flow', 'supply_pressure', 'return_temperature', 'return_conductivity', 'return_turbidity', 'return_flow', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water', 'tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid', 'tank_concentration_caustic', 'tank_concentration_acid']
    # `features` may select only a subset of the columns listed above; touching
    # a column that was not loaded would raise a KeyError.
    for c in cols_enc:
        if c in ttv.columns:
            ttv[c] = LabelEncoder().fit_transform(ttv[c])
    norm_present = [c for c in cols_norm if c in ttv.columns]
    if norm_present:
        mean = ttv[norm_present].mean()
        # A constant column has std 0; dividing by 1 instead keeps it at 0
        # rather than turning it into NaN.
        std = ttv[norm_present].std().replace(0, 1)
        ttv[norm_present] = (ttv[norm_present]-mean)*(10)/std

    # Bucket process ids by row count: bin k holds processes with at most
    # bin_width * k rows (and more than bin_width * (k - 1)).
    rows_per_process = ttv['process_id'].value_counts()
    bins = {k: [] for k in range(1, n_bins+1)}
    n_dropped = 0
    for pid in ttv['process_id'].unique():
        k = max(rows_per_process.loc[pid]-1, 0)//bin_width + 1
        if k in bins:
            bins[k].append(pid)
        else:
            n_dropped += 1
    if n_dropped:
        warnings.warn('%d process(es) exceed %d rows and get no prediction'
                      % (n_dropped, bin_width*n_bins))

    params2 = {
        k: {'pset' : pset, 'batch_size' : batch_size, 'max_steps' : bin_width*k}
        for k, pset in bins.items()
    }

    def data_and_model2(values, labels, bin_id, params, mname):
        """Return an ``(n, 1)`` array of predictions for the bin's process ids."""
        cfg = params[bin_id]
        pset = cfg['pset']

        test_data = values[values['process_id'].isin(pset)]
        test_labels = labels[labels['process_id'].isin(pset)]
        test_gen = generator(test_data,pset,test_labels,cfg['batch_size'],cfg['max_steps'])

        use_tpu = TPU_WORKER is not None
        if use_tpu:
            # Reset the session before building the model, not after, so the
            # model's graph is not discarded.
            tf.keras.backend.clear_session()
        # generator.padding drops two columns (first and last), so the model
        # input width is shape[1] - 2.
        keras_model = make_model((cfg['max_steps'],test_data.shape[1]-2))
        if use_tpu:
            model = tf.contrib.tpu.keras_to_tpu_model(
                keras_model,
                strategy=tf.contrib.tpu.TPUDistributionStrategy(
                    tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))
        else:
            model = keras_model
        model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.01),loss=ape,metrics=['mae',mape])
        model.load_weights('./'+mname+'_'+str(bin_id)+'.h5')
        preds = model.predict_generator(test_gen)
        return np.reshape(preds,(-1,1))

    def get_submission():
        """Predict every non-empty bin, assemble, sort and save the submission."""
        p = []
        for k in sorted(params2):
            ids = params2[k]['pset']
            if len(ids) == 0:
                # Nothing to predict for this bin; an empty generator would fail.
                continue
            pset = np.reshape(ids,(-1,1))
            pred = data_and_model2(ttv,tl,k,params2,mname)
            if len(pred) != len(pset):
                raise ValueError('bin %s: got %d predictions for %d processes'
                                 % (k, len(pred), len(pset)))
            p.extend(np.concatenate((pset,pred),axis=1).tolist())
        sub = pd.DataFrame(p,columns=['process_id','final_rinse_total_turbidity_liter'])
        sub = sub.sort_values(by=['process_id']).reset_index(drop=True)
        sub['process_id'] = sub['process_id'].astype(int)
        sub.to_csv("gdrive/My Drive/DrivenData/Submissions/"+sname+".csv",sep=',',index=False)
        return sub

    return get_submission()

In [0]:
# Predict the test set with the 'cr_model' weights and save it as 'sub2'.
sub = complete_submission_format(
    tl=tl,
    features=features,
    mname='cr_model',
    sname='sub2',
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 1000, 31)          0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 499, 64)           6016      
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 249, 64)           0         
_________________________________________________________________
gru_16 (GRU)                 (None, 16)                3888      
_________________________________________________________________
dense_48 (Dense)             (None, 32)                544       
_________________________________________________________________
dense_49 (Dense)             (None, 8)                 264       
_________________________________________________________________
dense_50 (Dense)             (None, 1)                 9         
Total para

# NN: dense network on `train_meta_data.csv`


In [0]:
# Read the metadata table from the shared data directory (`path`) instead of
# repeating the directory literal.
data = pd.read_csv(os.path.join(path, "train_meta_data.csv"))

In [0]:
# First 70% of the rows (in file order) train the model, the rest test it.
# Column 0 is dropped, the last column is the target, the rest are features.
table = data.values
split = int(0.7 * data.shape[0])
x_all, y_all = table[:, 1:-1], table[:, -1]
x_train, x_test = x_all[:split], x_all[split:]
y_train, y_test = y_all[:split], y_all[split:]

In [0]:
def model_gen(shape):
    """Build a fully connected regressor.

    Args:
        shape: shape of a single input sample, forwarded to ``Input``.

    Returns:
        Uncompiled ``tf.keras`` model: three Dense(128, relu) layers followed
        by a single Dense(1, relu) output unit.
    """
    dense = tf.keras.layers.Dense
    inp = tf.keras.layers.Input(shape)

    x = inp
    for _ in range(3):
        x = dense(128, activation='relu')(x)
    out = dense(1, activation='relu')(x)

    return tf.keras.models.Model(inputs=[inp], outputs=[out])

In [0]:
# Build, train, evaluate and save the dense model.
model = model_gen((x_train.shape[1],))
model.summary()
model.compile(
    optimizer=tf.keras.optimizers.Adam(lr=0.0001),
    loss='mse',
    metrics=['mae'],
)
# Early stopping is currently disabled:
#   callbacks=[tf.keras.callbacks.EarlyStopping('loss',0.05,5,mode='min')]
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=1500,
    validation_data=(x_test, y_test),
)
print(model.evaluate(x_test, y_test))
model.save_weights('./model.h5', overwrite=True)

In [10]:
# Mean absolute error of the dense model on the held-out rows.
# `mean_absolute_error` is imported in the notebook's import cell.
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test,y_pred))

34.64501972869286
