In [1]:
# Standard data-science helpers
import numpy as np
import pandas as pd
import scipy

# Plotly: graph objects plus offline rendering inside the notebook.
# NOTE(review): `plotly.plotly` only exists in plotly < 4 (it later moved to
# the separate `chart_studio` package) — confirm the pinned plotly version.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# Cufflinks: set the default theme and switch it to offline mode.
import cufflinks as cf
cf.set_config_file(world_readable=True, theme="pearl")
cf.go_offline(connected=True)

# Display options: cap how many rows/columns pandas renders for a frame.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [4]:
import pickle

# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# was produced by this project's own pipeline; never point it at untrusted data.
with open('out/test_summaries.pkl', 'rb') as fin:
    # Deserialize straight from the file handle instead of first reading the
    # whole file into an intermediate bytes object.
    tests = pickle.load(fin)

# Number of entries loaded.
len(tests)

2967

In [5]:
# Load the train and test metadata tables, both indexed by process_id.
train_meta, test_meta = (
    pd.read_csv(meta_path, index_col='process_id')
    for meta_path in ('out/train_meta.csv', 'out/test_meta.csv')
)

In [6]:
# Preview the first rows of the training metadata.
train_meta.head()

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20001,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse' 'a...
20002,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20003,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse' 'a...
20004,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse' 'a...
20005,1,0,0,1,1,10011,['acid']


In [7]:
# Preview the first rows of the test metadata.
test_meta.head()

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases,summary_length
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20000,1,1,1,1,1,11111,['pre_rinse' 'caustic'],320
20006,1,1,0,0,1,11001,['pre_rinse' 'caustic'],320
20007,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse'],480
20009,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse'],480
20010,1,1,1,1,1,11111,['pre_rinse' 'caustic' 'intermediate_rinse' 'a...,480


In [15]:
# One training table per sequence code; the variable `one_XXXX` holds the
# table read from `out/1XXXX.csv`. The first CSV column becomes the index.
one_0011, one_1001, one_1111 = (
    pd.read_csv(table_path, index_col=0)
    for table_path in ('out/10011.csv', 'out/11001.csv', 'out/11111.csv')
)

In [16]:
# Preview the first rows of the training table for sequence code 10011.
one_0011.head()

Unnamed: 0,acid-return_conductivity_count,acid-return_conductivity_mean,acid-return_conductivity_std,acid-return_conductivity_min,acid-return_conductivity_25%,acid-return_conductivity_50%,acid-return_conductivity_75%,acid-return_conductivity_max,acid-return_flow_count,acid-return_flow_mean,acid-return_flow_std,acid-return_flow_min,...,final_rinse-True_supply_pre_rinse,final_rinse-True_supply_caustic,final_rinse-True_return_caustic,final_rinse-True_supply_acid,final_rinse-True_return_acid,final_rinse-True_supply_clean_water,final_rinse-True_return_recovery_water,final_rinse-True_return_drain,final_rinse-True_object_low_level,final_rinse-True_tank_lsh_caustic,final_rinse-True_tank_lsh_clean_water,target
20005,254.0,33.776595,18.33952,0.297533,40.264283,43.683186,44.301761,50.747,254.0,28372.35871,13033.332673,-18.08449,...,0.0,0.0,0.0,0.0,71.0,99.0,34.0,15.0,70.0,47.0,0.0,413310.650375
20047,114.0,15.063782,23.159147,0.117014,0.278325,0.363938,45.62906,65.09259,114.0,19727.292101,16258.759467,-618.4896,...,0.0,0.0,0.0,0.0,81.0,95.0,0.0,41.0,57.0,120.0,0.0,326358.148965
20109,459.0,22.409426,21.183829,0.108656,0.519101,18.555134,44.532752,46.611557,459.0,12469.36666,10869.653439,-618.4896,...,0.0,0.0,0.0,0.0,168.0,225.0,104.0,0.0,76.0,0.0,0.0,215801.843439
20130,527.0,21.036468,21.14844,0.139566,0.513265,4.999386,44.347447,46.914692,527.0,10633.522741,10852.412703,-618.4896,...,0.0,0.0,0.0,0.0,167.0,224.0,96.0,0.0,49.0,0.0,0.0,170105.269992
20182,250.0,33.11055,19.319852,0.153325,2.41187,43.76483,44.59359,54.11838,250.0,26752.105149,12496.562448,-614.8727,...,0.0,0.0,0.0,0.0,67.0,108.0,63.0,0.0,67.0,0.0,0.0,383719.420658


In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
# Shared, module-level estimator objects: make_prediction() refits both of
# them on every call, so their fitted state reflects only the latest call.
# Features are rescaled to the [0, 1] range before the neighbour search.
scaler = MinMaxScaler(feature_range=(0, 1))
# K-nearest-neighbours regressor using 5 neighbours.
model = KNeighborsRegressor(n_neighbors=5)

In [20]:
def make_prediction(test_id):
    """Predict the target for one test process from its matching training table.

    The training table is chosen by the process's ``sequence`` code in
    ``test_meta``. Only the columns present in the test summary are used as
    features, and training rows with a missing value in any of those columns
    are excluded (together with their targets).

    Note: the module-level ``scaler`` and ``model`` are refitted on every
    call, so their fitted state always reflects the most recent ``test_id``.

    Parameters
    ----------
    test_id
        Index label in ``test_meta`` and key into ``tests``.

    Returns
    -------
    The result of ``model.predict`` on the scaled test features.

    Raises
    ------
    IndexError
        If ``test_id`` is not present in ``test_meta.index``.
    ValueError
        If the process's sequence code has no associated training table.
    """
    # The boolean mask tolerates repeated index labels; the first match wins.
    test_sequence = str(test_meta.loc[test_meta.index == test_id, 'sequence'].iloc[0])
    test = tests[test_id]

    # No defensive .copy() of the whole table: the column selection below
    # already produces a new frame and nothing here mutates `data`.
    if test_sequence == '10011':
        data = one_0011
    elif test_sequence == '11001':
        data = one_1001
    elif test_sequence == '11111':
        data = one_1111
    else:
        # Previously this fell through and failed later with an opaque
        # UnboundLocalError on `data`.
        raise ValueError(
            'No training table for sequence {!r} (process {!r})'.format(
                test_sequence, test_id
            )
        )

    # Restrict training features to exactly the columns the test summary has.
    train = data[test.columns]

    # One boolean mask selects both features and targets, so the two can never
    # get out of step (even if index labels repeat).
    complete_rows = train.notna().all(axis=1)
    train = train[complete_rows]
    targets = data.loc[complete_rows, 'target']

    # Fit the scaler on training features only, then apply it to the test row(s).
    X_train = scaler.fit_transform(train)
    X_test = scaler.transform(test)

    model.fit(X_train, targets)
    return model.predict(X_test)

In [None]:
# Try the predictor on the process at position 10 of test_meta's index.
make_prediction(test_meta.index[10])

[autoreload of IPython.core.ultratb failed: Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 244, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 376, in superreload
    module = reload(module)
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/usr/local/Cellar/python/3.6.5_1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/usr/local/lib/python3.6/site-packages/IPython/core/ultratb.py", line 128, in <modul