# Introduction: First Pass at the Rinse Over Run Competition

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../..')

# Options for pandas
pd.options.display.max_columns = 30
pd.options.display.max_rows = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# Bind cufflinks to the alias `cf`; the two configuration calls below use that name.
import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='pearl')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Recipe metadata, indexed by process id.
meta = pd.read_csv('input/recipe_metadata.csv', index_col='process_id')
# Concatenate every column value of a row into a single string (e.g. '11001').
meta['sequence'] = (
    meta
    .apply(lambda flags: ''.join(map(str, flags.values)), axis=1)
    .astype(str)
)

In [3]:
# Unique process ids present in each raw values file (only that column is read).
train_ids = list(pd.read_csv('input/train_values.csv', usecols=['process_id'])['process_id'].unique())
test_ids = list(pd.read_csv('input/test_values.csv', usecols=['process_id'])['process_id'].unique())

# Take explicit copies: later cells add columns to these frames, and assigning
# into a bare boolean-mask selection of `meta` triggers SettingWithCopyWarning.
train_meta = meta[meta.index.isin(train_ids)].copy()
test_meta = meta[meta.index.isin(test_ids)].copy()

train_meta.shape
test_meta.shape

(5021, 6)

(2967, 6)

In [4]:
# Preview the first rows of the training metadata.
train_meta.head()

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20001,1,1,1,1,1,11111
20002,1,1,0,0,1,11001
20003,1,1,1,1,1,11111
20004,1,1,1,1,1,11111
20005,1,0,0,1,1,10011


In [5]:
# Frequency of each recipe sequence string in train, then in test.
train_meta['sequence'].value_counts()
test_meta['sequence'].value_counts()

11111    3771
11001    1057
10011     193
Name: sequence, dtype: int64

11111    2258
11001     587
10011     122
Name: sequence, dtype: int64

In [6]:
# Only the process id and the phase label of each reading are needed here.
train_phases = (
    pd.read_csv('input/train_values.csv', usecols=['process_id', 'phase'], index_col='process_id')
    .query('phase != "final_rinse"')  # drop the final_rinse rows from the training readings
)
test_phases = pd.read_csv('input/test_values.csv', usecols=['process_id', 'phase'], index_col='process_id')

# For every process: the array of distinct phase labels found in its readings.
train_process_phases = train_phases.groupby(level='process_id')['phase'].unique()
test_process_phases = test_phases.groupby(level='process_id')['phase'].unique()


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [7]:
# Store each process's phase array as its string form so it can be counted
# and compared like an ordinary string column.
train_meta['phases'] = train_process_phases.astype(str)
test_meta['phases'] = test_process_phases.astype(str)

# Observed phase combinations: test first, then train.
test_meta['phases'].value_counts()
train_meta['phases'].value_counts()

['pre_rinse' 'caustic']                                1182
['pre_rinse' 'caustic' 'intermediate_rinse' 'acid']     671
['pre_rinse' 'caustic' 'intermediate_rinse']            670
['pre_rinse']                                           292
['acid']                                                122
['caustic']                                              23
['caustic' 'intermediate_rinse' 'acid']                   5
['caustic' 'intermediate_rinse']                          2
Name: phases, dtype: int64

['pre_rinse' 'caustic' 'intermediate_rinse' 'acid']    3726
['pre_rinse' 'caustic']                                1017
['acid']                                                199
['caustic']                                              38
['caustic' 'intermediate_rinse' 'acid']                  22
['pre_rinse']                                             3
Name: phases, dtype: int64

In [8]:
# Use the second process listed in the test metadata as a worked example.
tpid = test_meta.index[1]
test_meta.loc[test_meta.index == tpid]

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20006,1,1,0,0,1,11001,['pre_rinse' 'caustic']


In [9]:
# Training processes with the 11001 recipe whose observed phases are pre_rinse + caustic.
train_meta.loc[
    (train_meta['phases'] == "['pre_rinse' 'caustic']")
    & (train_meta['sequence'] == '11001')
].head()

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20002,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20018,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20043,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20053,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20076,1,1,0,0,1,11001,['pre_rinse' 'caustic']


In [10]:
# Read only the first 10,000 rows of the test readings for quick exploration,
# then keep the rows belonging to the example process `tpid`.
test_values = pd.read_csv('input/test_values.csv', nrows=10000, index_col='process_id')
test_process = test_values[test_values.index == tpid]

In [11]:
# Id of the training process examined in the cells below.
pid = 20002

In [12]:
# Read only the first 10,000 rows of the training readings, then keep the rows
# belonging to `pid`. NOTE(review): `pid` must occur within those first rows.
train_values = pd.read_csv('input/train_values.csv', nrows=10000, index_col='process_id')
train_process = train_values[train_values.index == pid]

In [11]:
def summarize_process(pid, set_='test'):
    if set_ == 'test':
        values = test_values.copy()
    else:
        values = train_values.copy()
        
    process = values[values.index == pid].copy()
    summary = process.drop(columns=['row_id', 'object_id', 'timestamp']).groupby('phase').describe()
    summary.columns = summary.columns.map('_'.join)

    r = []
    for phase, row in summary.iterrows():
        row.index = [f'{phase}-{i}' for i in row.index]
        r.append(row)

    summary = pd.concat(r).to_frame().transpose()
    
    r = []
    for phase, phase_bools in process.select_dtypes(bool).drop(columns=['target_time_period']).groupby(process['phase']):
        phase_bool_counts = phase_bools.apply(lambda x: x.value_counts(), axis=0)
        for truth_value, row in phase_bool_counts.iterrows():
            row.index = [f'{phase}-{truth_value}_{i}' for i in row.index]
            r.append(row)

    bool_summary = pd.concat(r).to_frame().transpose().fillna(0)
    complete_summary = pd.concat([summary, bool_summary], axis=1)
    complete_summary.index = [pid]
    complete_summary.index.name = 'process_id'
    return complete_summary

In [12]:
# Load the complete test readings (no row limit), indexed by process id.
test_values = pd.read_csv('input/test_values.csv', index_col='process_id')

In [15]:
# One-row summary of the example test process.
tpid_summary = summarize_process(tpid, set_='test')

In [16]:
# One-row summary of the example training process.
pid_summary = summarize_process(pid, set_ = 'train')

In [17]:
# Metadata row of the example test process.
tpid_meta = test_meta[test_meta.index == tpid]
tpid_meta

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20006,1,1,0,0,1,11001,['pre_rinse' 'caustic']


In [18]:
# Metadata row of the example training process.
pid_meta = train_meta[train_meta.index == pid]
pid_meta

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20002,1,1,0,0,1,11001,['pre_rinse' 'caustic']


In [20]:
# Restrict the training summary to the columns present in the test summary.
# Raises KeyError if the training summary lacks any of them.
pid_subset = pid_summary[tpid_summary.columns]
pid_subset

Unnamed: 0_level_0,caustic-return_conductivity_count,caustic-return_conductivity_mean,caustic-return_conductivity_std,caustic-return_conductivity_min,caustic-return_conductivity_25%,caustic-return_conductivity_50%,caustic-return_conductivity_75%,caustic-return_conductivity_max,caustic-return_flow_count,caustic-return_flow_mean,caustic-return_flow_std,caustic-return_flow_min,caustic-return_flow_25%,caustic-return_flow_50%,caustic-return_flow_75%,...,pre_rinse-False_object_low_level,pre_rinse-False_tank_lsh_caustic,pre_rinse-False_tank_lsh_clean_water,pre_rinse-True_supply_pump,pre_rinse-True_supply_pre_rinse,pre_rinse-True_supply_caustic,pre_rinse-True_return_caustic,pre_rinse-True_supply_acid,pre_rinse-True_return_acid,pre_rinse-True_supply_clean_water,pre_rinse-True_return_recovery_water,pre_rinse-True_return_drain,pre_rinse-True_object_low_level,pre_rinse-True_tank_lsh_caustic,pre_rinse-True_tank_lsh_clean_water
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
20002,421.0,32.642272,16.643543,0.172301,39.999832,41.032696,41.64694,46.42518,421.0,28224.546263,9780.208628,1189.9595,30779.803,31289.785,31749.133,...,7.0,25.0,25.0,25.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,18.0,0.0,0.0


In [21]:
# Training processes whose recipe sequence equals that of the example test process.
train_to_consider = train_meta[train_meta['sequence'] == tpid_meta['sequence'].iloc[0]]
train_to_consider

Unnamed: 0_level_0,pre_rinse,caustic,intermediate_rinse,acid,final_rinse,sequence,phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20002,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20018,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20043,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20053,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20076,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20088,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20100,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20111,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20113,1,1,0,0,1,11001,['pre_rinse' 'caustic']
20123,1,1,0,0,1,11001,['pre_rinse' 'caustic']


In [22]:
def find_closest(tpid):
    """Collect the training summaries that are comparable to one test process.

    Relies on the module-level ``test_summaries`` and ``train_summaries``
    mappings (process id -> summary frame) and on ``test_meta`` / ``train_meta``.
    No distance is computed here; the function only gathers the candidates.

    Parameters
    ----------
    tpid
        Id of the test process.

    Returns
    -------
    list of pandas.DataFrame
        For every training process whose ``sequence`` equals that of ``tpid``,
        its summary restricted to the columns of the test summary. Training
        processes whose summary lacks any of those columns are skipped.
    """
    tpid_data = test_summaries[tpid]
    tpid_meta = test_meta[test_meta.index == tpid]
    # Compare against the scalar sequence value: comparing the training column
    # with a differently indexed Series raises instead of filtering.
    tpid_sequence = tpid_meta['sequence'].iloc[0]
    train_ids_to_consider = list(train_meta[train_meta['sequence'] == tpid_sequence].index)

    train_data = []
    for ids in train_ids_to_consider:
        train_data_candidate = train_summaries[ids]
        try:
            train_data.append(train_data_candidate[tpid_data.columns])
        except KeyError:
            # The candidate is missing at least one of the test columns, so it
            # cannot be compared. Any other error is left to propagate.
            continue
    return train_data

In [14]:
def summarize_process(pid, set_='test'):
    """Summarize all readings of one process as a single-row DataFrame.

    The readings are taken from the module-level ``test_values`` frame when
    ``set_ == 'test'`` and from ``train_values`` otherwise; rows are selected
    by comparing the frame's index with ``pid``.

    Parameters
    ----------
    pid
        Process id whose rows are summarized.
    set_ : str, default 'test'
        ``'test'`` reads ``test_values``; any other value reads ``train_values``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``pid`` holding ``{phase}-{column}_{statistic}``
        columns from ``describe()`` per phase, followed by
        ``{phase}-{truth_value}_{column}`` value counts of the boolean columns
        per phase (missing counts filled with 0).
    """
    # Pick the source frame without duplicating it: copying the whole frame on
    # every call is expensive, and only this process's rows are needed.
    values = test_values if set_ == 'test' else train_values
    process = values[values.index == pid].copy()

    # Per-phase descriptive statistics; the two column levels are joined into
    # '<column>_<statistic>'.
    summary = process.drop(columns=['row_id', 'object_id', 'timestamp']).groupby('phase').describe()
    summary.columns = summary.columns.map('_'.join)

    # Flatten the per-phase rows into one wide row, prefixing labels with the phase.
    stat_rows = []
    for phase, row in summary.iterrows():
        row.index = [f'{phase}-{i}' for i in row.index]
        stat_rows.append(row)

    summary = pd.concat(stat_rows).to_frame().transpose()
    summary.index = [pid]

    # Count the True/False occurrences of each boolean column within each phase.
    bool_rows = []
    for phase, phase_bools in process.select_dtypes(bool).drop(columns=['target_time_period']).groupby(process['phase']):
        phase_bool_counts = phase_bools.apply(lambda x: x.value_counts(), axis=0)
        for truth_value, row in phase_bool_counts.iterrows():
            row.index = [f'{phase}-{truth_value}_{i}' for i in row.index]
            bool_rows.append(row)

    bool_summary = pd.concat(bool_rows).to_frame().transpose().fillna(0)
    # Both halves carry the same single index label so they line up side by side.
    bool_summary.index = [pid]
    complete_summary = pd.concat([summary, bool_summary], axis=1)
    return complete_summary

In [None]:
from tqdm import tqdm_notebook
from multiprocessing.pool import ThreadPool as Pool
from functools import partial

# Summarize every test process on a pool of 4 worker threads.
partial_func = partial(summarize_process, set_='test')

test_summaries = {}

# The context manager shuts the pool down even if a worker raises. Results
# arrive in completion order, so each one is keyed by its own process id.
with Pool(processes=4) as pool:
    for r in tqdm_notebook(pool.imap_unordered(partial_func, test_ids),
                           total=len(test_ids)):
        test_summaries[r.index[0]] = r

HBox(children=(IntProgress(value=0, max=2967), HTML(value='')))

In [None]:
import pickle 

# Persist the test summaries to out/test_summaries.pkl.
with open('out/test_summaries.pkl', 'wb') as fout:
    pickle.dump(test_summaries, fout)

In [63]:
# Reload the test summaries from out/test_summaries.pkl.
with open('out/test_summaries.pkl', 'rb') as fin:
    test_summaries = pickle.loads(fin.read())

In [54]:
# Look each summary up by process id: the dict is filled in completion order by
# imap_unordered, so its value order does not follow test_meta's row order.
test_meta['summary_length'] = [len(test_summaries[process_id].columns) for process_id in test_meta.index]
test_meta['summary_length'].value_counts()

320    1183
480     675
640     671
160     437
296       1
Name: summary_length, dtype: int64

In [55]:
import sys
# NOTE(review): sys.getsizeof on a dict measures only the dict object itself,
# not the DataFrames it references, so this understates the real footprint.
sys.getsizeof(test_summaries) / 1e9

0.00014756

In [56]:
# Reported size of the full test readings frame, in bytes / 1e9.
sys.getsizeof(test_values) / 1e9

0.700860068

In [57]:
# Free only the large readings frame. `test_summaries` is still read by
# find_closest further down, so it must stay defined.
del test_values

In [58]:
# Load the complete training readings (no row limit), indexed by process id.
train_values = pd.read_csv('input/train_values.csv', index_col='process_id')

In [59]:
# Summarize every training process on a pool of 4 worker threads.
partial_func = partial(summarize_process, set_='train')

train_summaries = {}

# The context manager shuts the pool down even if a worker raises. Results
# arrive in completion order, so each one is keyed by its own process id.
with Pool(processes=4) as pool:
    for r in tqdm_notebook(pool.imap_unordered(partial_func, train_ids),
                           total=len(train_ids)):
        train_summaries[r.index[0]] = r

HBox(children=(IntProgress(value=0, max=5021), HTML(value='')))




In [60]:
# Persist the training summaries to out/train_summaries.pkl.
with open('out/train_summaries.pkl', 'wb') as fout:
    pickle.dump(train_summaries, fout)

In [65]:
def find_closest(tpid):
    """Collect the training summaries that are comparable to one test process.

    Relies on the module-level ``test_summaries`` and ``train_summaries``
    mappings (process id -> summary frame) and on ``test_meta`` / ``train_meta``.
    No distance is computed here; the function only gathers the candidates.

    Parameters
    ----------
    tpid
        Id of the test process.

    Returns
    -------
    list of pandas.DataFrame
        For every training process whose ``sequence`` equals that of ``tpid``,
        its summary restricted to the columns of the test summary. Training
        processes whose summary lacks any of those columns are skipped.
    """
    tpid_data = test_summaries[tpid]
    tpid_meta = test_meta[test_meta.index == tpid]
    # Scalar sequence string of the test process, used to filter the training rows.
    tpid_sequence = tpid_meta['sequence'].iloc[0]
    train_ids_to_consider = list(train_meta[train_meta['sequence'] == tpid_sequence].index)

    train_data = []
    for ids in train_ids_to_consider:
        train_data_candidate = train_summaries[ids]
        try:
            train_data.append(train_data_candidate[tpid_data.columns])
        except KeyError:
            # The candidate is missing at least one of the test columns, so it
            # cannot be compared. Any other error is left to propagate.
            continue
    return train_data

In [66]:
# Comparable training summaries for the test process at position 10 of test_ids.
find_closest(test_ids[10])

[       caustic-return_conductivity_count  caustic-return_conductivity_mean  \
 20001                              728.0                         41.735117   
 
        caustic-return_conductivity_std  caustic-return_conductivity_min  \
 20001                         9.352762                         1.369625   
 
        caustic-return_conductivity_25%  caustic-return_conductivity_50%  \
 20001                        43.572252                        43.952608   
 
        caustic-return_conductivity_75%  caustic-return_conductivity_max  \
 20001                        44.379388                         50.42736   
 
        caustic-return_flow_count  caustic-return_flow_mean  \
 20001                      728.0              50242.729548   
 
        caustic-return_flow_std  caustic-return_flow_min  \
 20001              9385.368359                15042.679   
 
        caustic-return_flow_25%  caustic-return_flow_50%  \
 20001               53880.0255                54515.699   
 
      

In [67]:
# Comparable training summaries for the test process at position 100 of test_ids.
find_closest(test_ids[100])

[       pre_rinse-return_conductivity_count  \
 20001                                265.0   
 
        pre_rinse-return_conductivity_mean  pre_rinse-return_conductivity_std  \
 20001                            1.564913                           0.472031   
 
        pre_rinse-return_conductivity_min  pre_rinse-return_conductivity_25%  \
 20001                           0.255486                           1.540524   
 
        pre_rinse-return_conductivity_50%  pre_rinse-return_conductivity_75%  \
 20001                           1.642529                           1.675136   
 
        pre_rinse-return_conductivity_max  pre_rinse-return_flow_count  \
 20001                           4.990765                        265.0   
 
        pre_rinse-return_flow_mean  pre_rinse-return_flow_std  \
 20001                 44845.44243               14520.246458   
 
        pre_rinse-return_flow_min  pre_rinse-return_flow_25%  \
 20001                  4918.9814                  31966.146   
 
    

In [69]:
# Stack the comparable training summaries for test_ids[10] into one frame.
# The call is made explicitly: the output-history variable `_66` only exists
# in the session that happened to run that cell as number 66.
df = pd.concat(find_closest(test_ids[10]))

In [None]:
# NOTE: this rebinds `train_values` to only the first 1,000 rows, now with
# `timestamp` parsed as datetimes.
train_values = pd.read_csv('input/train_values.csv', index_col='process_id', parse_dates=['timestamp'],
                           nrows=1000)
train_values.head()

In [None]:
# Count the readings per pipeline value within each training process.
train_pipelines = pd.read_csv('input/train_values.csv', usecols=['pipeline', 'process_id'], index_col='process_id')
train_pipelines.groupby('process_id')['pipeline'].value_counts()

In [None]:
# Bar chart of the share of training readings (rows) per pipeline.
train_pipelines['pipeline'].value_counts(normalize=True).sort_index().iplot(kind='bar', title='Training Pipelines')

In [None]:
# Bar chart of the share of test readings (rows) per pipeline.
test_pipelines = pd.read_csv('input/test_values.csv', usecols=['pipeline', 'process_id'], index_col='process_id')
test_pipelines['pipeline'].value_counts(normalize=True).sort_index().iplot(kind='bar', title='Test Pipelines')

In [None]:
# Timestamps of every training reading, parsed as datetimes and indexed by process id.
train_timestamps = pd.read_csv('input/train_values.csv', usecols=['timestamp', 'process_id'], 
                              parse_dates=['timestamp'], index_col='process_id')

In [None]:
# Distinct gaps between consecutive readings of each process, in seconds.
# total_seconds() is used because `.dt.seconds` returns only the seconds
# component of a timedelta and silently drops whole days.
unique_times = train_timestamps.groupby('process_id').apply(lambda x: x['timestamp'].diff(1).dt.total_seconds().unique())


In [None]:
# First two distinct time gaps of every process, as (first, second) pairs.
diffs = [(list(gaps)[0], list(gaps)[1]) for gaps in list(unique_times.to_frame()[0])]

In [None]:
# Distinct per-process gap arrays, compared through their string form.
unique_times.to_frame()[0].astype(str).unique()