In [4]:
import datashader as ds, numpy as np, holoviews as hv
from holoviews.operation.datashader import datashade

import datashader as ds
import datashader.transfer_functions as tf
import numpy as np
import pandas as pd
from colorcet import fire
import matplotlib.pyplot as plt
import datashader.utils as utils
from tqdm import trange
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import re
from IPython.display import Image
from IPython.core.display import HTML

# Preprocessing & Features

In [2]:
train_df = pd.read_csv('data/train.csv', index_col=0).drop(columns=['targetName'])

In [3]:
train_df.head(3)

Unnamed: 0,Time_0,posX_0,posY_0,posZ_0,velX_0,velY_0,velZ_0,Time_1,posX_1,posY_1,...,velY_28,velZ_28,Time_29,posX_29,posY_29,posZ_29,velX_29,velY_29,velZ_29,class
0,0,0,0,476.575673,486.926974,3.05709,305.119216,0.5,241.974495,-3.247714,...,,,,,,,,,,3
1,0,0,0,8073.292719,561.240579,0.939582,16.840572,0.5,284.446308,1.729876,...,,,,,,,,,,14
2,0,0,0,7804.597004,438.284572,-0.340086,-13.597957,0.5,217.255563,-2.342927,...,,,,,,,,,,21


In [5]:
test_df = pd.read_csv('data/test.csv', index_col=0)
test_df['class'] = pd.np.nan
merge = pd.concat([train_df,test_df],ignore_index=True)

##### Credit to `Klil_DataHack` group for great Features 

In [7]:

def cols(string, df):
    c = []
    for col in df.columns:
        if string in col:
            c.append(col)
    return c
# air density at height z, from http://www.dtic.mil/dtic/tr/fulltext/u2/a278141.pdf
def rho(z):
    rho_0 = 1.225 # Kg/m^3
    T0 = 288 # K
    Tz = T0 - 0.0065*z # K
    n = 5.2561
    return rho_0 * (Tz/T0)**n


def getXy(df):
    X = df.drop('class', axis=1).values
    y = df['class'].values
    return X, y

In [8]:
#add more features,and compute 
def compute_features(df):
    posX = cols('posX', df)
    posZ = cols('posZ', df)
#     posY = cols('posY', df)
#     velY = cols('velY', df)
    velX = cols('velX', df)
    velZ = cols('velZ', df)

#     accX = df[velX].diff(axis=1).rename(columns=lambda x: x.replace('velX','accX'))
#     accZ = df[velZ].diff(axis=1).rename(columns=lambda x: x.replace('velZ','accZ'))
    
    df['len'] = np.sum(~np.isnan(df[posX]), axis=1)
    for i,j in zip(velZ,velX):
        name = i.replace('velZ','angV')
        df[name] = np.arctan(df[i] / df[j])
    
    for i,j in zip(velX,velX[1:]):
        name = i.replace('velX','accX')
        df[str(name)] = (df[i]-df[j])/0.5
    df['accX_29'] = pd.np.nan

    for i,j in zip(velZ,velZ[1:]):
        name = i.replace('velZ','accZ')
        df[str(name)] = (df[i]-df[j])/0.5
    df['accZ_29'] = pd.np.nan
    
    for xv,zv in zip(velX,velZ):
        name = xv.replace('velX','velSize')
        df[str(name)] =np.sqrt(df[xv].values **2+df[zv].values**2)        
        
    accX = cols('accX', df)
    accZ = cols('accZ', df)
    velSize = cols('velSize', df)
    df = df.drop(cols('Time', df), axis=1)


    for x_acc,vel_size,vel_X,postion_z in zip(accX,velSize,velX,posZ):
        name = x_acc.replace('accX','dragX')
        df[str(name)] = -1.0 *df[x_acc]/(df[vel_size]*df[vel_X])* rho(df[postion_z])
    
    for x_acc,vel_size,vel_Z,postion_z in zip(accX,velSize,velZ,posZ):
        name = x_acc.replace('accX','dragZ')
        df[str(name)] = -1.0 *df[x_acc]/(df[vel_size]*df[vel_Z])* rho(df[postion_z])
    
    dragX = cols('dragX', df)
    dragZ = cols('dragZ', df)
    
    #https://en.wikipedia.org/wiki/Model_rocket_motor_classification
    #for better result compute the integral...
    for i,x_drag in enumerate(dragX):
        name = x_drag.replace('dragX','impulseX')
        df[str(name)] = df[x_drag]*(i+1)
    
    df['pXskew'] = df[posX].skew(axis=1)
    df['pZskew'] = df[posZ].skew(axis=1)
    df['vXskew'] = df[velX].skew(axis=1)
    df['vZskew'] = df[velZ].skew(axis=1)
    df['accXskew'] = df[accX].skew(axis=1)
    df['accZskew'] = df[accZ].skew(axis=1)
    df['velSizeskew'] = df[velSize].skew(axis=1)
    
    df['pXkurt'] = df[posX].kurt(axis=1)
    df['pZkurt'] = df[posZ].kurt(axis=1)
    df['vXkurt'] = df[velX].kurt(axis=1)
    df['vZkurt'] = df[velZ].kurt(axis=1)
    df['accXkurt'] = df[accX].kurt(axis=1)
    df['accZkurt'] = df[accZ].kurt(axis=1)
    df['velSizeZkurt'] = df[velSize].kurt(axis=1)
    
    # average position
    df['avgZ'] = np.nanmean(df[posZ], axis=1)
    df['avgX'] = np.nanmean(df[posX], axis=1)
    # average velocity
    df['avgVx'] = np.nanmean(df[velX], axis=1)
    df['avgVz'] = np.nanmean(df[velZ], axis=1)
    # average acceleration
    df['avgXacc'] = np.nanmean(df[velX].diff(axis=1), axis=1)
    df['avgZacc'] = np.nanmean(df[velX].diff(axis=1), axis=1)
    
    df['avgVelSize'] = np.nanmean(df[velSize].diff(axis=1), axis=1)
    
    df['avgV2'] = df['avgVx']**2 + df['avgVz']**2
    df['avgV'] = np.sqrt(df['avgV2'])
    df['avgDragX'] = -1.0 * df['avgXacc'] / (df['avgV'] * df['avgVx'] * rho(df['avgZ']))
    
    df['avgDragZ'] = -1.0 * ((df['avgXacc']-9.8) / (df['avgV'] * df['avgVz'] * rho(df['avgZ']))) 
    
    df['minImpulseX'] = np.nanmin(df[cols('impulseX', df)], axis=1)
    df['maxImpulseX'] = np.nanmax(df[cols('impulseX', df)], axis=1) 

    #https://en.wikipedia.org/wiki/Model_rocket_motor_classification
    df['avgImpulseX'] = df['avgDragX'] *df['len']
    df['angleV'] = np.arctan(df['avgVz'] / df['avgVx'])
    return df

In [9]:
df_mod = compute_features(merge)



In [11]:
df_mod.to_csv('data/modified_data.csv')

In [None]:
pd.reset_option('display.max_rows')

# Normalize the Data

In [20]:
def get_order_cols(name):
    return [name+'_'+str(i) for i in range(0,30)]

In [21]:
def normalize_data(df):
    new_df = []
    for name in ['posX','posZ','posY','velY','velX','velZ','accX','accZ','velSize','dragX','dragZ','impulseX','angV']:
        time_data = get_order_cols(name)
        raw_data = df[time_data].as_matrix().reshape(-1)
        
        mu = np.nanmean(raw_data)
        std = np.nanstd(raw_data)
        new_df.append((df[time_data]-mu)/std)

        print('name:{} mu:{} std:{}'.format(name,mu,std))
        
    temp = pd.concat(new_df,axis=1)
    
    static_features = ['pXskew','pZskew','vXskew','vZskew','accXskew','accZskew','velSizeskew','pXkurt','pZkurt',
    'vXkurt','vZkurt','accXkurt','accZkurt','velSizeZkurt','avgZ','avgX','avgVx',
    'avgVz','avgXacc','avgZacc','avgVelSize','avgV2','avgV','avgDragX','avgDragZ','angleV','avgImpulseX','maxImpulseX','minImpulseX']
    
    for name in static_features:
        mu = df[name].mean()
        std = df[name].std()
        print('name:{} mu:{} std:{}'.format(name,mu,std))
        temp[name] =(df[name]-mu)/std
        
    temp['len'] = df['len']
    temp['class'] = df['class']
    return temp
    

In [22]:
normalize_df = normalize_data(df_mod)

  """


name:posX mu:1959.857059069127 std:1612.7311947952528
name:posZ mu:8746.982647340057 std:6611.9438352509105
name:posY mu:-0.5135357017548835 std:3.096615199320577
name:velY mu:-0.1388993505759974 std:3.007073816966318
name:velX mu:371.1678331341431 std:146.41070597125903
name:velZ mu:-3.1203389996413646 std:252.19382229834136
name:accX mu:2.5135086692114634 std:19.79364243785036
name:accZ mu:9.094940080442283 std:17.931440009062733
name:velSize mu:442.40154947779223 std:164.602505196813
name:dragX mu:-9.389901727716647e-07 std:0.00045197096837698954
name:dragZ mu:-0.00018667230799331868 std:0.21601627903715392
name:impulseX mu:-0.00014725348797278525 std:0.0016876929607747284
name:angV mu:-0.0742121037017945 std:0.5666106280766035
name:pXskew mu:-0.031512140294995224 std:0.04733941054141761
name:pZskew mu:-0.32506796071927274 std:0.3161225418134059
name:vXskew mu:-0.0016692171641878827 std:0.5190223216285287
name:vZskew mu:0.0545187888394429 std:0.26312307778873006
name:accXskew mu:-0.

In [24]:
normalize_df.sample(10)

Unnamed: 0,posX_0,posX_1,posX_2,posX_3,posX_4,posX_5,posX_6,posX_7,posX_8,posX_9,...,avgV2,avgV,avgDragX,avgDragZ,angleV,avgImpulseX,maxImpulseX,minImpulseX,len,class
75060,-1.215241,-1.118903,-1.025432,-0.927736,-0.832192,-0.739113,-0.645283,-0.553589,-0.45786,-0.361275,...,-0.660467,-0.691214,-0.001826,0.001826,-0.715591,-0.001826,0.250129,-0.308124,27,
68927,-1.215241,-1.108823,-0.995496,-0.885049,-0.775688,-0.660679,-0.553787,-0.445339,-0.333062,-0.223462,...,0.959939,1.103487,-0.001826,0.001826,-1.568147,-0.001826,-0.413619,0.298076,12,
271822,-1.215241,-1.139886,-1.067412,-0.995521,-0.921599,-0.848294,-0.776083,-0.70346,-0.628803,-0.558106,...,-0.420052,-0.338571,-0.001826,0.001826,1.755418,-0.001826,-0.041237,0.166444,14,
144150,-1.215241,-1.071414,-0.925746,-0.781897,-0.633413,-0.487167,-0.344093,-0.203946,-0.056878,0.087963,...,0.479493,0.671365,-0.001826,0.001826,-0.874719,-0.001826,-0.415316,0.288596,12,
50749,-1.215241,-1.159694,-1.075794,-0.96341,-0.820122,-0.6565,-0.499976,-0.343474,-0.190274,-0.036389,...,2.074578,1.955204,-0.001826,0.001826,1.803548,-0.001826,1.326781,0.186793,28,
97489,-1.215241,-1.058238,-0.899009,-0.741633,-0.587054,-0.430072,-0.27403,-0.116687,0.037964,0.190776,...,0.870157,1.026506,-0.001826,0.001826,1.226669,-0.001826,-0.414761,0.297225,25,
56735,-1.215241,-1.143539,-1.070748,-1.000786,-0.929402,-0.854843,-0.78883,-0.718716,-0.651619,-0.581737,...,-0.994845,-1.320766,-0.001826,0.001826,-0.474517,-0.001826,0.776336,-1.02254,19,
173337,-1.215241,-0.933624,-0.661109,-0.398138,-0.14103,0.112367,0.3571,0.595027,0.831287,1.064007,...,3.877501,3.074388,-0.001826,0.001826,1.250453,-0.001826,-0.429144,0.21732,18,
208834,-1.215241,-1.044845,-0.88035,-0.709083,-0.54467,-0.378883,-0.208729,-0.046272,0.121149,0.284796,...,0.37893,0.573788,-0.001826,0.001826,-0.112988,-0.001826,-0.384792,0.256425,19,
141368,-1.215241,-1.080841,-0.943392,-0.809976,-0.675702,-0.539719,-0.407156,-0.269594,-0.136265,0.002408,...,-0.055463,0.11419,-0.001826,0.001826,0.720724,-0.001826,-0.428914,0.306517,14,


In [31]:
normalize_df.to_csv('data/normalize_df.csv')

In [32]:
normalize_df_train  = normalize_df[~pd.isna(normalize_df['class'])]

In [33]:
normalize_df_train.to_csv('data/normalize_df_train.csv')

In [34]:
normalize_df_test  = normalize_df[pd.isna(normalize_df['class'])]

In [None]:
normalize_df_test.to_csv('data/normalize_df_test.csv')

# Visualize all ~300,000 trajectories at once

In [5]:
normalize_df = pd.read_csv('data/normalize_df.csv')

In [2]:
def get_time_series(df, col_name, with_class=False, remove_name=False,class_number=None):
    if class_number is not None:
        new_df = df[df['class']==class_number]
        new_df = new_df.filter(like=col_name)
    else:
         new_df = df.filter(like=col_name)
    
    cols = list(new_df.columns)
    newcols = [c for c in cols if re.search('\d', c)]
    print(newcols)
    
    new_df = new_df[newcols]
    if remove_name:
        new_df = new_df.rename(columns=lambda x: x.split('_')[1])
    if with_class:
        new_df['class'] = df['class']

    return new_df

def plot_property(data_df,name,class_number=None):
   
    df = get_time_series(data_df,name, with_class=False, remove_name=True,class_number=class_number)

    data = df.as_matrix()        
    df = ds.utils.dataframe_from_multiple_sequences(np.arange(data.shape[-1]),data)

    hv.extension("bokeh")

    return datashade(hv.Curve(df)).options(width=900)



In [6]:
@interact(c=widgets.Dropdown(
    options=[None]+[i for i in range(1,26)],
#     value='2',
    description='class:',
    disabled=False,
)
,c2=widgets.Dropdown(
    options=['posX','posZ','posY','velY','velX','velZ','accX','accZ','velSize','dragX','dragZ','impulseX','angV'],
#     value='2',
    description='property:',
    disabled=False,
))
def g(c,c2):
    return plot_property(normalize_df,c2,class_number=c)

interactive(children=(Dropdown(description='class:', options=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…

# ML model

### Train the following model with ensemble learning (see the result in the next section).
### All models got F1-score of ~73% to~76%

In [None]:
from argparse import ArgumentParser
from os import listdir

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from ignite.engine import create_supervised_trainer, create_supervised_evaluator, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss
from torch.optim import Adam
from torch.utils.data import Dataset
from tqdm import tqdm
from os.path import isfile, join

is_cuda = torch.cuda.torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if is_cuda else torch.FloatTensor
DoubleTensor = torch.cuda.DoubleTensor if is_cuda else torch.DoubleTensor
LongTensor = torch.cuda.LongTensor if is_cuda else torch.LongTensor


class RocketData(Dataset):

    def __init__(self, df, train_mode=True, padding_size=30, eval_mode=False):
        super(RocketData, self, ).__init__()
        self.df = df.fillna(0)
        self.is_train = train_mode
        self.padding_size = padding_size
        self.statistical_features_names = ['pXskew', 'pZskew', 'vXskew', 'vZskew', 'accXskew', 'accZskew',
                                           'velSizeskew', 'pXkurt', 'pZkurt',
                                           'vXkurt', 'vZkurt', 'accXkurt', 'accZkurt', 'velSizeZkurt', 'avgZ', 'avgX',
                                           'avgVx',
                                           'avgVz', 'avgXacc', 'avgZacc', 'avgVelSize', 'avgV2', 'avgV', 'avgDragX',
                                           'avgDragZ', 'angleV',
                                           'avgImpulseX', 'maxImpulseX', 'minImpulseX']

        self.time_series_features_names = ['posX', 'posZ', 'posY', 'velY', 'velX', 'velZ', 'accX', 'accZ', 'velSize',
                                           'dragX', 'dragZ',
                                           'impulseX', 'angV']
        self.eval_mode = eval_mode

    def get_weights(self):
        count_class = self.df.groupby('class').count()['len'].values
        class_percent = (count_class / sum(count_class))
        reverse = 1 / class_percent
        normalize_reversed_percent_per_class = reverse / sum(reverse)
        percent_per_class_example = normalize_reversed_percent_per_class / count_class
        instances_weights = np.array([percent_per_class_example[int(i) - 1] for i in self.df['class'].values])
        return instances_weights

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        trajectory = self.df.iloc[idx, :]

        if not self.eval_mode:
            class_number = int(trajectory['class'])

        trajectory_len = int(trajectory['len'])

        final_features_vector = []

        for name in self.time_series_features_names:
            final_features_vector.append(self._get_series_data_by_name(trajectory, name))

        for name in self.statistical_features_names:
            temp = np.zeros(self.padding_size)
            temp[0:trajectory_len] = trajectory[name]
            final_features_vector.append(temp)

        stack = torch.from_numpy(np.stack(final_features_vector, axis=1)).float()

        if self.eval_mode:
            return stack
        return stack, torch.tensor(class_number - 1)

    def _get_series_data_by_name(self, row, name):
        return row.filter(items=self._get_all_columns(name)).values

    @staticmethod
    def _get_all_columns(name):
        return [name + '_' + str(i) for i in range(0, 30)]


class RegularModel(nn.Module):

    def __init__(self):
        super(RegularModel, self).__init__()

        self.LSTM = nn.LSTM(42, 64, num_layers=3, batch_first=True, bidirectional=True, dropout=0.1)
        self.fc1 = nn.Linear(128, 100)
        self.fc2 = nn.Linear(100, 60)
        self.fc3 = nn.Linear(60, 25)
        self.init_weights()

    def forward(self, x):
        self.LSTM.flatten_parameters()
        out, _ = self.LSTM(x)
        out = F.relu(self.fc1(out[:, -1]))
        out = F.relu(self.fc2(out))
        out = F.dropout(out, training=self.training, p=0.3)
        out = self.fc3(out)
        return F.log_softmax(out, dim=-1)

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()


def get_data_loaders_train(train_batch_size, df_train, val_batch_dize, df_test):
    train_dataset = RocketData(df=df_train)

    weights = train_dataset.get_weights()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

    val_dataset = RocketData(df=df_test)

    print(f'train on:{len(train_dataset)}, val on: {len(val_dataset)}')

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size,
                                               drop_last=False, pin_memory=False, sampler=sampler, num_workers=5)

    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch_dize, shuffle=False,
                                             drop_last=True, pin_memory=False, num_workers=2)
    return train_loader, val_loader


def random_split_train_data(train_csv):
    df = pd.read_csv(train_csv)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    msk = []
    for i in range(1, 26):
        msk += list(df[df['class'] == i].sample(110).index) # 110 is about 15% of the data.
    msk = pd.Int64Index(np.array(msk))
    msk_c = np.delete(np.arange(0, len(df)), msk)
    df_train85 = df.iloc[msk_c]
    df_train15 = df.iloc[msk]
    return df_train85, df_train15


def run(train_batch_size, val_batch_size, epochs, lr, log_interval, train_csv, save_folder, gpu_device):
    print('load train data...')
    df_train85, df_train15 = random_split_train_data(train_csv)
    print('done!')

    train_loader, val_loader = get_data_loaders_train(train_batch_size, df_train85, val_batch_size,
                                                      df_train15)
    device = 'cpu'
    if is_cuda:
        device = gpu_device

    model = RegularModel().to(device)

    optimizer = Adam(model.parameters(), lr=lr)

    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)

    evaluator = create_supervised_evaluator(model, metrics={'accuracy': Accuracy(), 'nll': Loss(F.nll_loss)},
                                            device=device)

    def score_function(engine):
        print(engine.state.accuracy)
        return engine.state.accuracy

    handler = ModelCheckpoint(f'{save_folder}', 'model', score_function=score_function, create_dir=True,
                              score_name='acc')

    desc = "ITERATION - loss: {:.2f}"

    pbar = tqdm(
        initial=0, leave=False, total=len(train_loader),
        desc=desc.format(0)
    )

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print("\nTraining Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
              .format(trainer.state.epoch, metrics['accuracy'], metrics['nll']))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # global best_acc
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        engine.state.accuracy = avg_accuracy
        tqdm.write("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".format(engine.state.epoch,
                                                                                                  avg_accuracy,
                                                                                                  avg_nll))
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler, {'model': model})

    trainer.run(train_loader, max_epochs=epochs)

    pbar.close()


def test(model_idx, gpu, test_csv='data/normalize_df_test.csv', ):
    pred_data = RocketData(df=pd.read_csv(test_csv), eval_mode=True)
    pred_loader = torch.utils.data.DataLoader(pred_data, batch_size=1000, shuffle=False,
                                              drop_last=False, pin_memory=False, num_workers=3)

    size = len(pred_data)

    mypath = f'{model_idx}'
    print(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print(onlyfiles)
    model_name = onlyfiles[0]
    print(f'load model from {model_name}')

    model = torch.load(f'{mypath}/{model_name}')

    model.eval()

    all_pred = []

    pbar = tqdm(
        initial=0, leave=False, total=len(pred_loader),
    )

    for data in pred_loader:
        data = data.to(f'cuda:{gpu}')
        out = model(data)
        values, indices = torch.max(out, -1)
        all_pred += list(indices.data.cpu().numpy() + 1)
        pbar.update(1)
    pred = np.array(all_pred[:size])
    print(pred.shape)
    pred = pred.astype(int)
    df = pd.DataFrame(pred)
    out_name = f'submission_py_{model_idx}.csv'
    df.to_csv(out_name, header=False)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=10)

    parser.add_argument('--val_batch_size', type=int, default=110)

    parser.add_argument('--train_csv', type=str, default='data/normalize_df_train.csv')

    parser.add_argument('--epochs', type=int, default=45)
    parser.add_argument('--log_interval', type=int, default=1)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--save_folder', type=str, default='model1')
    parser.add_argument('--gpu', type=str, default='0')
   
    args = parser.parse_args()

    run(args.batch_size, args.val_batch_size, args.epochs, args.lr, args.log_interval, args.train_csv,
        args.save_folder, gpu_device=f'cuda:{args.gpu}')

    test(args.save_folder, args.gpu)


# Predictions

### predict 1 to 25

In [12]:
import glob
all_pred_files_name= glob.glob("all_pred_files/*.csv")
all_pred_df = [pd.read_csv(name,header=None).drop(0,axis=1) for name in all_pred_files_name]
merge_all_pred = pd.concat(all_pred_df,ignore_index=False,axis=1)

In [15]:
merge_all_pred.head(5)

Unnamed: 0,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,...,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19
0,21,21,21,21,21,21,21,21,21,11,...,21,21,21,21,21,21,21,21,21,21
1,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,21,21,21,21,21,21,21,21,21,21,...,21,21,21,21,21,21,21,21,21,21
3,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,8,8,8,9,8,8,9,9,9,8,...,8,9,9,8,9,8,8,8,9,8


Majority prediction by all classifier

In [14]:
pred_class = []
for i in trange(len(merge_all_pred)):
    pred_class.append(merge_all_pred.loc[i].value_counts().idxmax())

100%|██████████| 271251/271251 [07:42<00:00, 586.24it/s]


In [16]:
pred = pd.DataFrame(pred_class)

In [36]:
pred.head(15)

Unnamed: 0,0
0,21
1,2
2,21
3,2
4,9
5,6
6,21
7,12
8,15
9,21


#### Prediction on REAL-TEST 73%

In [79]:
Image(url= "https://cdn1.imggmi.com/uploads/2019/3/31/ac7c317cd5d4d502d1dd3e365e8e041f-full.png",width=400, height=400)

### Predict the 26 class by Heuristic

In [37]:
total =[]
for i in trange(len(merge)):
    counter = {c:0 for c in range(1,26)}
    for c,count in merge.loc[i].value_counts().iteritems():
        counter[c] = count/120
    total.append(counter)    

statistic_df = pd.DataFrame(total)


100%|██████████| 271251/271251 [06:26<00:00, 702.61it/s]


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.975000,0.000000,0.000000,0.000000,0.000000
1,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.483333,0.516667,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.300000,0.700000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.325000,0.675000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.083333,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.066667,0.833333,0.033333,0.000000,0.033333,0.033333


In [40]:
statistic_df['class']= pred[0]

In [51]:
indexs = []
partic_cs = np.array([1,7,8,9,11,17,20],dtype=np.int32)
coefficient = np.array([0.025,0.478,0.26,0.42,0.2,0.008,0.08]) # Heuristic

for i,c in enumerate(partic_cs):
    all_other = np.delete(partic_cs.copy(),np.where(partic_cs == c))
    class_filter = statistic_df[statistic_df['class']==int(c)]
    class_filter['similarity'] = sum([class_filter[i] for i in all_other])
    sim_data = class_filter.sort_values(by=['similarity'],ascending=False)
    match_df = sim_data[sim_data['similarity'] >= coefficient[i]]
    indexs+=match_df.index.tolist()
    print(f'class:{c} precent:{len(match_df)/len(class_filter)}, {len(match_df)} of {len(class_filter)}')
print(len(indexs))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


class:1 precent:0.11892335659414766, 2028 of 17053
class:7 precent:0.009618187117458467, 132 of 13724


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


class:8 precent:0.19073528300460968, 2524 of 13233
class:9 precent:0.0913426708524511, 1513 of 16564


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


class:11 precent:0.023365617433414042, 193 of 8260
class:17 precent:0.15830396019075266, 1527 of 9646
class:20 precent:0.13996252049660343, 1195 of 8538
9112


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [64]:
pred.loc[indexs,0] =int(26)

In [66]:
pred.head(15)

Unnamed: 0,0
0,21
1,2
2,21
3,2
4,26
5,6
6,21
7,12
8,15
9,21


In [68]:
import gzip

out_name = 'submission_py.csv'
pred.to_csv(out_name, header=False)

in_data = open(out_name, "rb").read()
out_gz = out_name + ".gz"
gzf = gzip.open(out_gz, "wb")
gzf.write(in_data)
gzf.close()


#### Prediction on REAL-TEST 75%

In [78]:
Image(url= "https://cdn1.imggmi.com/uploads/2019/3/31/69668821ee7651157dae6db7e93f46a1-full.png",width=400, height=400)