# Steel Plate Defect Prediction

In [37]:
#from google.colab import drive
#drive.mount('/content/drive')

In [38]:
#!pip install optuna
#!pip install catboost

In [1]:
%%time

import joblib

import numpy as np
np.random.seed(42)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import optuna

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

sns.set_theme()
sns.set_context("paper")

CPU times: total: 2.69 s
Wall time: 5.55 s


In [2]:
#original_data = pd.read_csv('data/train.csv',index_col=0)
train = pd.read_csv('data/train.csv',index_col=0)
test = pd.read_csv('data/test.csv',index_col=0)
sub = pd.read_csv('data/sample_submission.csv',index_col=0)
df_org = pd.read_csv('data/original_data.csv')

In [3]:
# Print the first few rows of each dataset
print("Train data:\n", train.shape, "\n\nTest data:\n", test.shape, "\n\nSubmission data:\n", sub.shape, "\n\nOriginal data: \n", df_org.shape)

Train data:
 (19219, 34) 

Test data:
 (12814, 27) 

Submission data:
 (12814, 7) 

Original data: 
 (1941, 34)


In [4]:
targets = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

In [5]:
train.head()

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [6]:
train.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [7]:
class Preprocess:

    def fit(self, X, y=None):
        return

    def fit_transform(self, X):
        epsilon = 1e-6

        # Location Features
        X['X_Distance'] = X['X_Maximum'] - X['X_Minimum']
        X['Y_Distance'] = X['Y_Maximum'] - X['Y_Minimum']

        # Density Feature
        X['Density'] = X['Pixels_Areas'] / (X['X_Perimeter'] + X['Y_Perimeter'])

        # Relative Perimeter Feature
        X['Relative_Perimeter'] = X['X_Perimeter'] / (X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)

        # Circularity Feature
        X['Circularity'] = X['Pixels_Areas'] / (X['X_Perimeter'] ** 2)

        # Symmetry Index Feature
        X['Symmetry_Index'] = np.abs(X['X_Distance'] - X['Y_Distance']) / (X['X_Distance'] + X['Y_Distance'] + epsilon)

        # Color Contrast Feature
        X['Color_Contrast'] = X['Maximum_of_Luminosity'] - X['Minimum_of_Luminosity']

        # Combined Geometric Index Feature
        X['Combined_Geometric_Index'] = X['Edges_Index'] * X['Square_Index']

        # Interaction Term Feature
        X['X_Distance*Pixels_Areas'] = X['X_Distance'] * X['Pixels_Areas']

        # Additional Features
        X['sin_orientation'] = np.sin(X['Orientation_Index'])
        X['Edges_Index2'] = np.exp(X['Edges_Index'] + epsilon)
        X['X_Maximum2'] = np.sin(X['X_Maximum'])
        X['Y_Minimum2'] = np.sin(X['Y_Minimum'])
        X['Aspect_Ratio_Pixels'] = np.where(X['Y_Perimeter'] == 0, 0, X['X_Perimeter'] / X['Y_Perimeter'])
        X['Aspect_Ratio'] = np.where(X['Y_Distance'] == 0, 0, X['X_Distance'] / X['Y_Distance'])

        # Average Luminosity Feature
        X['Average_Luminosity'] = (X['Sum_of_Luminosity'] + X['Minimum_of_Luminosity']) / 2

        # Normalized Steel Thickness Feature
        X['Normalized_Steel_Thickness'] = (X['Steel_Plate_Thickness'] - X['Steel_Plate_Thickness'].min()) / (X['Steel_Plate_Thickness'].max() - X['Steel_Plate_Thickness'].min())

        # Logarithmic Features
        X['Log_Perimeter'] = np.log(X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)
        X['Log_Luminosity'] = np.log(X['Sum_of_Luminosity'] + epsilon)
        X['Log_Aspect_Ratio'] = np.log(X['Aspect_Ratio'] ** 2 + epsilon)

        # Statistical Features
        X['Combined_Index'] = X['Orientation_Index'] * X['Luminosity_Index']
        X['Sigmoid_Areas'] = 1 / (1 + np.exp(-X['LogOfAreas'] + epsilon))

        return X


In [8]:
train = Preprocess().fit_transform(train)
train.head()

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Y_Minimum2,Aspect_Ratio_Pixels,Aspect_Ratio,Average_Luminosity,Normalized_Steel_Thickness,Log_Perimeter,Log_Luminosity,Log_Aspect_Ratio,Combined_Index,Sigmoid_Areas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.260085,1.6,1.2,1193.5,0.038462,2.564949,7.729296,0.364644,0.0052,0.769253
1,808,816,728350,728372,433,20,54,44478,70,111,...,-0.017598,0.37037,0.363636,22274.0,0.153846,4.304065,10.70275,-2.023194,-0.222347,0.933174
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.876624,1.678571,2.25,655710.0,0.0,7.025538,14.086599,1.621861,0.000991,0.982983
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,-0.997695,0.551724,0.296296,1658.0,0.0,3.806663,8.071531,-2.432779,-0.026801,0.910699
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.071318,1.074627,0.444444,24156.5,1.0,4.934474,10.783757,-1.621855,-0.224829,0.941


In [9]:
df_org = Preprocess().fit_transform(df_org)
df_org.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Y_Minimum2,Aspect_Ratio_Pixels,Aspect_Ratio,Average_Luminosity,Normalized_Steel_Thickness,Log_Perimeter,Log_Luminosity,Log_Aspect_Ratio,Combined_Index,Sigmoid_Areas
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.448853,0.386364,0.181818,12148.0,0.153846,4.110874,10.094934,-3.409466,-0.238342,0.918826
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,-0.907991,0.333333,0.206897,5740.5,0.153846,3.688879,9.341105,-3.151049,-0.139268,0.884259
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,-0.396659,0.421053,0.333333,4035.5,0.230769,3.295837,8.983691,-2.197216,-0.081871,0.86428
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.375881,0.288889,0.155556,9547.5,0.961538,4.060443,9.851984,-3.721463,-0.132402,0.904262
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,-0.458158,0.230769,0.066148,123483.5,0.557692,5.768321,12.41686,-5.431497,-0.186013,0.967131


In [10]:
X, y = train.drop(targets, axis=1), np.argmax(train[targets].values, axis=1)

X_org, y_org  = df_org.drop(targets, axis=1), np.argmax(df_org[targets].values, axis=1)

X = pd.concat([X, X_org], axis=0)
y = np.concatenate([y, y_org])

In [11]:
def auc_score(estimator, X, y):
    y_prob = estimator.predict_proba(X)
    return roc_auc_score(y, y_prob, multi_class="ovr")

In [12]:
# bests_xgb = {}

# splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42, train_size=0.25)

# for i, (train_index, test_index) in enumerate(splitter.split(X, y)):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     X_train = pd.concat([X_train, X_org], axis=0)
#     y_train = np.concatenate([y_train, y_org])


#     def objective(trial):
#         xgb_params = {
#             'num_leaves': trial.suggest_int('num_leaves', 20, 150),
#             'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
#             'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000, 20000),
#             'min_child_samples': trial.suggest_int('min_child_samples', 20, 500, 5),
#             'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
#             'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#             'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#             'max_depth': trial.suggest_int('max_depth', 3, 11),
#             'n_estimators': trial.suggest_int('n_estimators', 500, 3000, 50),
#             'boosting_type': trial.suggest_categorical('boosting_type', ['gbtree', 'dart']),
#         }

#         xgb = XGBClassifier(verbose=0, device='cuda', **xgb_params)
#         xgb.fit(X_train, y_train)

#         return auc_score(xgb, X_test, y_test)

#     study = optuna.create_study(direction='maximize')
#     study.optimize(objective, n_trials=50, show_progress_bar=True)

#     bests_xgb[f'Best for fold {i} with value {study.best_value} is'] = study.best_params
# print(bests_xgb)

In [13]:
xgb1_best = {
    'num_leaves': 134,
    'learning_rate': 0.010271606426992659,
    'subsample_for_bin': 40000,
    'min_child_samples': 380,
    'reg_alpha': 0.16917262652846554,
    'reg_lambda': 0.13888617284516097,
    'colsample_bytree': 0.697112299518055,
    'subsample': 0.6205738758196828,
    'max_depth': 9,
    'n_estimators': 1250,
    'boosting_type': 'gbtree'
}

xgb2_best =  {'num_leaves': 21,
  'learning_rate': 0.01405419332645159,
  'subsample_for_bin': 260000,
  'min_child_samples': 220,
  'reg_alpha': 0.7853731542518498,
  'reg_lambda': 0.1176672631224891,
  'colsample_bytree': 0.6055458132940692,
  'subsample': 0.7432388790668751,
  'max_depth': 8,
  'n_estimators': 650,
  'boosting_type': 'gbtree'
}

xgb3_best = {'num_leaves': 132,
  'learning_rate': 0.004485941158933727,
  'subsample_for_bin': 60000,
  'min_child_samples': 490,
  'reg_alpha': 0.053968155862780684,
  'reg_lambda': 0.6770412204123433,
  'colsample_bytree': 0.6545793821182855,
  'subsample': 0.8462518277570567,
  'max_depth': 11,
  'n_estimators': 3000,
  'boosting_type': 'dart'
}

xgb4_best = {'num_leaves': 110,
  'learning_rate': 0.008422424547788729,
  'subsample_for_bin': 200000,
  'min_child_samples': 210,
  'reg_alpha': 0.4805967717530679,
  'reg_lambda': 0.9590120909716267,
  'colsample_bytree': 0.7863140863745435,
  'subsample': 0.8539853134222098,
  'max_depth': 8,
  'n_estimators': 1500,
  'boosting_type': 'gbtree'
}

xgb5_best =  {'num_leaves': 125,
  'learning_rate': 0.004838137464390279,
  'subsample_for_bin': 60000,
  'min_child_samples': 200,
  'reg_alpha': 0.13649445902042046,
  'reg_lambda': 0.3992784440595847,
  'colsample_bytree': 0.6626288352384205,
  'subsample': 0.661157404297285,
  'max_depth': 11,
  'n_estimators': 1950,
  'boosting_type': 'gbtree'
}


xgb6_best =   {'num_leaves': 74,
  'learning_rate': 0.00960474533638333,
  'subsample_for_bin': 100000,
  'min_child_samples': 290,
  'reg_alpha': 0.36776220373266066,
  'reg_lambda': 0.17588834125558594,
  'colsample_bytree': 0.6666190016457321,
  'subsample': 0.807271653307728,
  'max_depth': 10,
  'n_estimators': 1400,
  'boosting_type': 'gbtree'
}

In [14]:
%%time

# fit the model
xgb1 = XGBClassifier(verbose=False, device='cuda', **xgb1_best)
xgb2 = XGBClassifier(verbose=False, device='cuda', **xgb2_best)
xgb3 = XGBClassifier(verbose=False, device='cuda', **xgb3_best)
xgb4 = XGBClassifier(verbose=False, device='cuda', **xgb4_best)
xgb5 = XGBClassifier(verbose=False, device='cuda', **xgb5_best)
xgb6 = XGBClassifier(verbose=False, device='cuda', **xgb6_best)
xgb7 = XGBClassifier(verbose=False, device='cuda')


voting = VotingClassifier(
    estimators = [
        ('xgb1', xgb1),
        ('xgb2', xgb2),
        ('xgb3', xgb3),
        ('xgb4', xgb4),
        ('xgb5', xgb5),
        ('xgb6', xgb6),
        ('xgb7', xgb7)
    ],
    voting='soft')

scores = cross_val_score(voting, X, y, scoring=auc_score, cv=5)

voting.fit(X, y)

CPU times: total: 1h 38min 18s
Wall time: 1h 35min 38s


In [25]:
# check feature importance in voting classifier
feature_importance = pd.DataFrame(voting.named_estimators_['xgb1'].feature_importances_, index=X.columns, columns=['xgb1']).sort_values('xgb1', ascending=False)
feature_importance['xgb2'] = voting.named_estimators_['xgb2'].feature_importances_
feature_importance['xgb3'] = voting.named_estimators_['xgb3'].feature_importances_
feature_importance['xgb4'] = voting.named_estimators_['xgb4'].feature_importances_
feature_importance['xgb5'] = voting.named_estimators_['xgb5'].feature_importances_
feature_importance['xgb6'] = voting.named_estimators_['xgb6'].feature_importances_
feature_importance['xgb7'] = voting.named_estimators_['xgb7'].feature_importances_
feature_importance['mean'] = feature_importance.mean(axis=1)
feature_importance = feature_importance.sort_values('mean', ascending=False)
feature_importance.head(20)

Unnamed: 0,xgb1,xgb2,xgb3,xgb4,xgb5,xgb6,xgb7,mean
Edges_X_Index,0.011278,0.090243,0.077294,0.108318,0.064309,0.079574,0.152229,0.083321
Outside_X_Index,0.024726,0.059097,0.065911,0.070632,0.059839,0.066103,0.101989,0.064043
X_Minimum,0.01322,0.055921,0.048291,0.068055,0.04545,0.048626,0.094891,0.053493
sin_orientation,0.023843,0.054973,0.061591,0.052232,0.055587,0.061105,0.031131,0.048638
Orientation_Index,0.023103,0.048336,0.051825,0.047127,0.048317,0.051536,0.056382,0.046661
Y_Minimum2,0.010481,0.051086,0.051764,0.076394,0.049267,0.052074,0.0,0.041581
Y_Minimum,0.010719,0.047286,0.048671,0.044624,0.046559,0.04826,0.0,0.03516
Steel_Plate_Thickness,0.045991,0.03199,0.025605,0.030454,0.026895,0.028855,0.042645,0.033205
Combined_Index,0.012669,0.041218,0.030013,0.029018,0.03615,0.031548,0.030125,0.030106
Length_of_Conveyer,0.027932,0.029988,0.028583,0.028953,0.027895,0.028896,0.035414,0.029666


In [16]:

joblib.dump(voting, 'models/voting_model.pkl')

['models/voting_model.pkl']

In [None]:
# #lightgbm

# bests_lgbm = {}

# splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)

# for i, (train_index, test_index) in enumerate(splitter.split(X, y)):

#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     X_train = pd.concat([X_train, X_org], axis=0)
#     y_train = np.concatenate([y_train, y_org])

#     def objective(trial):
#         lgbm_params = {
#             'objective': 'binary',
#             'metric': 'auc',
#             'verbosity': -1,
#             'boosting_type': 'gbdt',
#             'num_leaves': trial.suggest_int('num_leaves', 10, 200),
#             'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
#             'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
#             'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
#             'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
#             'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#             # Add more parameters as needed
#         }

#         lgbm = LGBMClassifier(**lgbm_params)
#         lgbm.fit(X_train, y_train)
#         y_pred = lgbm.predict_proba(X_test)[:, 1]

#         return roc_auc_score(y_test, y_pred)

#     study = optuna.create_study(direction='maximize')
#     study.optimize(objective, n_trials=50, show_progress_bar=True)

#     bests_lgbm[f'Best for fold {i} with value {study.best_value} is'] = study.best_params

# print(bests_lgbm)


In [None]:
# #catboost

# bests_catboost = {}

# splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)

# for i, (train_index, test_index) in enumerate(splitter.split(X, y)):

#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     X_train = pd.concat([X_train, X_org], axis=0)
#     y_train = np.concatenate([y_train, y_org])

#     def objective(trial):
#         catboost_params = {
#             'iterations': trial.suggest_int('iterations', 100, 1000),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#             'depth': trial.suggest_int('depth', 4, 10),
#             'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),
#             'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#             'border_count': trial.suggest_int('border_count', 32, 255),
#             # Add more parameters as needed
#             'verbose': False
#         }

#         catboost = CatBoostClassifier(**catboost_params)
#         catboost.fit(X_train, y_train, verbose=False)
#         y_pred = catboost.predict_proba(X_test)[:, 1]

#         return roc_auc_score(y_test, y_pred)

#     study = optuna.create_study(direction='maximize')
#     study.optimize(objective, n_trials=50, show_progress_bar=True)

#     bests_catboost[f'Best for fold {i} with value {study.best_value} is'] = study.best_params

# print(bests_catboost)

In [23]:
print(f'ROC AUC: {scores.mean()}')

ROC AUC: 0.8796287710132124


In [18]:
sub.head()

Unnamed: 0_level_0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19219,0.5,0.5,0.5,0.5,0.5,0.5,0.5
19220,0.5,0.5,0.5,0.5,0.5,0.5,0.5
19221,0.5,0.5,0.5,0.5,0.5,0.5,0.5
19222,0.5,0.5,0.5,0.5,0.5,0.5,0.5
19223,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [19]:
test = Preprocess().fit_transform(test)

In [None]:
#load the model
# voting = joblib.load('models/voting_model.pkl')

In [20]:
sub[targets] = voting.predict_proba(test)

In [21]:
sub.head()

Unnamed: 0_level_0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19219,0.586493,0.000998,0.001916,0.00051,0.00563,0.115927,0.288527
19220,0.335614,0.013474,0.003075,0.000609,0.11286,0.110975,0.423392
19221,0.060795,0.010713,0.022916,0.000812,0.002904,0.316896,0.584963
19222,0.15118,0.001053,0.000531,0.000871,0.003663,0.387768,0.454934
19223,0.019609,0.00095,0.000445,0.000939,0.001389,0.705122,0.271546


In [22]:
sub.to_csv('data/submission2.csv',index=True)