In [2]:
import pandas as pd 
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.decomposition import *
import matplotlib.pyplot as plt


In [3]:
!pip install prince

Collecting prince
  Downloading https://files.pythonhosted.org/packages/51/f4/8de7003b86351a0e32e29ca2bbbbbf58e311b09f9286e83e638d437aee6d/prince-0.7.0-py3-none-any.whl
Installing collected packages: prince
Successfully installed prince-0.7.0


In [4]:
!pip install catboost==0.23.2

Collecting catboost==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/b2/aa/e61819d04ef2bbee778bf4b3a748db1f3ad23512377e43ecfdc3211437a0/catboost-0.23.2-cp36-none-manylinux1_x86_64.whl (64.8MB)
[K     |████████████████████████████████| 64.8MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.23.2


In [5]:
import prince
from catboost import CatBoostClassifier

In [6]:
def my_groupby(df,primary_keys,dictionary_ops,renaming_dict):
    '''
        primary_keys is a list of primary keys.
        dictionary_ops is the dictionay having the operations to be performed (example :- {'location_number':'count'})
        renaming_dict is the column to be renamed after joining and resetting index
    '''
    return df.groupby(primary_keys).agg(dictionary_ops).reset_index().rename(columns=renaming_dict)


In [7]:
def data_left_join(df1,df2,primary_key):
    '''
        df1 :- First dataframe
        df2 :- Second Dataframe
        primary_key :- The list of primary keys on which one needs to left join
    '''
    return df1.merge(df2,how='left',on=primary_key)

In [8]:
def updated_df(df,primary_key,operation,columns):
    for cols in columns:
        print('Aggregate ',operation ,' on column- ',cols)
        df       = data_left_join(df,
                                   my_groupby(df,
                                              [primary_key],
                                              {cols:operation},
                                              {cols:primary_key+'_'+operation+'_'+cols}),
                                   primary_key)

    return df

In [9]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
train_y = train_data.Class.values
train_data = train_data.drop(['Class'],axis = 1)

In [10]:
concat_df                          = pd.concat((train_data,test_data),axis=0)
cat_cols                           = ['Area_Code','Locality_Code','Region_Code','Species']
mca                                = prince.MCA(n_components=1,random_state=202020).fit(concat_df[cat_cols])
train_data.loc[:,'mca_cat1']        = mca.transform(train_data[cat_cols])[0]
test_data.loc[:,'mca_cat1']         = mca.transform(test_data[cat_cols])[0]


In [11]:
num_cols                           = ['Height','Diameter']
pca                                = PCA(n_components=1,random_state=202020).fit(concat_df[num_cols])
train_data.loc[:,'pca_num']        = pca.transform(train_data[num_cols])[:,0]
test_data.loc[:,'pca_num']         = pca.transform(test_data[num_cols])[:,0]


In [12]:
del concat_df
concat_df                          = pd.concat((train_data,test_data),axis=0)
concat_df['EFB1']                  = concat_df['Locality_Code'].astype(str)+'_'+concat_df['Species'].astype(str)
concat_df['EFB2']                  = concat_df['Locality_Code'].astype(str)+'_'+concat_df['Region_Code'].astype(str)
concat_df['EFB3']                  = concat_df['Species'].astype(str)+'_'+concat_df['Region_Code'].astype(str)
concat_df['EFB4']                  = concat_df['Area_Code'].astype(str)+'_'+concat_df['Region_Code'].astype(str)
concat_df['EFB5']                  = concat_df['Area_Code'].astype(str)+'_'+concat_df['Locality_Code'].astype(str)
concat_df['EFB6']                  = concat_df['Area_Code'].astype(str)+'_'+concat_df['Species'].astype(str)


In [13]:
concat_df['ratio_height_diam']     = np.where(concat_df['Diameter']!=0,concat_df['Height']/concat_df['Diameter'],np.NAN)
aggregation_columns                = ['Height','Diameter','mca_cat1','pca_num','ratio_height_diam']
numerical_aggregation_primary_keys = ['Area_Code','Locality_Code','Region_Code','Species']

for cols in numerical_aggregation_primary_keys:
    print(cols)
    concat_df                       = updated_df(concat_df,cols,'mean',aggregation_columns)
    concat_df                       = updated_df(concat_df,cols,'std',aggregation_columns)
    concat_df                       = updated_df(concat_df,cols,'min',aggregation_columns)
    concat_df                       = updated_df(concat_df,cols,'max',aggregation_columns)
    concat_df                       = updated_df(concat_df,cols,'median',aggregation_columns)
    print('\n')

concat_df                          = updated_df(concat_df,'Area_Code','nunique',['Species'])
concat_df                          = updated_df(concat_df,'Locality_Code','nunique',['Species'])
concat_df                          = updated_df(concat_df,'Region_Code','nunique',['Species'])

concat_df                          = updated_df(concat_df,'Area_Code','nunique',['Locality_Code'])
concat_df                          = updated_df(concat_df,'Region_Code','nunique',['Locality_Code'])
concat_df                          = updated_df(concat_df,'Species','nunique',['Locality_Code'])

concat_df                          = updated_df(concat_df,'Area_Code','nunique',['Region_Code'])
concat_df                          = updated_df(concat_df,'Locality_Code','nunique',['Region_Code'])
concat_df                          = updated_df(concat_df,'Species','nunique',['Region_Code'])


Area_Code
Aggregate  mean  on column-  Height
Aggregate  mean  on column-  Diameter
Aggregate  mean  on column-  mca_cat1
Aggregate  mean  on column-  pca_num
Aggregate  mean  on column-  ratio_height_diam
Aggregate  std  on column-  Height
Aggregate  std  on column-  Diameter
Aggregate  std  on column-  mca_cat1
Aggregate  std  on column-  pca_num
Aggregate  std  on column-  ratio_height_diam
Aggregate  min  on column-  Height
Aggregate  min  on column-  Diameter
Aggregate  min  on column-  mca_cat1
Aggregate  min  on column-  pca_num
Aggregate  min  on column-  ratio_height_diam
Aggregate  max  on column-  Height
Aggregate  max  on column-  Diameter
Aggregate  max  on column-  mca_cat1
Aggregate  max  on column-  pca_num
Aggregate  max  on column-  ratio_height_diam
Aggregate  median  on column-  Height
Aggregate  median  on column-  Diameter
Aggregate  median  on column-  mca_cat1
Aggregate  median  on column-  pca_num
Aggregate  median  on column-  ratio_height_diam


Locality_Code

In [14]:
testcount                          = len(test_data)
count                              = len(concat_df)-testcount

train_data                         = concat_df[:count]
test_data                          = concat_df[count:]

##### We identify categorical columns here
cat_cols                           = ['Area_Code','Locality_Code','Region_Code','Species','EFB1','EFB2','EFB3','EFB4','EFB5','EFB6']
for cols in cat_cols:
    train_data[cols]               = train_data[cols].astype(str)
    test_data[cols]                = test_data[cols].astype(str)
    
train                              = train_data.values
test                               = test_data.values
cate_features_index                = np.where(train_data.dtypes == object)[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [15]:
oof_pred               = np.zeros((len(train),8))
y_pred_final           = np.zeros((len(test), 8))
num_models             = 2

n_splits               = 44
error                  = []
kf                     = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
    wghts              = [0]*num_models
    logloss            = []
    
    X_train, X_val     = train[tr_ind], train[val_ind]
    y_train, y_val     = train_y[tr_ind], train_y[val_ind]
    
    
    
    model1             = CatBoostClassifier(n_estimators=1200,random_state=202020,verbose=False,task_type='GPU')
    model1.fit(X_train,y_train,cat_features = cate_features_index,eval_set=(X_val,y_val))
    val_pred1          = model1.predict_proba(X_val)
    logloss.append(log_loss(y_val,val_pred1))
    print('validation logloss model 1 fold-',fold+1,': ',log_loss(y_val,val_pred1))
    
    
    model2             = CatBoostClassifier(n_estimators=1000,random_state=202020,verbose=False,task_type='GPU')
    model2.fit(X_train,y_train,cat_features = cate_features_index,eval_set=(X_val,y_val))
    val_pred2          = model2.predict_proba(X_val)
    logloss.append(log_loss(y_val,val_pred2))
    print('validation logloss model 2 fold-',fold+1,': ',log_loss(y_val,val_pred2))
    
    
    wghts              = np.exp(-1000*np.array(logloss/sum(logloss)))
    wghts              = wghts/sum(wghts)
    
    val_pred           = wghts[0]*val_pred1+wghts[1]*val_pred2
    print('validation logloss fold-',fold+1,': ',log_loss(y_val, val_pred))
    
    oof_pred[val_ind]  = val_pred
    
    y_pred_final += (wghts[0]*model1.predict_proba(test)+wghts[1]*model2.predict_proba(test))/(n_splits)
    
    print('\n')
    
print('OOF logloss:- ',(log_loss(train_y,oof_pred)))


validation logloss model 1 fold- 1 :  0.7871682752555085
validation logloss model 2 fold- 1 :  0.7702258510455616
validation logloss fold- 1 :  0.7702258524887252


validation logloss model 1 fold- 2 :  0.7154910619170367
validation logloss model 2 fold- 2 :  0.7267564598881723
validation logloss fold- 2 :  0.7154860908582914


validation logloss model 1 fold- 3 :  0.6446262604111138
validation logloss model 2 fold- 3 :  0.6384315934852578
validation logloss fold- 3 :  0.6383101127168538


validation logloss model 1 fold- 4 :  0.6969632414941853
validation logloss model 2 fold- 4 :  0.6975267473643445
validation logloss fold- 4 :  0.6905018915495782


validation logloss model 1 fold- 5 :  0.7248437692521489
validation logloss model 2 fold- 5 :  0.7424126026651026
validation logloss fold- 5 :  0.7248437145819717


validation logloss model 1 fold- 6 :  0.717518705854868
validation logloss model 2 fold- 6 :  0.735519046641254
validation logloss fold- 6 :  0.7175186752378697


validation l