In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the zipped training and test CSVs into two frames, then report their shapes.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/flavours-of-physics-kernels-only/training.csv.zip',
        "../input/flavours-of-physics-kernels-only/test.csv.zip",
    )
)
print("train.shape:{} test.shape:{}".format(train.shape, test.shape))

The train set has 4 columns that the test set does not. One of them is, of course, the target. Let's find out which columns they are.

In [None]:
# Columns present in the training frame but absent from the test frame,
# kept in the order they appear in the training frame.
Extra_columns = [column for column in train.columns if column not in test.columns]
print('Additional features in train dataset:  {}'.format(Extra_columns))

The 'signal' column is the target column.

In [None]:
# Total count of null cells across every column of each frame.
train_missing_total = train.isnull().sum().sum()
test_missing_total = test.isnull().sum().sum()
print("Missing values in train: ", train_missing_total)
print("Missing values in test: ", test_missing_total)

In [None]:
def add_features(data):
    """Return a copy of ``data`` with the engineered columns appended.

    The input frame is left untouched; every new column is a row-wise
    combination (sum, product, ratio, min or max) of existing columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every source column read below (``FlightDistance``,
        ``FlightDistanceError``, ``LifeTime``, ``IP``, ``dira``, ``IP_p0p2``,
        ``IP_p1p2``, the ``DOCA*`` and ``isolation*`` columns, and the
        per-track ``p{0,1,2}_p`` / ``_IP`` / ``_IPSig`` / ``_IsoBDT`` /
        ``_track_Chi2Dof`` columns).

    Returns
    -------
    pandas.DataFrame
        The copy, with the original columns followed by the 18 new ones.
    """
    out = data.copy()

    # Quantities that feed more than one derived column.
    momentum_sum = out['p0_p'] + out['p1_p'] + out['p2_p']
    track_ip_sum = out['p0_IP'] + out['p1_IP'] + out['p2_IP']
    flight_significance = out['FlightDistance'] / out['FlightDistanceError']

    # Assignment order fixes the order of the appended columns.
    out['NEW_FD_SUMP'] = out['FlightDistance'] / momentum_sum
    out['NEW5_lt'] = out['LifeTime'] * track_ip_sum / 3
    out['p_track_Chi2Dof_MAX'] = out[
        ['p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof']
    ].max(axis=1)
    out['flight_dist_sig2'] = flight_significance ** 2
    out['flight_dist_sig'] = flight_significance
    out['NEW_IP_dira'] = out['IP'] * out['dira']
    out['p0p2_ip_ratio'] = out['IP'] / out['IP_p0p2']
    out['p1p2_ip_ratio'] = out['IP'] / out['IP_p1p2']
    out['DCA_MAX'] = out[['DOCAone', 'DOCAtwo', 'DOCAthree']].max(axis=1)
    out['iso_bdt_min'] = out[['p0_IsoBDT', 'p1_IsoBDT', 'p2_IsoBDT']].min(axis=1)
    out['iso_min'] = out[
        ['isolationa', 'isolationb', 'isolationc',
         'isolationd', 'isolatione', 'isolationf']
    ].min(axis=1)
    out['NEW_iso_abc'] = out['isolationa'] * out['isolationb'] * out['isolationc']
    out['NEW_iso_def'] = out['isolationd'] * out['isolatione'] * out['isolationf']
    out['NEW_pN_IP'] = track_ip_sum
    out['NEW_pN_p'] = momentum_sum
    out['NEW_IP_pNpN'] = out['IP_p0p2'] * out['IP_p1p2']
    out['NEW_pN_IPSig'] = out['p0_IPSig'] + out['p1_IPSig'] + out['p2_IPSig']
    out['NEW_FD_LT'] = out['FlightDistance'] / out['LifeTime']
    return out

In [None]:
# Append the engineered columns to both frames, then report the resulting column counts.
train, test = add_features(train), add_features(test)
print('number of train features:', len(train.columns))
print('number of test features:', len(test.columns))

In [None]:
# Columns to drop from the training frame before scaling.
# 'isolationb' and 'isolationc' used to appear twice in this list; each name is
# now listed exactly once (the set of dropped columns is unchanged).
eliminate = [
    # Identifier, target and the other columns not dropped from the test frame.
    'id', 'min_ANNmuon', 'production', 'mass', 'signal',
    'SPDhits', 'CDF1', 'CDF2', 'CDF3',
    # Per-track kinematics.
    'p0_pt', 'p1_pt', 'p2_pt',
    'p0_p', 'p1_p', 'p2_p',
    'p0_eta', 'p1_eta', 'p2_eta',
    # Raw inputs that add_features has already combined into derived columns.
    'isolationa', 'isolationb', 'isolationc', 'isolationd', 'isolatione', 'isolationf',
    'p0_IsoBDT', 'p1_IsoBDT', 'p2_IsoBDT',
    'p0_IP', 'p1_IP', 'p2_IP',
    'IP_p0p2', 'IP_p1p2',
    'p0_track_Chi2Dof', 'p1_track_Chi2Dof', 'p2_track_Chi2Dof',
    'p0_IPSig', 'p1_IPSig', 'p2_IPSig',
    'DOCAone', 'DOCAtwo', 'DOCAthree',
]

In [None]:
# Keep the target aside before the column is dropped from the training frame.
y = train.loc[:, 'signal']

In [None]:
# Remove the unused columns from the training frame.
train = train.drop(columns=eliminate)

# The test-side drop list is the training list without these four names,
# so derive it by filtering instead of spelling the whole list out again.
train_only_columns = ('min_ANNmuon', 'production', 'mass', 'signal')
eliminate = [name for name in eliminate if name not in train_only_columns]
test = test.drop(columns=eliminate)

In [None]:
#features = list(f for f in train.columns if f not in eliminate)

In [None]:
#union = FeatureUnion([("scaler",StandardScaler()),("pca", PCA())])

In [None]:
# Standardise the features using statistics learned from the training set only.
scaler = StandardScaler()
train = scaler.fit_transform(train)
# Re-use the training mean/scale for the test set. Calling fit_transform here
# would refit the scaler on the test data, so train and test would be
# standardised differently before the train-fitted PCA and model see them.
# Assumes both frames have the same remaining columns in the same order.
test = scaler.transform(test)


In [None]:
# Fit PCA on the training matrix only, then project both matrices with it.
# n_components is given as a fraction rather than a count, so the number of
# output columns is decided by the fit (see the scikit-learn PCA docs).
pca = PCA(n_components=0.96)
pca.fit(train)
train, test = pca.transform(train), pca.transform(test)

In [None]:
# Report the matrix shapes after the PCA projection.
shape_reports = (
    ('Shape of train:{}', train),
    ('Shape of test:{}', test),
)
for shape_template, matrix in shape_reports:
    print(shape_template.format(matrix.shape))


In [None]:
# Hold out 5% of the training rows for validation.
# A fixed random_state makes the split, and therefore the validation metrics,
# reproducible across "Restart & Run All".
Xtrain, Xval, ytrain, yval = train_test_split(
    train, y, test_size=0.05, random_state=42
)

print('Shape of Xtrain:{}'.format(Xtrain.shape))
print('Shape of Xval:{}'.format(Xval.shape))

In [None]:
from tensorflow.keras.layers import Input,Dense,Dropout,GRU
from tensorflow.keras.models import Model

In [None]:
# Feed-forward binary classifier over the PCA-reduced features.
# The input width is read from the training matrix instead of being hard-coded
# to 16: the PCA step keeps a variance fraction, not a fixed number of
# components, so the column count can change with the data.
i=Input(shape=(Xtrain.shape[1],))
x=Dense(128,kernel_initializer='he_normal',activation='relu')(i)
x=Dense(64,kernel_initializer='he_normal',activation='relu')(x)
x=Dropout(0.3)(x)
x=Dense(32,kernel_initializer='he_normal',activation='relu')(x)
x=Dropout(0.3)(x)
x=Dense(16,kernel_initializer='he_normal',activation='relu')(x)
# Single sigmoid unit: the output is a score in (0, 1) for the 'signal' class.
x=Dense(1,kernel_initializer='he_normal',activation='sigmoid')(x)
model=Model(i, x)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [None]:
# Train the network; `r` keeps the returned history object used by the plots below.
r = model.fit(
    x=Xtrain,
    y=ytrain,
    validation_data=(Xval, yval),
    epochs=10,
    batch_size=32,
)

In [None]:
# Training and validation loss curves from the fit history.
import matplotlib.pyplot as plt
for loss_key in ('loss', 'val_loss'):
    plt.plot(r.history[loss_key], label=loss_key)
plt.legend()

In [None]:
# Training and validation accuracy curves from the fit history.
# (matplotlib.pyplot is already imported as plt in the first cell.)
plt.plot(r.history['accuracy'],label='accuracy')
plt.plot(r.history['val_accuracy'],label='val_accuracy')
# The parentheses were missing: a bare `plt.legend` only references the
# function, so the legend was never drawn.
plt.legend()

In [None]:
# Score the transformed test matrix with the trained model.
pred = model.predict(x=test)

In [None]:
# Peek at the first five predictions.
pred[:5]

In [None]:
# Turn the sigmoid scores into hard 0/1 labels with a 0.5 cut-off.
# The previous loop used `i==0` / `i==1`, which are comparisons whose results
# were thrown away, and rebinding the loop variable would not have changed
# `pred` in any case, so the cell left `pred` untouched.
# np.where keeps the original branch logic: scores below 0.5 become 0,
# everything else becomes 1.
pred = np.where(pred < 0.5, 0, 1)

In [None]:
# Peek at the first five predictions again.
pred[:5]