# Data Preprocessing

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

## Importing the dataset

In [None]:
# Load the training table and keep every column except the first as a NumPy array.
# NOTE(review): the first column is presumably a row index — confirm against train.csv.
data_train = pd.read_csv('train.csv')
X_train = data_train.iloc[:, 1:].values

In [None]:
# Load the test table and keep every column except the first as a NumPy array.
# NOTE(review): the first column is presumably a row index — confirm against test.csv.
data_test = pd.read_csv('test.csv')
X_test = data_test.iloc[:, 1:].values

In [None]:
print(X_train)

[['n673gat' 5695.0 5698.0 5707.0 5988.0 6005.0]
 ['n398gat' 4909.0 4911.0 4919.0 5071.0 5083.0]
 ['n402gat' 4982.0 4987.0 4997.0 5142.0 5153.0]
 ...
 ['n1462gat' 0.0 0.0 0.0 0.0 0.0]
 ['n1596gat' 0.0 0.0 0.0 0.0 0.0]
 ['n1588gat' 5001.0 5003.0 5009.0 5009.0 5012.0]]


## Generating labels

In [None]:
# Number of consecutive rows that make up one sample (used as the group size
# when labels are expanded and when row-level predictions are aggregated).
# Set for benchmark S5378; presumably its flip-flop count — confirm.
no_of_ff = 179

In [None]:
print(X_train.shape)
print(X_test.shape)

(322200, 6)
(107400, 6)


In [None]:
# Sample-level ground truth: the 1-labelled (trojan) samples are listed first,
# followed by the 0-labelled samples.
y_train = [1] * 1000 + [0] * 800
y_test = [1] * 400 + [0] * 200

In [None]:
# Row-level ground truth: every sample label is repeated no_of_ff times so
# that there is one label per row of the feature matrices.
# NOTE(review): assumes the rows are ordered sample by sample, 1-labelled
# samples first — confirm against the CSV files.
y_train_expanded = [1] * (1000 * no_of_ff) + [0] * (800 * no_of_ff)
y_test_expanded = [1] * (400 * no_of_ff) + [0] * (200 * no_of_ff)

In [None]:
# Convert the four Python label lists to NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)
y_train_expanded = np.array(y_train_expanded)
y_test_expanded = np.array(y_test_expanded)

In [None]:
print(y_train.shape, y_test.shape)
print(y_train_expanded.shape, y_test_expanded.shape)

(1800,) (600,)
(322200,) (107400,)


## Encoding features

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string identifiers in column 0 with integer codes.
le = LabelEncoder()
X_train[:, 0] = le.fit_transform(X_train[:, 0]) 
# Reuse the mapping learned on the training set for the test set.
# NOTE(review): transform raises on identifiers that were not seen during
# fit — confirm the test set contains none.
X_test[:, 0] = le.transform(X_test[:, 0])

In [None]:
print(X_train)

[[156 5695.0 5698.0 5707.0 5988.0 6005.0]
 [141 4909.0 4911.0 4919.0 5071.0 5083.0]
 [142 4982.0 4987.0 4997.0 5142.0 5153.0]
 ...
 [25 0.0 0.0 0.0 0.0 0.0]
 [33 0.0 0.0 0.0 0.0 0.0]
 [32 5001.0 5003.0 5009.0 5009.0 5012.0]]


# Visualizing our dataset

In [None]:
# Wrap the encoded features and the row-level labels in DataFrames for inspection.
training_x= pd.DataFrame(X_train)
training_y= pd.DataFrame(y_train_expanded,columns=['output'])

In [None]:
training_x

Unnamed: 0,0,1,2,3,4,5
0,156,5695.0,5698.0,5707.0,5988.0,6005.0
1,141,4909.0,4911.0,4919.0,5071.0,5083.0
2,142,4982.0,4987.0,4997.0,5142.0,5153.0
3,176,4978.0,4980.0,4982.0,5146.0,5157.0
4,172,5060.0,5063.0,5066.0,5226.0,5237.0
...,...,...,...,...,...,...
322195,24,9999.0,10001.0,10003.0,12002.0,22001.0
322196,29,9375.0,9377.0,9379.0,11254.0,11284.0
322197,25,0.0,0.0,0.0,0.0,0.0
322198,33,0.0,0.0,0.0,0.0,0.0


In [None]:
# Place the features and the 'output' label column side by side (column-wise concat).
dataframe = pd.concat([training_x, training_y], axis=1)

In [None]:
dataframe

Unnamed: 0,0,1,2,3,4,5,output
0,156,5695.0,5698.0,5707.0,5988.0,6005.0,1
1,141,4909.0,4911.0,4919.0,5071.0,5083.0,1
2,142,4982.0,4987.0,4997.0,5142.0,5153.0,1
3,176,4978.0,4980.0,4982.0,5146.0,5157.0,1
4,172,5060.0,5063.0,5066.0,5226.0,5237.0,1
...,...,...,...,...,...,...,...
322195,24,9999.0,10001.0,10003.0,12002.0,22001.0,0
322196,29,9375.0,9377.0,9379.0,11254.0,11284.0,0
322197,25,0.0,0.0,0.0,0.0,0.0,0
322198,33,0.0,0.0,0.0,0.0,0.0,0


In [None]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322200 entries, 0 to 322199
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       322200 non-null  object
 1   1       322200 non-null  object
 2   2       322200 non-null  object
 3   3       322200 non-null  object
 4   4       322200 non-null  object
 5   5       322200 non-null  object
 6   output  322200 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 17.2+ MB


# Various classification techniques

In [None]:
accuracy={}

In [None]:
def get_Actual_values(nums, group_size=None):
    """Collapse row-level predictions into one prediction per sample.

    The input is read as consecutive groups of ``group_size`` rows. A group
    is reported as 1 when at least one of its rows equals 1, otherwise 0.

    Args:
        nums: 1-D sequence or array of row-level labels.
        group_size: Number of consecutive rows that form one sample.
            Defaults to the module-level ``no_of_ff``.

    Returns:
        1-D integer NumPy array with ``len(nums) // group_size`` entries.
        Trailing rows that do not fill a complete group are ignored.

    Raises:
        ValueError: If ``group_size`` is not positive.
    """
    if group_size is None:
        group_size = no_of_ff
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    rows = np.asarray(nums)
    total = len(rows) // group_size
    # Drop any incomplete trailing group, then view the rows as a
    # (samples, group_size) table so the check runs once per sample.
    grouped = rows[: total * group_size].reshape(total, group_size)
    return (grouped == 1).any(axis=1).astype(int)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyperparameters, trained on the row-level
# features against the expanded (one-per-row) labels.
LR = LogisticRegression()
LR.fit(X_train, y_train_expanded) 

LogisticRegression()

In [None]:
# Predict a label for every test row, then collapse each group of no_of_ff
# rows into a single per-sample label.
y_pred_expanded = LR.predict(X_test)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan


Confusion Matrix
[[  0 200]
 [  0 400]]


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, trained on the row-level
# features against the expanded (one-per-row) labels.
NB = GaussianNB()
NB.fit(X_train, y_train_expanded)

GaussianNB()

In [None]:
# Predict a label for every test row, then collapse each group of no_of_ff
# rows into a single per-sample label.
y_pred_expanded = NB.predict(X_test)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan

Confusion Matrix
[[  0 200]
 [  0 400]]


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using the entropy split criterion, trained on the row-level
# features against the expanded (one-per-row) labels.
# NOTE(review): no random_state is set, so results may vary between runs.
dtc = DecisionTreeClassifier(criterion = 'entropy')
dtc.fit(X_train, y_train_expanded)

DecisionTreeClassifier(criterion='entropy')

In [None]:
# Predict a label for every test row, then collapse each group of no_of_ff
# rows into a single per-sample label.
y_pred_expanded = dtc.predict(X_test)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan

Confusion Matrix
[[  0 200]
 [  0 400]]


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 entropy-criterion trees, trained on the row-level
# features against the expanded (one-per-row) labels.
# NOTE(review): no random_state is set, so results may vary between runs.
rfc = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
rfc.fit(X_train, y_train_expanded)

RandomForestClassifier(criterion='entropy')

In [None]:
# Predict a label for every test row, then collapse each group of no_of_ff
# rows into a single per-sample label.
y_pred_expanded = rfc.predict(X_test)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan

Confusion Matrix
[[  0 200]
 [  0 400]]


## Random Forest: decision-threshold tuning (ROC AUC)

In [None]:
# Sweep the decision threshold applied to the random forest's class-1
# probability and remember the threshold with the highest ROC AUC score.
# NOTE(review): the sweep is scored on the training rows, not on held-out
# data — confirm this is intended.
step_factor = 0.005
threshold_value = 0.01
roc_score = 0
# Fallback so the summary print below cannot hit an unbound name when no
# threshold scores above the initial roc_score.
thrsh_score = threshold_value
predicted_proba = rfc.predict_proba(X_train)  # per-row class probabilities
while threshold_value <= 0.8:  # search thresholds up to probability 0.8
    temp_thresh = threshold_value
    # Rows whose class-1 probability reaches the threshold are labelled 1.
    predicted = (predicted_proba[:, 1] >= temp_thresh).astype('int')
    # Score once per threshold and reuse it for both the log line and the
    # best-so-far comparison.
    current_score = roc_auc_score(y_train_expanded, predicted)
    print('Threshold', temp_thresh, '--', current_score)
    if roc_score < current_score:  # strictly better: keep the earliest best threshold
        roc_score = current_score
        thrsh_score = threshold_value
    threshold_value = threshold_value + step_factor
print('---Optimum Threshold ---', thrsh_score, '--ROC--', roc_score)

Threshold 0.01 -- 0.5
Threshold 0.015 -- 0.5
Threshold 0.02 -- 0.5
Threshold 0.025 -- 0.5
Threshold 0.030000000000000002 -- 0.5
Threshold 0.035 -- 0.5
Threshold 0.04 -- 0.5
Threshold 0.045 -- 0.5
Threshold 0.049999999999999996 -- 0.5
Threshold 0.05499999999999999 -- 0.5
Threshold 0.05999999999999999 -- 0.5
Threshold 0.06499999999999999 -- 0.5
Threshold 0.06999999999999999 -- 0.5
Threshold 0.075 -- 0.5
Threshold 0.08 -- 0.5
Threshold 0.085 -- 0.5
Threshold 0.09000000000000001 -- 0.5
Threshold 0.09500000000000001 -- 0.5
Threshold 0.10000000000000002 -- 0.5
Threshold 0.10500000000000002 -- 0.5
Threshold 0.11000000000000003 -- 0.5
Threshold 0.11500000000000003 -- 0.5
Threshold 0.12000000000000004 -- 0.5
Threshold 0.12500000000000003 -- 0.5
Threshold 0.13000000000000003 -- 0.5
Threshold 0.13500000000000004 -- 0.5
Threshold 0.14000000000000004 -- 0.5
Threshold 0.14500000000000005 -- 0.5
Threshold 0.15000000000000005 -- 0.5
Threshold 0.15500000000000005 -- 0.5
Threshold 0.16000000000000006 --

In [None]:
# Per-row class probabilities from the random forest on the test set.
y_pred_expanded_proba = rfc.predict_proba(X_test)

In [None]:
# Label a row 1 when its class-1 probability reaches the selected threshold,
# otherwise 0.
y_pred_expanded = [
    1 if row_proba[1] >= thrsh_score else 0
    for row_proba in y_pred_expanded_proba
]

In [None]:
# Convert the thresholded row labels to an array and collapse each group of
# no_of_ff rows into a single per-sample label.
y_pred_expanded = np.array(y_pred_expanded)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan

Confusion Matrix
[[  0 200]
 [ 50 350]]


# XGBOOST

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with 100 estimators, trained on the row-level
# features against the expanded (one-per-row) labels.
classifier_boost = XGBClassifier(n_estimators=100)
classifier_boost.fit(X_train, y_train_expanded)

XGBClassifier()

In [None]:
# Sweep the decision threshold applied to the XGBoost model's class-1
# probability and remember the threshold with the highest ROC AUC score.
# NOTE(review): the sweep is scored on the training rows, not on held-out
# data — confirm this is intended.
step_factor = 0.005
threshold_value = 0.01
roc_score = 0
# Fallback so the summary print below cannot hit an unbound name (or reuse a
# stale value from an earlier cell) when no threshold scores above roc_score.
thrsh_score = threshold_value
predicted_proba = classifier_boost.predict_proba(X_train)  # per-row class probabilities
while threshold_value <= 0.8:  # search thresholds up to probability 0.8
    temp_thresh = threshold_value
    # Rows whose class-1 probability reaches the threshold are labelled 1.
    predicted = (predicted_proba[:, 1] >= temp_thresh).astype('int')
    # Score once per threshold and reuse it for both the log line and the
    # best-so-far comparison.
    current_score = roc_auc_score(y_train_expanded, predicted)
    print('Threshold', temp_thresh, '--', current_score)
    if roc_score < current_score:  # strictly better: keep the earliest best threshold
        roc_score = current_score
        thrsh_score = threshold_value
    threshold_value = threshold_value + step_factor
print('---Optimum Threshold ---', thrsh_score, '--ROC--', roc_score)

Threshold 0.01 -- 0.5
Threshold 0.015 -- 0.5
Threshold 0.02 -- 0.5
Threshold 0.025 -- 0.5
Threshold 0.030000000000000002 -- 0.5
Threshold 0.035 -- 0.5
Threshold 0.04 -- 0.5
Threshold 0.045 -- 0.5
Threshold 0.049999999999999996 -- 0.5
Threshold 0.05499999999999999 -- 0.5
Threshold 0.05999999999999999 -- 0.5
Threshold 0.06499999999999999 -- 0.5
Threshold 0.06999999999999999 -- 0.5
Threshold 0.075 -- 0.5
Threshold 0.08 -- 0.5
Threshold 0.085 -- 0.5
Threshold 0.09000000000000001 -- 0.5
Threshold 0.09500000000000001 -- 0.5
Threshold 0.10000000000000002 -- 0.5
Threshold 0.10500000000000002 -- 0.5
Threshold 0.11000000000000003 -- 0.5
Threshold 0.11500000000000003 -- 0.5
Threshold 0.12000000000000004 -- 0.5
Threshold 0.12500000000000003 -- 0.5
Threshold 0.13000000000000003 -- 0.5
Threshold 0.13500000000000004 -- 0.5
Threshold 0.14000000000000004 -- 0.5
Threshold 0.14500000000000005 -- 0.5
Threshold 0.15000000000000005 -- 0.5
Threshold 0.15500000000000005 -- 0.5
Threshold 0.16000000000000006 --

In [None]:
# Per-row class probabilities from the XGBoost model on the test set.
# This section evaluates classifier_boost, so it must be the model scored
# here; scoring the random forest would report the wrong model's results.
y_pred_expanded_proba = classifier_boost.predict_proba(X_test)

In [None]:
# Label a row 1 when its class-1 probability reaches the selected threshold,
# otherwise 0.
y_pred_expanded = [
    1 if row_proba[1] >= thrsh_score else 0
    for row_proba in y_pred_expanded_proba
]

In [None]:
# Convert the thresholded row labels to an array and collapse each group of
# no_of_ff rows into a single per-sample label.
y_pred_expanded = np.array(y_pred_expanded)
y_pred = get_Actual_values(y_pred_expanded)

In [None]:
print('Confusion Matrix') 
print(confusion_matrix(y_test, y_pred))
# Layout of the printed matrix (rows = actual class, columns = predicted
# class, classes ordered 0 then 1):
# TN FP
# FN TP
# Positive - Trojan

Confusion Matrix
[[  0 200]
 [  0 400]]


# SVM 

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardise the features, then fit an RBF-kernel support vector classifier
# on the row-level data against the expanded (one-per-row) labels.
# NOTE(review): kernel SVC training cost grows faster than linearly with the
# number of rows; with the 322200 training rows reported earlier this fit may
# be very slow — confirm it is feasible.
svclassifier = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svclassifier.fit(X_train, y_train_expanded)

In [None]:
# Predict a label for every test row, then collapse each group of no_of_ff
# rows into a single per-sample label.
y_pred_expanded = svclassifier.predict(X_test)
y_pred = get_Actual_values(y_pred_expanded)