## Group 13 - IDC 409 Term Project

Members:
* Tushar Baruah (MS19015)
* Atharva Hingane (MS19043)
* Nitish (MS19101)

### Project 1: Continuum Supression in HEP

We need to classify 'type' 0 and 1 in our data as signal and the others as background.

1. Our data has 59 independent parameters. We perform PCA to first reduce them.
2. We perform logistic regression to classify the data as signal or background.

data URL = <https://drive.google.com/file/d/1wceoRWDkqdXrAAt6x6Kuf9Dv3alW3iad/view?usp=drive_link>

GITHUB link = <https://github.com/TusharBaruah/Binary-Classification>

In [18]:
# Importing data and performing PCA

# data URL = https://drive.google.com/file/d/1wceoRWDkqdXrAAt6x6Kuf9Dv3alW3iad/view?usp=drive_link
# GITHUB link = https://github.com/TusharBaruah/Binary-Classification

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

Data = pd.read_csv('data_hep.csv')
Y_labels = Data['type'].to_numpy()

# re-labelling the data to signal (1) and background (0)
for i in range(0,len(Y_labels)):
    if(Y_labels[i]==0 or Y_labels[i]==1):
        Y_labels[i]=1.0 # signal
    else:
        Y_labels[i]=0.0 # background

Data = Data.drop('type',axis=1) # removing previous 'type' label
Data['type']=Y_labels # adding the signal and background label
X_set = Data.drop(columns=["Unnamed: 0","type"])
X_set = X_set.to_numpy()

pca= PCA(n_components=32)
result=pca.fit(X_set)
X_pca= result.transform(X_set)
print("Before PCA:",X_set.shape,"\nAfter PCA:",X_pca.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_labels, test_size = 0.3, random_state=0)
print("X_train:",X_train.shape)
print("X_test:",X_test.shape)
print("Y_train:",Y_train.shape)
print("Y_test:",Y_test.shape)

Before PCA: (70606, 59) 
After PCA: (70606, 32)
X_train: (49424, 32)
X_test: (21182, 32)
Y_train: (49424,)
Y_test: (21182,)


In [19]:
# Performing Logistic Regression

import time
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import math 

start = time.time()

def sigmoid(x):
    x = np.float64(x)
    y = 1 / (1 + np.exp(-x))
    return y

# Function to compute cost  - using vectorization
def cost_logreg_vec(X, y, w, b):
    m, n = X.shape
    if(len(y)!=m or len(w) !=n):
        print("Dataset array and Labels/weights array size does not match")
    z = np.matmul(X, w) + (b * np.ones(m))
    y_dash = sigmoid(z)
    loss_vec = np.array([- (y[i] * np.log(y_dash[i])) - ((1 - y[i]) * np.log(1 - y_dash[i])) for i in range(m)])
    cost = np.dot(loss_vec, np.ones(m)) / m
    # cost = cost_func_vec(y, y_dash)
    return cost

def grad_logreg_vec(X, y, w, b): 
    m, n = X.shape
    if(len(y)!=m or len(w) !=n):
        print("Dataset array and Labels/weights array size does not match")
    y_dash = sigmoid(np.matmul(X, w) + b * np.ones(m))
    grad_w = np.matmul(y_dash - y, X) / m
    grad_b = np.dot(y_dash - y, np.ones(m)) / m
    
    return grad_w, grad_b

# Gradient descent algorithm for logistic regression
def grad_desc(X, y, w, b, learning_rate, n_iter, show_cost = True): 
    m, n = X.shape
    if(len(y)!=m or len(w) !=n):
        print("Dataset array and Labels/weights array size does not match")
    cost_history, params_history = [], []

    for i in range(n_iter):
        grad_w, grad_b = grad_logreg_vec(X, y, w, b)   
        w += - learning_rate * grad_w
        b += - learning_rate * grad_b
        cost =  cost_logreg_vec(X, y, w, b)
        cost_history.append(cost)
        params_history.append([w, b])
        if show_cost == True and (i == n_iter - 1):
            print("Iteration",i,",Cost:",float(cost_history[i]))
        
    return w, b, cost_history, params_history


# Learning model parameters using gradient descent algorithm
a=np.zeros(X_train.shape[1])

w_out, b_out, cost_history, params_history = grad_desc(X_train,
                                                       Y_train,
                                                       a,
                                                       0,
                                                       learning_rate = 0.1,
                                                       n_iter = 2000)

# Prediction and evaluation on the training set and the test set
y_train_prob = sigmoid(np.matmul(X_train, w_out) + (b_out * np.ones(X_train.shape[0])))
y_test_prob = sigmoid(np.matmul(X_test, w_out) + (b_out * np.ones(X_test.shape[0])))
y_train_pred, y_test_pred = (y_train_prob > 0.5).astype(int), (y_test_prob > 0.5).astype(int)

end = time.time()
print("Time taken by the algorithm=",end-start,"seconds")

print("Training:\n",classification_report(Y_train, y_train_pred))
print("Testing:\n",classification_report(Y_test, y_test_pred))

cm = confusion_matrix(Y_test, y_test_pred)
print('Confusion matrix\n', cm)
print('True Positives(TP) = ', cm[0,0])
print('True Negatives(TN) = ', cm[1,1])
print('False Positives(FP) = ', cm[0,1])
print('False Negatives(FN) = ', cm[1,0])

  loss_vec = np.array([- (y[i] * np.log(y_dash[i])) - ((1 - y[i]) * np.log(1 - y_dash[i])) for i in range(m)])
  loss_vec = np.array([- (y[i] * np.log(y_dash[i])) - ((1 - y[i]) * np.log(1 - y_dash[i])) for i in range(m)])


Iteration 1999 ,Cost: nan
Time taken by the algorithm= 260.42791986465454 seconds
Training:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83     24247
           1       0.83      0.85      0.84     25177

    accuracy                           0.83     49424
   macro avg       0.83      0.83      0.83     49424
weighted avg       0.83      0.83      0.83     49424

Testing:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83     10594
           1       0.82      0.84      0.83     10588

    accuracy                           0.83     21182
   macro avg       0.83      0.83      0.83     21182
weighted avg       0.83      0.83      0.83     21182

Confusion matrix
 [[8682 1912]
 [1672 8916]]
True Positives(TP) =  8682
True Negatives(TN) =  8916
False Positives(FP) =  1912
False Negatives(FN) =  1672


## Comparing with other existing models

K-nearest Neighbours

In [22]:
from sklearn.neighbors import KNeighborsClassifier
import time
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_labels, test_size = 0.3, random_state=0)
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)

start = time.time()
knn.fit(X_train, Y_train)
end = time.time()
print("Time taken by the algorithm=",end-start,"seconds")

print("Shape of test",X_test.shape)
print("score on train: "+ str(knn.score(X_train, Y_train)))
print("score on test: " + str(knn.score(X_test, Y_test)))

Time taken by the algorithm= 0.005001544952392578 seconds
score on train: 0.8701845257364843
score on test: 0.8148427910490039


In-built logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression
import time
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_labels, test_size = 0.3, random_state=0)

start = time.time()
lr=LogisticRegression(max_iter=5000)
lr.fit(X_train, Y_train)
end = time.time()
print("Time taken by the algorithm=",end-start,"seconds")

print("Shape of test",X_test.shape)
print("score on train: "+ str(lr.score(X_train, Y_train)))
print("score on test: " + str(lr.score(X_test, Y_test)))

Time taken by the algorithm= 1.9510478973388672 seconds
Shape of test (21182, 32)
score on train: 0.8379329880220135
score on test: 0.8368426022094231


Support Vector Machine

In [27]:
from sklearn.svm import LinearSVC
import time
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_labels, test_size = 0.3, random_state=0)

start = time.time()
svm=LinearSVC(C=0.0001)
svm.fit(X_train, Y_train)
end = time.time()
print("Time taken by the algorithm=",end-start,"seconds")

print("Shape of test",X_test.shape)
print("score on train: "+ str(svm.score(X_train, Y_train)))
print("score on test: " + str(svm.score(X_test, Y_test)))



Time taken by the algorithm= 3.494051694869995 seconds
Shape of test (21182, 32)
score on train: 0.8234663321463257
score on test: 0.8256066471532433


Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_labels, test_size = 0.3, random_state=0)

start = time.time()
clf = DecisionTreeClassifier(min_samples_split=10,max_depth=3)
clf.fit(X_train, Y_train)
end = time.time()
print("Time taken by the algorithm=",end-start,"seconds")

print("Shape of test",X_test.shape)
print("score on train: " + str(clf.score(X_train, Y_train)))
print("score on test: "  + str(clf.score(X_test, Y_test)))

Time taken by the algorithm= 0.8829154968261719 seconds
Shape of test (21182, 32)
score on train: 0.7602379410812561
score on test: 0.7552639033141346
