## **Installing packages**

In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install -U scikit-learn
!pip install scipy

Requirement already up-to-date: scikit-learn in /home/tejas/.local/lib/python3.8/site-packages (1.1.2)


## **Importing packages**

In [2]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as sk
from scipy import stats
from scipy.special import expit

In [3]:
# Read the banknote CSV from the working directory and discard repeated rows.
data = pd.read_csv('data_banknote_authentication.csv').drop_duplicates()

FileNotFoundError: [Errno 2] No such file or directory: 'data_banknote_authentication.csv'

## **Preprocessing the data** 

### **Normalise to remove skew**

In [None]:
col = ["curtosis", "variance", "skewness", "entropy"]

# Work on a copy so the frame loaded above is not modified by the scaling.
df_min_max_scaled = data.copy()

# Min-max scaling of all listed columns at once: (x - min) / (max - min).
col_min = df_min_max_scaled[col].min()
col_max = df_min_max_scaled[col].max()
df_min_max_scaled[col] = (df_min_max_scaled[col] - col_min) / (col_max - col_min)

# Continue with the scaled frame.
data = df_min_max_scaled

In [None]:
def plot_boxplot(df, ft):
  """Draw a box plot of column ``ft`` of ``df`` and show the figure."""
  columns_to_plot = [ft]
  df.boxplot(column=columns_to_plot)
  plt.show()

In [None]:
# Box plot of the scaled "curtosis" column.
plot_boxplot(df=data, ft='curtosis')

In [None]:
# Reshape two columns in place: a double exponential for "entropy" and a
# square root for "curtosis". Re-running this cell applies them again.
data["entropy"] = data["entropy"].pipe(np.exp).pipe(np.exp)
data["curtosis"] = data["curtosis"].pipe(np.sqrt)

In [None]:
col = ["curtosis", "variance", "skewness", "entropy"]

# Scale a copy of the transformed frame back with min-max scaling.
df_min_max_scaled = data.copy()
for column in col:
    values = df_min_max_scaled[column]
    lowest, highest = values.min(), values.max()
    df_min_max_scaled[column] = (values - lowest) / (highest - lowest)

# Continue with the rescaled frame.
data = df_min_max_scaled

In [None]:
# One 100-bin histogram per column, each shown as its own figure.
for column_name in data.columns:
  column_values = data[column_name]
  plt.hist(column_values, bins=100)
  plt.show()

In [None]:
# Pairwise scatter plots of all columns, coloured by the "class" column.
sns.pairplot(data=data, hue="class")

In [None]:
# data.drop(axis = "columns", labels={"variance", "skewness"}, inplace = True)

### **Check for correlation**

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=data.corr(),
    vmin=-1,
    cmap="coolwarm",
    annot=True,
)

In [None]:
# Shuffle all rows before splitting. The fixed seed keeps the shuffle, and
# therefore the split and every metric below, identical across runs.
data = data.sample(frac=1, random_state=42)

In [None]:
# Take the "class" column out of the frame in place and keep it as the labels.
y = data.pop("class")

### **Add bias term**

In [None]:
# Prepend a constant column of ones, modifying the frame in place.
data.insert(loc=0, column="bias", value=1)

In [None]:
# Summary statistics of the columns left in the frame.
data.describe()

## **Train-test split**

In [None]:
# Number of leading rows used for training; every remaining row is tested on.
TRAIN_SIZE = 1098

# Both halves are cut at the same position so that no row is dropped between
# them, and .iloc makes the positional slicing explicit.
train = data.iloc[:TRAIN_SIZE].to_numpy()
trainA = y.iloc[:TRAIN_SIZE].to_numpy()
test = data.iloc[TRAIN_SIZE:].to_numpy()
testA = y.iloc[TRAIN_SIZE:].to_numpy()

## **Learn and test the model**

In [None]:
def findW(x, theta):
  """Return the dot product of ``x`` and ``theta``.

  For an m*n matrix ``x`` and a length-n ``theta`` the result has m entries.
  """
  product = np.dot(x, theta)
  return product

In [None]:
# Quick check of findW on two plain lists.
a, b = [1, 2], [3, 4]
c = findW(a, b)
print(c)

In [None]:
def sigmoid(z):
  """Return the logistic function 1 / (1 + exp(-z)), element-wise.

  ``z`` may be a scalar or an array-like. The computation is delegated to
  ``scipy.special.expit``, which does not overflow for large negative
  inputs the way ``np.exp(-z)`` does.
  """
  return expit(z)

In [None]:
# Number of rows and columns of the training matrix.
m, n = train.shape
print(train)

# Start from an all-zero weight vector with one weight per column.
theta = np.zeros(shape=n)
print(theta)

In [None]:
def compute_cost(theta, x, y):
  """Return the mean binary cross-entropy of a logistic model.

  Parameters
  ----------
  theta : weight vector with one entry per column of ``x``.
  x : design matrix with one row per sample.
  y : 0/1 labels, one per row of ``x``.

  Returns
  -------
  The cross-entropy averaged over the rows of ``x``.
  """
  y = np.asarray(y)
  # Linear score of every sample (the same product findW(x, theta) computes).
  z = np.dot(x, theta)
  # With s = sigmoid(z), -[y*log(s) + (1-y)*log(1-s)] equals log(1 + e^z) - y*z.
  # logaddexp evaluates that without overflow and never takes log(0), so the
  # cost stays finite even when the sigmoid saturates.
  per_sample = np.logaddexp(0, z) - y * z
  # Average over the rows actually passed in instead of a notebook-level
  # global, so the function is also correct for data of a different size.
  return np.mean(per_sample)

# Cost on the training set with the current weights.
loss = compute_cost(theta=theta, x=train, y=trainA)
loss

In [None]:
# Gradient-descent settings: number of update steps and step size.
iterations, alpha = 2000, 0.9

In [None]:
# Show the weights as they are before the training call below.
print(theta)
def gradientDescent(x, y, theta, alpha, iterations):
  """Fit logistic-regression weights with batch gradient descent.

  Parameters
  ----------
  x : design matrix, m rows by n columns.
  y : labels, one per row of ``x``.
  theta : starting weights, one per column of ``x`` (not modified).
  alpha : step size.
  iterations : number of update steps to run.

  Returns
  -------
  The weights after ``iterations`` updates.
  """
  # Scale the step by the number of rows passed in instead of a
  # notebook-level global, so the result does not depend on outside state.
  num_samples = np.shape(x)[0]
  for _ in range(iterations):
    # Difference between predicted probability and label for every sample.
    # expit is the logistic function and does not overflow on large scores.
    residual = expit(np.dot(x, theta)) - y
    # Sum of the per-sample gradients, one entry per weight.
    gradient = np.dot(np.transpose(x), residual)
    theta = theta - (alpha / num_samples) * gradient
  return theta
# Train on the training split, starting from the current weights, and show
# the fitted weights.
theta = gradientDescent(
    x=train,
    y=trainA,
    theta=theta,
    alpha=alpha,
    iterations=iterations,
)
print(theta)

In [None]:
# Cost on the training set with the fitted weights.
L = compute_cost(theta=theta, x=train, y=trainA)
print(L)

In [None]:
# Sigmoid of the linear scores for every test row.
pred = sigmoid(z=findW(x=test, theta=theta))
pred

In [None]:
# Round the sigmoid outputs to 0/1 labels. NumPy rounds halves to the nearest
# even value, so an output of exactly 0.5 becomes 0.
pred = sigmoid(findW(test, theta)).round()
pred

## **Evaluate the model**

In [None]:
# Confusion-matrix counts, computed with element-wise comparisons instead of
# a Python loop. Class 1 is treated as the positive class.
predicted_pos = pred == 1
predicted_neg = pred == 0
correct = testA == pred

true_pos = int(np.sum(correct & predicted_pos))
false_pos = int(np.sum(~correct & predicted_pos))
false_neg = int(np.sum(~correct & predicted_neg))
true_neg = int(np.sum(correct & predicted_neg))

# Each ratio falls back to 0.0 when its denominator is zero (for example when
# nothing is predicted positive) instead of raising ZeroDivisionError.
predicted_positive_total = true_pos + false_pos
actual_positive_total = true_pos + false_neg
counted_total = false_pos + false_neg + true_neg + true_pos

precision = true_pos / predicted_positive_total if predicted_positive_total else 0.0
recall = true_pos / actual_positive_total if actual_positive_total else 0.0

accuracy = (true_neg + true_pos) / counted_total if counted_total else 0.0

f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("The f1 score is", f1)
print("The accuracy score is", accuracy)

In [None]:
print("ROC_AUC score is")
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the sigmoid outputs rather than from the rounded 0/1 labels in `pred`,
# which would collapse the curve to a single threshold.
sk.roc_auc_score(testA, sigmoid(findW(test, theta)))
