In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the labelled training set from the local CSV export.
train_csv_path = "C:\\Users\\AK\\Downloads\\binary_classification_train.csv"
data = pd.read_csv(train_csv_path)
# Show the first five rows as a quick sanity check of the loaded columns.
data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Class
0,1,-36.963099,-15.014703,87.100435,101.52336,82.106571,-10.95618,0.592326,-51.919028,-90.650434,...,-81.183744,13.101921,18.05145,-40.606288,-39.697053,-13.870802,173.722987,-17.349169,-82.550844,0
1,2,-43.715674,18.847116,89.543406,-71.319314,35.597052,126.35857,-29.837495,-40.473764,-94.079238,...,-37.84826,7.457352,-77.420742,53.773718,-100.124294,9.87065,11.592519,-106.123605,-92.796421,1
2,3,-30.73755,-63.729643,106.081332,81.773948,112.769976,-12.425351,-29.913286,-41.7712,16.424511,...,-106.610289,5.930143,-30.177083,-138.969234,-56.054914,-12.790661,164.832498,-37.412902,-85.44115,0
3,4,-27.674757,-118.869495,135.605213,99.130189,50.947548,-63.704785,-7.353057,-58.140229,-80.209027,...,-149.056417,3.893419,-74.100869,-47.659832,-48.209817,-36.264323,59.001922,-59.064134,-78.538639,0
4,5,-28.654141,-77.746597,85.215365,50.374774,79.763207,-32.703048,-28.152031,-63.994794,-153.566789,...,-102.731465,11.160205,-13.395073,9.416237,2.649524,-43.578704,52.261888,-66.081738,-80.32511,0


In [3]:
# The last column is the target; every column between the first and the last
# is used as a model feature (the first column is skipped).
y = data.iloc[:, -1]
X = data.iloc[:, 1:-1]

In [4]:
# Keep the first 80% of rows (in file order, no shuffling) for training and
# hold out the remaining rows as a development set.
split_length = int(0.8 * len(y))
X_train, y_train = X.iloc[:split_length], y.iloc[:split_length]
X_dev, y_dev = X.iloc[split_length:], y.iloc[split_length:]

In [5]:
# Standardise the features with statistics computed on the training split only;
# the same mean/std are reused for the development rows.
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0)
# A feature that is constant on the training split has std == 0; dividing by it
# would fill that column with NaN/inf. Scale such columns by 1 instead.
X_train_std = X_train_std.where(X_train_std != 0, 1.0)
# NOTE: X_train and X_dev are overwritten here, so re-running this cell would
# recompute the statistics from already-scaled data.
X_train = (X_train - X_train_mean) / X_train_std
X_dev = (X_dev - X_train_mean) / X_train_std

In [6]:
# Cast the features to float32 (they stay pandas DataFrames).
X_train = X_train.astype(np.float32)
X_dev = X_dev.astype(np.float32)
# Turn the labels into float32 column vectors of shape (m, 1).
# np.asarray accepts both the original Series and an already-converted ndarray,
# so re-running this cell does not fail the way `.values` would on an ndarray.
y_train = np.asarray(y_train, dtype=np.float32).reshape(-1, 1)
y_dev = np.asarray(y_dev, dtype=np.float32).reshape(-1, 1)

In [7]:
def sigmoid(Z):
    """Elementwise logistic function 1 / (1 + exp(-Z)), computed without overflow.

    Parameters
    ----------
    Z : scalar or array-like of real numbers.

    Returns
    -------
    NumPy scalar (for scalar input) or ndarray with the same shape as ``Z``,
    with values in [0, 1].
    """
    Z = np.asarray(Z)
    # exp(-|Z|) always lies in (0, 1], so it cannot overflow, unlike exp(-Z)
    # for large negative Z.
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + exp(-Z)), the textbook form.
    # Z <  0: exp(Z) / (1 + exp(Z)), the same value written so that only a
    #         non-positive number is exponentiated.
    result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of any other rank unchanged.
    return result[()]

In [8]:
def cost_function(X, y, W, b):
    """Evaluate the logistic-regression loss and its gradients on one batch.

    Parameters
    ----------
    X : 2-D array-like; its shape is unpacked as (samples, features).
    y : labels compared elementwise with the sigmoid output (this notebook
        passes an (m, 1) column vector).
    W : weights, multiplied as ``np.dot(X, W)``.
    b : bias added to every linear score.

    Returns
    -------
    (cost, dW, db) : the mean binary cross-entropy (1e-8 is added inside each
    logarithm to avoid log(0)), the gradient with respect to ``W`` and the
    gradient with respect to ``b``.
    """
    sample_count, _ = X.shape
    probabilities = sigmoid(np.dot(X, W) + b)
    residual = probabilities - y

    log_terms = y * np.log(probabilities + 1e-8) + (1 - y) * np.log(1 - probabilities + 1e-8)
    cost = (-1 / sample_count) * np.sum(log_terms)

    dW = (1 / sample_count) * np.dot(X.T, residual)
    db = (1 / sample_count) * np.sum(residual)
    return cost, dW, db

In [9]:
def model(X, y, iterations, alpha):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : 2-D array-like of features; its shape is unpacked as (samples, features).
    y : labels forwarded unchanged to ``cost_function``.
    iterations : number of gradient steps to take.
    alpha : learning rate applied to both gradients.

    Returns
    -------
    (cost_history, W, b) : the cost recorded at every step, the final weights
    of shape (features, 1) and the final bias.

    Notes
    -----
    Seeds NumPy's global random generator with 42 before drawing the initial
    weights, and prints the cost every 200 iterations.
    """
    _, feature_count = X.shape
    np.random.seed(42)
    # Small random starting weights; the bias starts at zero.
    W = 0.01 * np.random.randn(feature_count, 1)
    b = 0
    cost_history = []
    for step in range(1, iterations + 1):
        cost, dW, db = cost_function(X, y, W, b)
        W -= alpha * dW
        b -= alpha * db
        cost_history.append(cost)
        if step % 200 == 0:
            print(f"Iteration: {step}/{iterations} -- cost: {cost}")
    return cost_history, W, b

In [10]:
# Train on the standardised training split: 2000 gradient steps, learning rate 0.5.
cost_history, W, b = model(
    X_train,
    y_train,
    iterations=2000,
    alpha=0.5,
)

Iteration: 200/2000 -- cost: 0.2470801183920775
Iteration: 400/2000 -- cost: 0.24702990382294346
Iteration: 600/2000 -- cost: 0.24702936397630842
Iteration: 800/2000 -- cost: 0.24702935737168694
Iteration: 1000/2000 -- cost: 0.24702935728943412
Iteration: 1200/2000 -- cost: 0.24702935728829484
Iteration: 1400/2000 -- cost: 0.24702935728826653
Iteration: 1600/2000 -- cost: 0.2470293572882646
Iteration: 1800/2000 -- cost: 0.24702935728826445
Iteration: 2000/2000 -- cost: 0.24702935728826445


In [11]:
def predict(X, W, b, threshold=0.5):
    """Return hard 0/1 class labels for the rows of ``X``.

    The linear scores ``np.dot(X, W) + b`` are passed through ``sigmoid``;
    a row is labelled 1 when its probability is at least ``threshold`` and 0
    otherwise. The result is an integer array.
    """
    probabilities = sigmoid(np.dot(X, W) + b)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

In [12]:
def accuracy(y_true, y_pred):
    """Fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so a (m,) vector and a (m, 1)
    column are compared element by element instead of being broadcast into an
    (m, m) matrix, which would silently give a wrong score.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels with the same number of elements.

    Returns
    -------
    The mean of the elementwise equality, as a NumPy float.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )
    return np.mean(y_pred == y_true)

In [13]:
# Training split: hard predictions and the fraction that match the labels.
y_pred_train = predict(X_train, W, b)
train_accuracy = accuracy(y_train, y_pred_train)

# Development split; the score is stored under the name `test_accuracy`.
y_pred_dev = predict(X_dev, W, b)
test_accuracy = accuracy(y_dev, y_pred_dev)

print(f"Train Accuracy: {train_accuracy}")
print(f"Development Accuracy: {test_accuracy}")

Train Accuracy: 0.93484375
Development Accuracy: 0.9357291666666666


In [14]:
def calculate_precision(y_true, y_pred):
    """Precision = TP / (TP + FP) for binary labels encoded as 0 and 1.

    Both inputs are flattened first, so a (m,) vector and a (m, 1) column are
    compared element by element rather than broadcast into an (m, m) matrix,
    which would silently give wrong counts.

    Parameters
    ----------
    y_true : array-like of true labels (0 or 1).
    y_pred : array-like of predicted labels (0 or 1) with the same number of
        elements.

    Returns
    -------
    The precision, or 0 when nothing was predicted positive.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )
    predicted_positive = y_pred == 1
    tp = np.sum((y_true == 1) & predicted_positive)
    fp = np.sum((y_true == 0) & predicted_positive)
    return (tp / (tp + fp)) if (tp + fp) != 0 else 0

In [15]:
# Precision on the development split, then on the training split.
precision_dev = calculate_precision(y_true=y_dev, y_pred=y_pred_dev)
print(f"Precision for Development Set: {precision_dev}")

precision_train = calculate_precision(y_true=y_train, y_pred=y_pred_train)
print(f"Precision for Training Set: {precision_train}")

Precision for Development Set: 0.9298561151079137
Precision for Training Set: 0.9295192899200281


In [16]:
def calculate_recall(y_true, y_pred):
    """Recall = TP / (TP + FN) for binary labels encoded as 0 and 1.

    Both inputs are flattened first, so a (m,) vector and a (m, 1) column are
    compared element by element rather than broadcast into an (m, m) matrix,
    which would silently give wrong counts.

    Parameters
    ----------
    y_true : array-like of true labels (0 or 1).
    y_pred : array-like of predicted labels (0 or 1) with the same number of
        elements.

    Returns
    -------
    The recall, or 0 when there are no positive true labels.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )
    actual_positive = y_true == 1
    tp = np.sum(actual_positive & (y_pred == 1))
    fn = np.sum(actual_positive & (y_pred == 0))
    return (tp / (tp + fn)) if (tp + fn) != 0 else 0

In [17]:
# Recall on the development split, then on the training split.
recall_dev = calculate_recall(y_true=y_dev, y_pred=y_pred_dev)
print(f"Recall for Development Set: {recall_dev}")

recall_train = calculate_recall(y_true=y_train, y_pred=y_pred_train)
print(f"Recall for Training Set: {recall_train}")

Recall for Development Set: 0.8596607914865314
Recall for Training Set: 0.8615296896635986


In [18]:
def calculate_f1score(precision, recall):
    """Harmonic mean of precision and recall: 2 * P * R / (P + R).

    Parameters
    ----------
    precision : precision value.
    recall : recall value.

    Returns
    -------
    The F1 score, or 0.0 when ``precision + recall`` is zero. The precision
    and recall helpers in this notebook return 0 in their degenerate cases,
    so that sum can be zero and would otherwise cause a division by zero.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

In [19]:
# Combine the training-split precision and recall into a single F1 score.
f1score_train = calculate_f1score(
    precision=precision_train,
    recall=recall_train,
)
print(f"f1 for Training Set: {f1score_train}")

f1 for Training Set: 0.8942340209671964


In [20]:
# Combine the development-split precision and recall into a single F1 score.
f1score_dev = calculate_f1score(
    precision=precision_dev,
    recall=recall_dev,
)
print(f"f1 for Development Set: {f1score_dev}")

f1 for Development Set: 0.8933817176429929


FINAL RESULTS: predictions on the unlabelled test set

In [21]:
# Read the test set from the local CSV export.
test_csv_path = "C:\\Users\\AK\\Downloads\\binary_classification_test.csv"
test_data = pd.read_csv(test_csv_path)
# Show the first five rows as a quick sanity check of the loaded columns.
test_data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_11,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20
0,1,-21.263636,-101.510764,70.115563,-67.257828,60.603386,-14.047965,-20.077225,-39.171738,-109.037261,...,7.362312,-55.68929,11.794318,-192.876941,174.389915,-56.913898,-66.920817,136.880728,-53.746527,-88.624312
1,2,-28.646489,-104.090028,124.098974,-45.90678,85.691869,91.241763,-2.532164,-43.701116,-151.491278,...,-45.258211,-55.069378,11.297918,-212.560404,330.864084,-39.186926,-52.041948,155.040051,-43.390476,-85.584791
2,3,-34.084427,-149.471464,131.883193,96.372918,41.865528,-14.692982,25.448955,-56.218421,-179.342647,...,-29.19627,-34.475233,0.030514,-41.534695,-36.002752,-90.759938,5.575351,201.92991,-40.225772,-86.235677
3,4,-38.905587,-126.03893,86.480118,-76.173919,98.722116,15.411935,-35.391915,-23.142612,-109.017318,...,-49.361568,-103.689725,12.311099,-188.698902,229.626393,-35.44261,-72.384866,100.019033,-56.657687,-83.543328
4,5,-41.041415,-124.302646,89.54941,-98.98009,72.425692,15.907022,5.400643,-32.267154,-181.961829,...,-3.392246,-27.35356,13.073419,-138.289303,151.146083,-60.065501,-45.043483,196.572356,-58.595875,-81.748002


In [22]:
# Select exactly the feature columns the model was trained on, by name and in
# training order (X_train_mean is indexed by the training feature names).
# Unlike the positional slice iloc[:, 1:], this cannot pick up the PREDICTIONS
# column that a later cell adds to test_data, so the cell is safe to re-run.
X_test = test_data.loc[:, X_train_mean.index]

In [23]:
# Scale the test features with the mean and std computed on the training split.
X_test = X_test.sub(X_train_mean, axis="columns").div(X_train_std, axis="columns")

In [24]:
# Convert the scaled test features to a plain NumPy array. np.asarray also
# accepts an array that was already converted, so re-running this cell does
# not raise AttributeError the way `.values` would on an ndarray.
X_test = np.asarray(X_test)

In [25]:
# Hard 0/1 labels for the test rows at the 0.5 decision threshold.
A_test = predict(X=X_test, W=W, b=b, threshold=0.5)
# Raw linear scores for the same rows, kept in their own variable; the labels
# above are computed independently inside predict().
Z_test = np.dot(X_test, W) + b

In [26]:
# Wrap the predicted labels in a DataFrame with a default integer index.
test_df = pd.DataFrame(data=A_test)

In [27]:
# Attach the predicted labels to the test table as a new PREDICTIONS column.
# test_df is the DataFrame built from A_test in the previous cell; the
# assignment aligns it with test_data on the row index.
test_data['PREDICTIONS'] = test_df

In [29]:
# Save the test rows together with their predictions, relative to the current
# working directory.
# NOTE(review): with to_csv's default index=True the row index is also written
# as an unnamed first column; confirm that is wanted in the results file.
results_csv_path = "BINARY CLASSIFICATION USING LOGISTIC REGRESSION RESULTS.csv"
test_data.to_csv(results_csv_path)