# Binary Predictor

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to any matplotlib figures drawn later.
sns.set()

## Importing Data

In [3]:
# Read the training sample (relative path, so it depends on the working directory).
train_csv = "../Datasets/2.02.Binary predictors.csv"
raw_data = pd.read_csv(train_csv)

# Preview the first rows to check the columns that were loaded.
raw_data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [5]:
# Encode the two categorical columns as 0/1 for the regression.
# `assign` returns a new frame, so raw_data keeps its original string labels.
admitted_codes = {'No': 0, 'Yes': 1}
gender_codes = {'Male': 0, 'Female': 1}
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map(admitted_codes),
    Gender=raw_data['Gender'].map(gender_codes),
)

data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


## Declaring Variables

In [11]:
# Dependent variable: the 0/1 admission outcome.
y = data.loc[:, 'Admitted']
# Independent variables: SAT score and the 0/1 gender dummy.
x1 = data.loc[:, ['SAT', 'Gender']]

## Regression

In [12]:
# Prepend the intercept column to the predictors.
x = sm.add_constant(x1, prepend=True)

# Specify the logit model (endog = outcome, exog = design matrix), fit it,
# and show the coefficient table.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 10 Jan 2025",Pseudo R-squ.:,0.8249
Time:,21:16:42,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [13]:
# Given the same SAT score, the odds of admission for a female are e^1.9449 ≈ 7 times
# the odds for a male. This is an odds ratio; it does not mean the probability is 7 times higher.

In [19]:
# Training-set confusion matrix, labelled with rows = actual class and
# columns = predicted class.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [20]:
# Accuracy = correctly classified observations (main diagonal) / all observations.
cm = np.array(cm_df)
accuracy = np.trace(cm) / cm.sum()
accuracy

0.9464285714285714

### Testing Model

In [24]:
# Read the test dataset used to evaluate the fitted model.
test_csv = "../Datasets/2.03.Test dataset.csv"
test = pd.read_csv(test_csv)

test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male


In [25]:
# Encode the test labels with the same 0/1 scheme used for the training data.
# Series.map turns every value missing from the dict into NaN without warning.
# That also happens when this cell is run a second time, because the columns
# already hold 0/1 by then. Build the encoded frame first and validate it, so a
# bad label or a re-run fails loudly and leaves `test` untouched.
admitted_codes = {"No": 0, "Yes": 1}
gender_codes = {"Male": 0, "Female": 1}
test_encoded = test.assign(
    Admitted=test["Admitted"].map(admitted_codes),
    Gender=test["Gender"].map(gender_codes),
)
unmapped = test_encoded[["Admitted", "Gender"]].isna().any()
if unmapped.any():
    raise ValueError(
        "Unmapped values in column(s) {}; reload the test CSV before "
        "re-running this cell, or extend the label mappings.".format(
            list(unmapped[unmapped].index)
        )
    )
test = test_encoded

test.head()

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0


In [27]:
# Separate the observed outcome from the predictors.
test_actual = test["Admitted"]
test_data = test.drop(columns=["Admitted"])
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant returns the frame unchanged when a column is already a
# non-zero constant (e.g. a test sample in which every Gender is 1), and the
# model would then receive one column too few at prediction time.
test_data = sm.add_constant(test_data, has_constant='add')
test_data

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


#### Confusion Matrix Function

In [28]:
def confusion_matrix(data, actual_value, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_value : array-like
        Observed class labels coded as 0/1, one per prediction.
    model : object
        Anything exposing ``predict(data)`` that returns values in [0, 1]
        (in this notebook, the fitted Logit results).
    threshold : float, default 0.5
        Predictions >= ``threshold`` are counted as class 1. Must lie
        strictly between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        Counts with rows = actual class and columns = predicted class.
    accuracy : float
        Share of counted observations on the main diagonal; NaN when the
        matrix is empty.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_value = model.predict(data)
    # The 0/1 labels are always split at 0.5; only the prediction axis uses
    # the caller's threshold. The last bin is closed, so a value of exactly 1
    # is still counted.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_value, pred_value, bins=[actual_bins, pred_bins])[0]

    total = cm.sum()
    # Avoid a 0/0 warning when nothing was counted (e.g. empty input).
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else float("nan")
    return cm, accuracy

In [29]:
# Score the test set. Note that `cm` is now a (matrix, accuracy) tuple,
# not a bare array.
cm = confusion_matrix(
    data=test_data,
    actual_value=test_actual,
    model=results_log,
)
cm

(array([[ 5.,  1.],
        [ 1., 12.]]),
 0.8947368421052632)

In [31]:
# Label the matrix part of the tuple (cm[0]) for display:
# rows = actual class, columns = predicted class.
cm_df = pd.DataFrame(
    cm[0],
    index=["Actual 0", "Actual 1"],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0
