In [1]:
# Loading Libraries
import pandas as pd                    # data analysis
import numpy as np                     # scientific calculations
import seaborn as sns                  # Statistical process
import matplotlib.pyplot as plt        # plotting
import statsmodels.api as sm

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
# Importing data
# The CSV location can be overridden with the STUDENT_PASS_FAIL_CSV environment
# variable so the notebook also runs outside the original author's machine.
# When the variable is unset, the original local path is used unchanged.
import os

DATA_PATH = os.environ.get(
    'STUDENT_PASS_FAIL_CSV',
    r'C:\Users\sandeep\Desktop\FTN\ML-Statastic\Data\Student_Pass_Fail.csv',
)
pf_data = pd.read_csv(DATA_PATH)

In [4]:
# Understanding data: print the frame (pandas abbreviates a long frame to its
# first and last rows and reports the overall shape underneath)
print(pf_data)

     Self_Study_Daily  Tution_Monthly  Pass_Or_Fail
0                   7              27             1
1                   2              43             0
2                   7              26             1
3                   8              29             1
4                   3              42             0
..                ...             ...           ...
995                 6              22             1
996                 9              30             1
997                 3              39             0
998                 7              25             1
999                 5              37             0

[1000 rows x 3 columns]


In [5]:
# Understanding data: preview the first rows (head() shows five rows by default)
pf_data.head()

Unnamed: 0,Self_Study_Daily,Tution_Monthly,Pass_Or_Fail
0,7,27,1
1,2,43,0
2,7,26,1
3,8,29,1
4,3,42,0


In [6]:
# Tally how many observations fall into each class of the dependent variable
value_counts = pf_data.loc[:, 'Pass_Or_Fail'].value_counts()

# Show the tally under a descriptive heading
print("Count of observations by dependent variable:", value_counts, sep="\n")

Count of observations by dependent variable:
Pass_Or_Fail
0    501
1    499
Name: count, dtype: int64


In [13]:
# Check for missing values, then remove any rows that contain them
print(pf_data.isnull().sum())

# Rebind to the cleaned frame rather than mutating in place: inplace=True
# brings no benefit and hides the fact that pf_data changes in this cell.
pf_data = pf_data.dropna()

Self_Study_Daily    0
Tution_Monthly      0
Pass_Or_Fail        0
dtype: int64


In [15]:
# Encoding categorical variables: one-hot encode pf_data, dropping the first
# level of each encoded variable (drop_first=True) and storing the dummy
# columns as floats (dtype=float). The encoded frame is kept in `data`.
data = pd.get_dummies(pf_data, drop_first=True,dtype=float)

In [17]:
# Defining independent and dependent variables.
# Build them from `data` (the encoded frame returned by pd.get_dummies) rather
# than from the raw `pf_data`; otherwise the encoding step never reaches the model.
x = data.drop(columns='Pass_Or_Fail')
y = data['Pass_Or_Fail']

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

In [20]:
# Fit the logistic regression with statsmodels to get a detailed summary table

# Prepend the intercept (constant) column to the training features
x_train_sm = sm.add_constant(x_train)

# Specify the model (endog = target, exog = design matrix), then estimate it
logit_model = sm.Logit(endog=y_train, exog=x_train_sm)
result = logit_model.fit()

# Coefficient table and fit statistics of the logistic regression
summary_table = result.summary()
print(summary_table)

Optimization terminated successfully.
         Current function value: 0.102716
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:           Pass_Or_Fail   No. Observations:                  700
Model:                          Logit   Df Residuals:                      697
Method:                           MLE   Df Model:                            2
Date:                Sun, 09 Feb 2025   Pseudo R-squ.:                  0.8518
Time:                        10:44:42   Log-Likelihood:                -71.901
converged:                       True   LL-Null:                       -485.20
Covariance Type:            nonrobust   LLR p-value:                3.210e-180
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                4.6803      2.474      1.892      0.059      -0.168       9.529
Self_Study

Both predictors are statistically significant, showing that self-study positively contributes to passing while high tuition costs are associated with a lower likelihood of passing.

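The logistic regression coefficients represent the change in the log-odds of passing for a one-unit change in the predictor, holding the other predictor constant. Exponentiating a coefficient gives the odds ratio: the factor by which the odds (not the probability) of passing are multiplied for a one-unit increase in that predictor.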

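Odds Ratio for Self_Study_Daily:
𝑒^2.6325 ≈ 13.91
Each additional hour of daily self-study multiplies the odds of passing by about 13.91, holding Tution_Monthly constant. (This is a ratio of odds, not of probabilities, so it does not mean students are "13.91 times more likely" to pass.)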

Odds Ratio for Tution_Monthly:
𝑒^−0.7794 ≈ 0.459
A one-unit increase in Tution_Monthly reduces the odds of passing by about 54.1% (1 − 0.459), holding Self_Study_Daily constant.

In [24]:
# Predicted probability of passing for every row of the test set.
# has_constant='add' forces the intercept column to be added even if a feature
# happens to be constant in the data being scored; the default ('skip') would
# silently omit it and the columns would no longer match the fitted model.
y_pred = result.predict(sm.add_constant(x_test, has_constant='add'))

y_pred

698    9.804718e-01
577    9.449318e-01
763    9.583830e-01
790    5.528455e-07
520    9.739660e-01
           ...     
53     9.583830e-01
282    9.739660e-01
656    2.627915e-06
995    9.653695e-01
50     1.823133e-08
Length: 300, dtype: float64

In [None]:
# Turn probabilities into class labels: 1 when the probability is at least 0.5, else 0
y_pred_binary = y_pred.ge(0.5).astype(int)
y_pred_binary

In [None]:
# Confusion matrix: rows are the actual classes, columns the predicted classes
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred_binary)

         Predicted
          0    1
Actual 0  TN   FP
       1  FN   TP
TN (True Negative): The number of true negative predictions (actual class 0, predicted class 0)
FP (False Positive): The number of false positive predictions (actual class 0, predicted class 1)
FN (False Negative): The number of false negative predictions (actual class 1, predicted class 0)
TP (True Positive): The number of true positive predictions (actual class 1, predicted class 1)

In [None]:
# Accuracy: share of test observations whose predicted class equals the actual class
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred_binary)

In [None]:
# ROC curve and AUC, computed from the predicted probabilities (not the 0/1 labels)
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(f'AUC: {auc:.2f}')

# Draw the curve on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal: random-guess baseline
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()

ROC Curve (Receiver Operating Characteristic Curve): This is a plot of the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings. The ROC curve illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied.

AUC (Area Under the Curve): This metric summarizes the ROC curve into a single value representing the likelihood that the model will rank a randomly chosen positive instance higher than a randomly chosen negative one.

AUC = 0.5: No discrimination (random guessing)
0.5 < AUC < 0.7: Poor model
0.7 ≤ AUC < 0.8: Fair model
0.8 ≤ AUC < 0.9: Good model
0.9 ≤ AUC ≤ 1: Excellent model (AUC = 1 means perfect separation of the two classes)

In [None]:
# Prediction on new data

# New students to score; the columns must match the features the model was fitted on
New_students = {'Self_Study_Daily': [8, 5, 10], 'Tution_Monthly': [20, 30, 60]}
df2 = pd.DataFrame(New_students, columns=['Self_Study_Daily', 'Tution_Monthly'])

# Add the intercept column to the new data (df2).
# has_constant='add' matters here: with the default ('skip'), add_constant adds
# nothing when the frame already contains a constant column -- which is always
# the case when scoring a single student, or several students who share a
# feature value -- and predict() would then receive too few columns.
df2_sm = sm.add_constant(df2, has_constant='add')

# Predicted probability of passing for each new student
y_pred_new = result.predict(df2_sm)

# Display the predictions
print(y_pred_new)

# Class labels at the 0.5 threshold (1 = pass, 0 = fail)
y_pred_binary_new = (y_pred_new >= 0.5).astype(int)
y_pred_binary_new

In [None]:
# Checking Assumptions: Linearity
# Work on a copy so the training features themselves are left untouched
x_train_trans = x_train.copy()

# For every continuous predictor add log(x + 1) and the product x * log(x + 1)
continuous_vars = ['Self_Study_Daily', 'Tution_Monthly']
for var in continuous_vars:
    log_name, product_name = f'log_{var}', f'interaction_{var}'
    log_values = np.log(x_train[var] + 1)  # the +1 avoids log(0)
    x_train_trans[log_name] = log_values
    x_train_trans[product_name] = x_train[var] * log_values

# Prepend the intercept (constant) column
x_train_trans = sm.add_constant(x_train_trans)

# Refit the logit on the augmented design matrix
logit_model_interaction = sm.Logit(endog=y_train, exog=x_train_trans)
result_interaction = logit_model_interaction.fit()

# Print the summary to check the significance of the interaction_* terms
print(result_interaction.summary())

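If an `interaction_*` term (the predictor multiplied by its log) is statistically significant (e.g. p < 0.05), the relationship between that predictor and the log-odds of passing may not be linear, and you might need a transformation of the predictor or a nonlinear model.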

In [None]:
# Checking Assumption: Multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of x_train_sm (the design matrix, including its `const` column)
design_matrix = x_train_sm.values
vif_values = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(x_train_sm.shape[1])
]

# Assemble the results into a two-column table: variable name and its VIF
vif_data = pd.DataFrame({"Variable": x_train_sm.columns, "VIF": vif_values})

# Print the VIF results
print(vif_data)

Rule of Thumb: VIF > 10 indicates multicollinearity that may require attention.

In [None]:
# Check class balance: share of observations in each class of the target
class_shares = pf_data['Pass_Or_Fail'].value_counts(normalize=True)
print(class_shares)

If the classes are imbalanced, consider resampling techniques such as oversampling the minority class or using weighted models.