# Importing the needed libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Loading the dataset

In [2]:
raw_data=pd.read_csv('2.01.+Admittance.csv')
data=raw_data.copy()
data.head()

Unnamed: 0,SAT,Admitted
0,1363,No
1,1792,Yes
2,1954,Yes
3,1653,No
4,1593,No


# Exploring the data

In [3]:
data.isnull().sum()

SAT         0
Admitted    0
dtype: int64

In [42]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   SAT       168 non-null    int64
 1   Admitted  168 non-null    uint8
dtypes: int64(1), uint8(1)
memory usage: 1.6 KB


# Creating a dummy variable to take care of the categorical feature

In [4]:
d=pd.get_dummies(data['Admitted'])
d=pd.concat([data,d],axis=1)
d.drop(['Admitted','No'],inplace=True,axis=1)
d['Admitted']=d['Yes']
d.drop(['Yes'],inplace=True,axis=1)
d.head()

Unnamed: 0,SAT,Admitted
0,1363,0
1,1792,1
2,1954,1
3,1653,0
4,1593,0


# Declaring the dependent and independent variable

In [6]:
x=d[['SAT']]
y=d['Admitted']

# Standardizing the data

In [7]:
scaler=StandardScaler()
scaler.fit(x)

StandardScaler()

In [8]:
scaled_inputs=scaler.transform(x)

# Splitting the data into train and test

In [9]:
x_train,x_test,y_train,y_test=train_test_split(scaled_inputs,y,train_size=0.8,random_state=42)

# Fitting the model

In [10]:
from sklearn import metrics
reg=LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression()

# Checking the accuracy of the model

## Logistic Regression

In [11]:
reg.score(x_train,y_train)

0.917910447761194

In [12]:
y_predict=reg.predict(x_test)

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_predict)

0.9117647058823529

# Summary table

In [14]:
feature_name=x.columns.values
summary_table=pd.DataFrame(columns=['Feature name'],data=feature_name)
summary_table['Coefficient']=np.transpose(reg.coef_)
summary_table['Odds Ratio']=np.exp(summary_table.Coefficient)
#summary_table.sort_values(['Odds Ratio'],ascending=False)

In [15]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds Ratio
0,SAT,3.655564,38.689321


# Testing the model

In [16]:
reg.score(x_test,y_test)

0.9117647058823529

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89        14
           1       0.90      0.95      0.93        20

    accuracy                           0.91        34
   macro avg       0.91      0.90      0.91        34
weighted avg       0.91      0.91      0.91        34



# Confusion matrix

In [18]:
from sklearn.metrics import confusion_matrix
PRED=pd.DataFrame(confusion_matrix(y_test, y_predict))
PRED.columns=['Predicted 0','Predicted 1']
PRED=PRED.rename(index={0:'Actual 0',1:'Actual 1'})
PRED

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12,2
Actual 1,1,19


## Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
ml=DecisionTreeClassifier()
ml.fit(x_train,y_train)

DecisionTreeClassifier()

In [38]:
ml.score(x_train,y_train)

0.9776119402985075

In [39]:
y_predict_2=ml.predict(x_test)
accuracy_score(y_test,y_predict_2)

0.9117647058823529

## Support Vector Machine

In [31]:
from sklearn.svm import SVC
svc=SVC()
svc.fit(x_train,y_train)

SVC()

In [32]:
svc.score(x_train,y_train)

0.9328358208955224

In [33]:
y_predict_3=svc.predict(x_test)
accuracy_score(y_test,y_predict_3)

0.9117647058823529

# Conclusion

Logistic regression model, SVM and the Decision Tree classifier all have an accuracy of 91.2%. It can be said that a higher SAT score quarantees a higher admission rate