In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [None]:
df=pd.read_csv('admission.csv')
df.head()

### Basic information

In [None]:
df.info()

- All the features are numerical.

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.isna().sum()

- The features have no null values.

In [None]:
# 'Serial No.' is just a unique row identifier and carries no predictive signal, so remove it.
df=df.drop(columns='Serial No.')

# Exploratory Data Analysis

# Q1. a) Visualize the 10 random rows of the data set


In [None]:
# Fixed seed so the 10 sampled rows (and the plots built from them) are the same on every run.
sample=df.sample(n=10,random_state=42)
sample

In [None]:
plt.figure(figsize=(10,10))
sample.boxplot()
plt.show()

In [None]:
sns.pairplot(sample,diag_kind='kde',kind='reg')
plt.show()

In [None]:
plt.figure(figsize=(20,10))
sns.histplot(data=df,bins=100)
plt.show()

In [None]:
sns.heatmap(sample,annot=True,cmap='viridis')
plt.show()

# b) Generate the description for numeric variables

In [None]:
df.describe()

# c) Check the shape of the data set

In [None]:
df.shape

# d) Generate the correlation matrix

In [None]:
df.corr()

- CGPA and Chance of Admit has high correlation.

# e) Generate a correlogram

In [None]:
sns.pairplot(df,diag_kind='kde',kind='reg')
plt.show()

# Q.2	Find out the minimum and maximum values for GRE score.

In [None]:
# Count each GRE score once and order the counts by score, so every bar's height
# belongs to the score shown on its x-tick. Pairing unique() (order of appearance)
# with value_counts() (order of frequency) would attach counts to the wrong scores.
gre_counts=df['GRE Score'].value_counts().sort_index()
plt.figure(figsize=(20,10))
sns.barplot(x=gre_counts.index,y=gre_counts.values)
plt.xlabel('GRE Score', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('GRE Score Count Distribution', fontsize=15)
plt.show()

From this Barplot,
- Min_value of GRE Score is 290 
- Max_value of GRE Score is 340

In [None]:
df['GRE Score'].min()

In [None]:
df['GRE Score'].max()

# Q.3	Find out the percentage of universities for each university rating.

In [None]:
df['University Rating'].value_counts(normalize=True)

# Q.4	Convert the target variable “Chance of Admit” to categorical having values 0 and 1,such that :

Students having the “Chance of Admit” value > 0.80, are assigned value 1, and
Students having the “Chance of Admit” value ≤ 0.80, are assigned value 0 (a value of exactly 0.80 is treated as low chance)
Where 0: Low chance of Admission and 1: High chance of admission.


In [None]:
df.info()

In [None]:
# Binarise the target: values up to and including 0.80 become 0, everything else becomes 1.
at_most_threshold=df['Chance of Admit'].le(0.80)
df['Chance of Admit']=np.where(at_most_threshold,0,1)
df.head()

In [None]:
df['Chance of Admit']=df['Chance of Admit'].astype('category')
df.info()

# EDA

### Prevalence rate of the target variable.

In [None]:
# Share of each target class, in percent. The column is read directly; wrapping it
# in a DataFrame and assigning it back to itself added nothing.
df['Chance of Admit'].value_counts(normalize=True)*100

- the target variable is balanced.

typecasting:

In [None]:
df['University Rating'].value_counts()

In [None]:
df['SOP'].value_counts()

In [None]:
df['LOR'].value_counts()

In [None]:
df['Research'].value_counts()

In [None]:
# These four columns take only a handful of discrete levels, so store them as 'category'.
for discrete_col in ('University Rating','SOP','LOR','Research'):
    df[discrete_col]=df[discrete_col].astype('category')

In [None]:
df.info()

In [None]:
x=df.iloc[:,:-1]
x.head()

In [None]:
y=df.iloc[:,-1]
y

In [None]:
# Variance inflation factor per feature.
# x contains 'category' columns, so x.values is not a plain float matrix; cast every
# column to float before handing the array to statsmodels.
x_numeric=x.astype(float).to_numpy()
vif=pd.DataFrame()
vif['Feature']=x.columns
vif['VIF'] = [variance_inflation_factor(x_numeric,i) for i in range(x_numeric.shape[1])]
vif

- GRE Score has a high multicollinearity (VIF) value.

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

In [None]:
# Prevalence rate of ytrain: share of each class in the training labels, in percent.
ytrain.value_counts(normalize=True)*100

In [None]:
# Prevalence rate of ytest: share of each class in the test labels, in percent.
ytest.value_counts(normalize=True)*100

- Both ytrain and ytest have the same class proportions, because the split is stratified on y.

### Feature Scaling. 

In [None]:
# Learn each feature's min/max from the training split only, then apply that same
# mapping to both splits.
scale = MinMaxScaler()
scale.fit(xtrain)
xtrain = scale.transform(xtrain)
xtest = scale.transform(xtest)

In [None]:
xtrain

### Feature Importance using Random Forest

In [None]:
# Seed the forest so the importance ranking shown in this plot is the same on every run.
pd.DataFrame(data= RandomForestClassifier(random_state=42).fit(x,y).feature_importances_,index=x.columns
             ,columns=['Feature Importance']).plot.barh();

- CGPA is the most important feature for the target variable.
- Research is the least important feature for the target variable.

In [None]:
plt.figure(figsize=(10,10))
sns.violinplot(x=df['CGPA'],y=y.values)
plt.xlabel('CGPA')
plt.ylabel('Chance of Admit')
plt.show()

# Logistic Regression Model

In [None]:
lo=LogisticRegression(solver='liblinear',max_iter=500,class_weight='balanced')

In [None]:
model1=lo.fit(xtrain,ytrain)

In [None]:
ytrain_pred_lo=lo.predict(xtrain)
ytrain_pred_lo

In [None]:
ytest_pred_lo=lo.predict(xtest)
ytest_pred_lo

In [None]:
def metrics(a,b):
    """Print the confusion matrix, classification report and accuracy score.

    a -- true labels
    b -- predicted labels

    Both are forwarded, in that order, to each scikit-learn scorer.
    Nothing is returned; the three results are printed one after another.
    """
    report_sections=(
        ('--> Confusion Matrix:\n',confusion_matrix),
        ('\n--> Classification Report:\n',classification_report),
        ('\n--> Accuracy Score:\n',accuracy_score),
    )
    # Compute and print each section in turn, under its heading.
    for heading,scorer in report_sections:
        print(heading,scorer(a,b))

### Logistic Regression Report.

In [None]:
metrics(ytrain,ytrain_pred_lo)

In [None]:
metrics(ytest,ytest_pred_lo)

In [None]:
crossval_lo=cross_val_score(lo,xtrain,ytrain,cv=10)
crossval_lo_mean=crossval_lo.mean()
print('cross_val_score for logistic: ',crossval_lo)
print('\nMean Score: ',crossval_lo_mean)

In [None]:
sns.boxplot(crossval_lo)

# Q.5	Build a Decision Tree classifier, to predict whether a student has a low or high chance of admission to a chosen university. Perform Hyperparameter Tuning to improve the accuracy of the model.

In [None]:
# max_features='auto' was an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so 'sqrt' is spelled out. random_state pins the random feature
# sub-sampling so the fitted tree and its scores are repeatable.
dt=DecisionTreeClassifier(max_depth=20,min_samples_split=2,ccp_alpha=0.01,criterion='entropy',max_features='sqrt',class_weight='balanced',random_state=42)

In [None]:
model2=dt.fit(xtrain,ytrain)

In [None]:
ytrain_pred_dt=dt.predict(xtrain)
ytrain_pred_dt

In [None]:
ytest_pred_dt=dt.predict(xtest)
ytest_pred_dt

### Decision Tree Report

In [None]:
metrics(ytrain,ytrain_pred_dt)

In [None]:
metrics(ytest,ytest_pred_dt)

In [None]:
crossval_dt=cross_val_score(dt,xtrain,ytrain,cv=10)
crossval_dt_mean=crossval_dt.mean()
print('cross_val_score for Decision Tree: ',crossval_dt)
print('\nMean Score: ',crossval_dt_mean)

In [None]:
sns.boxplot(crossval_dt)

# Q6. Build a Random Forest classifier, to predict whether a student has a low or high chance of admission to a chosen university.

In [None]:
# max_features='auto' was an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so 'sqrt' is spelled out. random_state pins bootstrapping and
# feature sub-sampling so the forest and its scores are repeatable.
rf=RandomForestClassifier(max_depth=15,min_samples_split=3,criterion='entropy',class_weight='balanced',max_features='sqrt', n_estimators=10,random_state=42)

In [None]:
model3=rf.fit(xtrain,ytrain)

In [None]:
ytrain_pred_rf=rf.predict(xtrain)
ytrain_pred_rf

In [None]:
ytest_pred_rf=rf.predict(xtest)
ytest_pred_rf

### Random Forest Report.

In [None]:
metrics(ytrain,ytrain_pred_rf)

In [None]:
metrics(ytest,ytest_pred_rf)

In [None]:
crossval_rf=cross_val_score(rf,xtrain,ytrain,cv=10)
crossval_rf_mean=crossval_rf.mean()
print('cross_val_score for Random Forest: ',crossval_rf)
print('\nMean Score: ',crossval_rf_mean)

In [None]:
sns.boxplot(crossval_rf)

# Q.7	Also use Ensemble Modelling techniques, to predict whether a student has a low or high chance of admission to a chosen university.

In [None]:
bg=BaggingClassifier(max_samples=100,n_estimators=20,max_features=6,random_state=42)

In [None]:
model4=bg.fit(xtrain,ytrain)

In [None]:
ytrain_pred_bg=bg.predict(xtrain)
ytrain_pred_bg

In [None]:
ytest_pred_bg=bg.predict(xtest)
ytest_pred_bg

### Bagging Classifier Report

In [None]:
metrics(ytrain,ytrain_pred_bg)

In [None]:
metrics(ytest,ytest_pred_bg)

In [None]:
# 10-fold cross-validated accuracy of the bagging classifier on the training split.
crossval_bg=cross_val_score(bg,xtrain,ytrain,cv=10)
crossval_bg_mean=crossval_bg.mean()
# The label names the model actually scored here (it previously said "Random Forest").
print('cross_val_score for Bagging Classifier: ',crossval_bg)
print('\nMean Score: ',crossval_bg_mean)

# Q.8	Compare all of the models and justify your choice about the optimum model.

- Here, the models are compared using their 10-fold cross-validation accuracy on the training set.

In [None]:
crossval=pd.DataFrame({'logistic Accuracy':crossval_lo,'Decision Tree Accuracy':crossval_dt,'Random Forest Accuracy':crossval_rf,'Bagging Classifier Accuracy':crossval_bg})
crossval

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(data=crossval)
plt.show()

In [None]:
# Mean cross-validation accuracy per model. Each score must come from the model named
# in the same position; the Logistic row uses the logistic mean, not the decision-tree one.
comparison=pd.DataFrame({'Models':['Logistic','Decision Tree','Random Forest','Bagging Classifier'],'Score':[crossval_lo_mean,crossval_dt_mean,crossval_rf_mean,crossval_bg_mean]})


comparison.sort_values(by='Score', ascending=False)

- Here, accuracy is the metric used for comparison, so the model with the highest score is considered the better model.
- So, Random Forest would be the Better Model.