# This project predicts the admission of a student based on different features including university rating, student’s undergrade GPA, GRE score, research experience and etc. This predicts that how much chances are there that the student will get admission in his selected university or not.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [6]:
df['Research'].value_counts()

1    280
0    220
Name: Research, dtype: int64

In [7]:
df.drop('Serial No.', axis=1, inplace=True)
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


# EDA process

In [9]:
df.info() # Get to know about data 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          500 non-null    int64  
 1   TOEFL Score        500 non-null    int64  
 2   University Rating  500 non-null    int64  
 3   SOP                500 non-null    float64
 4   LOR                500 non-null    float64
 5   CGPA               500 non-null    float64
 6   Research           500 non-null    int64  
 7   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB


In [8]:
df.isnull().sum()  # there is no null values

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [10]:
# Some statistics
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [12]:
df.corr() # we can see all the features is important in prediction  of chances of admit  

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
GRE Score,1.0,0.8272,0.635376,0.613498,0.524679,0.825878,0.563398,0.810351
TOEFL Score,0.8272,1.0,0.649799,0.64441,0.541563,0.810574,0.467012,0.792228
University Rating,0.635376,0.649799,1.0,0.728024,0.608651,0.705254,0.427047,0.690132
SOP,0.613498,0.64441,0.728024,1.0,0.663707,0.712154,0.408116,0.684137
LOR,0.524679,0.541563,0.608651,0.663707,1.0,0.637469,0.372526,0.645365
CGPA,0.825878,0.810574,0.705254,0.712154,0.637469,1.0,0.501311,0.882413
Research,0.563398,0.467012,0.427047,0.408116,0.372526,0.501311,1.0,0.545871
Chance of Admit,0.810351,0.792228,0.690132,0.684137,0.645365,0.882413,0.545871,1.0


In [16]:
d=df.groupby(by = 'University Rating').mean()  # we can see here that as university rating increses chance of admission incresaes
d

Unnamed: 0_level_0,GRE Score,TOEFL Score,SOP,LOR,CGPA,Research,Chance of Admit
University Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,304.911765,100.205882,1.941176,2.426471,7.798529,0.294118,0.562059
2,309.134921,103.444444,2.68254,2.956349,8.177778,0.293651,0.626111
3,315.030864,106.314815,3.308642,3.401235,8.500123,0.537037,0.702901
4,323.304762,110.961905,4.0,3.947619,8.936667,0.780952,0.801619
5,327.890411,113.438356,4.479452,4.40411,9.278082,0.876712,0.888082


# Data preparation

In [17]:
X = df.drop(columns=['Chance of Admit'])
y = df['Chance of Admit']

In [18]:
X.shape

(500, 7)

In [19]:
y.shape


(500,)

In [22]:
X = np.array(X)
y = np.array(y)


In [23]:
y = y.reshape(-1,1)
y.shape

(500, 1)

In [20]:
# Scaling data before training the model
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)

In [24]:
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)

In [25]:
# split the data into train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Building model using multiple Regression 

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score , r2_score


In [28]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [29]:
accuracy_score = model.score(X_test, y_test)
accuracy_score

0.8781781272549312

# Building model using Decision tree Regression 

In [30]:
from sklearn.tree import DecisionTreeRegressor
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train, y_train)

DecisionTreeRegressor()

In [32]:
accuracy_decisionTree = model_tree.score(X_test, y_test)
accuracy_decisionTree

0.6463780225543163

# Building model using Rnadom Forest Regression 


In [33]:
from sklearn.ensemble import RandomForestRegressor
randomForest_model = RandomForestRegressor(n_estimators=100, max_depth=10)
randomForest_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10)

In [34]:
accuracy_randomforest = randomForest_model.score(X_test, y_test)
accuracy_randomforest

0.8297759986512457

In [35]:
# we are getting the best accuracy by using Multiple Linear Regression

In [36]:
import pickle

In [37]:
pickle.dump(model,open('model.pkl', 'wb'))

In [38]:
model = pickle.load(open('model.pkl', 'rb'))


In [43]:
chance=model.predict([[320, 110, 1, 5, 5, 9, 1]])
chance[0][0]/100 

0.5896120647874981