In [5]:
# Building a Machine Learning model to predict the admission in the university.
# Data obtained from https://www.kaggle.com/datasets/akshaydattatraykhare/data-for-admission-in-the-university

import pandas as pd

df = pd.read_csv("adm_data.csv")

df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [8]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,115.614301,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,100.75,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,200.5,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,300.25,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [12]:
# Dropping the useless column as it isn't the determinant of the outcome of the Chance of Admit.

df2 = df.drop(['Serial No.'], axis = 1, inplace = True)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [15]:
df['Chance of Admit ']

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
395    0.82
396    0.84
397    0.91
398    0.67
399    0.95
Name: Chance of Admit , Length: 400, dtype: float64

In [18]:
# Fixing the Chance of Admit column as it contains an extra space at the end of , "Chance of Admit "

df.rename(columns = {
    'Chance of Admit ':'Chance of Admit'
}, inplace=True)
df['Chance of Admit']

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
395    0.82
396    0.84
397    0.91
398    0.67
399    0.95
Name: Chance of Admit, Length: 400, dtype: float64

In [19]:
# Now creating input and output dataframes named x and y.

x = df.drop(["Chance of Admit"], axis = 1, inplace = False)

x.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,1,337,118,4,4.5,4.5,9.65,1
1,2,324,107,4,4.0,4.5,8.87,1
2,3,316,104,3,3.0,3.5,8.0,1
3,4,322,110,3,3.5,2.5,8.67,1
4,5,314,103,2,2.0,3.0,8.21,0


In [20]:
y = df["Chance of Admit"] 
y.head()

0    0.92
1    0.76
2    0.72
3    0.80
4    0.65
Name: Chance of Admit, dtype: float64

In [36]:
x.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
dtype: int64

In [43]:
print(x.columns)

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research'],
      dtype='object')


In [42]:
# Scaling th numerial field columns between 0 and 1.
# Scaling helps to create a uniformity before the creation of the algorithm.

from sklearn.preprocessing import MinMaxScaler

scalerX = MinMaxScaler(feature_range = (0,1))
x[x.columns] = scalerX.fit_transform(x[x.columns])

x.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,0.0,0.94,0.928571,0.75,0.875,0.875,0.913462,1.0
1,0.002506,0.68,0.535714,0.75,0.75,0.875,0.663462,1.0
2,0.005013,0.52,0.428571,0.5,0.5,0.625,0.384615,1.0
3,0.007519,0.64,0.642857,0.5,0.625,0.375,0.599359,1.0
4,0.010025,0.48,0.392857,0.25,0.25,0.5,0.451923,0.0


In [52]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [55]:
x_test.count()

Serial No.           80
GRE Score            80
TOEFL Score          80
University Rating    80
SOP                  80
LOR                  80
CGPA                 80
Research             80
dtype: int64

In [63]:
# Applying the first algorithm for regression 
# There are two types of algorithms, regression and classification;

from sklearn.linear_model import LinearRegression

model = LinearRegression()

# Fitting the x_train and y_train on the model

model.fit(x_train, y_train)

In [64]:
# Finding the score:

score = model.score(x_test, y_test)
print(score)

0.7791691309134295


In [75]:
# Finding gthe prediction

y_predict = model.predict(x_test)
print(y_predict[0:5])

[0.67817016 0.91215501 0.66520601 0.88387997 0.83990021]


In [76]:
df1 = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_predict
})

In [78]:
df1['Actual'].shape

(80,)

In [79]:
df1['Predicted'].shape

(80,)

In [85]:
# Finding the absolute error

df1['Error'] = df1['Actual'] - df1['Predicted']
# df1['Error'] = df1.abs
df1.to_csv('Output.csv')

In [82]:
df1.head()

Unnamed: 0,Actual,Predicted,Error
219,0.74,0.67817,0.06183
187,0.93,0.912155,0.017845
342,0.58,0.665206,-0.085206
176,0.9,0.88388,0.01612
97,0.86,0.8399,0.0201


In [83]:
# Finding the absolute mean error

absolute_mean_error = df1['Error'].mean()
print('Absolute Mean Error:', absolute_mean_error)

Absolute Mean Error: -0.0031932439845001348


In [86]:
from sklearn.metrics import mean_absolute_error

e = mean_absolute_error(y_test, y_predict)
print(e)

0.04733465274633346


In [95]:
# Deployment of the Mdoel

gre = int(input('INPUT THE GRE SCORE:'))
TOEFL = int(input('INPUT THE toefl SCORE:'))
UNIV = int(input('INPUT THE uNIVERSITY rATING SCORE:'))
SOP = int(input('INPUT THE SOP SCORE:'))
LOR  = int(input('INPUT THE LOR SCORE:'))
CGPA = int(input('INPUT THE CGPA SCORE:'))
Research = int(input('INPUT THE Research SCORE:'))

data = [[gre, TOEFL, UNIV, SOP, LOR, CGPA, Research]]
list = [gre, TOEFL, UNIV, SOP, LOR, CGPA, Research]

INPUT THE GRE SCORE:310
INPUT THE toefl SCORE:110
INPUT THE uNIVERSITY rATING SCORE:3
INPUT THE SOP SCORE:4
INPUT THE LOR SCORE:3
INPUT THE CGPA SCORE:8
INPUT THE Research SCORE:0


In [93]:
# SIR SKIPPED THIS PART

newdf = pd.DataFrame(data, columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research'])
newdf[newdf.columns] = scalerX.fit_transform(newdf[newdf.])


In [96]:
data

[[310, 110, 3, 4, 3, 8, 0]]

In [97]:
list

[310, 110, 3, 4, 3, 8, 0]

In [102]:
new_df = pd.DataFrame([list])
y_predict_new = model.predict(newdf)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- LOR
Feature names seen at fit time, yet now missing:
- LOR 
- Serial No.


In [None]:
# Applying 2nd algorithm for the regression

# Random Forst Regressor