### Load Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df=pd.read_csv("./dataset/engineering.csv")
df.sample(6)

Unnamed: 0,institute_id,name,tlr,rpc,go,oi,perception,city,state,rank
52,IR-E-U-0395,Indian Institute of Technology Jodhpur,67.71,26.19,56.17,59.29,8.05,Jodhpur,Rajasthan,53
192,IR-E-U-0129,Dharmsinh Desai University,49.4,2.29,59.12,36.87,2.69,Nadiad,Gujarat,193
26,IR-E-U-0334,Visvesvaraya National Institute of Technology,67.7,45.19,69.73,53.77,15.56,Nagpur,Maharashtra,27
96,IR-E-C-18154,University College of Engineering,58.25,10.41,59.54,50.97,1.63,Kakinada,Andhra Pradesh,97
72,IR-E-C-1262,B M S College of Engineering,62.07,11.92,55.53,52.58,17.79,Bengaluru,Karnataka,73
181,IR-E-U-0455,Indian Institute of Information Technology De...,35.51,7.8,57.62,53.05,18.15,Chennai,Tamil Nadu,182


In [3]:
df.shape

(200, 10)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   institute_id  200 non-null    object 
 1   name          200 non-null    object 
 2   tlr           200 non-null    float64
 3   rpc           200 non-null    float64
 4   go            200 non-null    float64
 5   oi            200 non-null    float64
 6   perception    200 non-null    float64
 7   city          200 non-null    object 
 8   state         200 non-null    object 
 9   rank          200 non-null    int64  
dtypes: float64(5), int64(1), object(4)
memory usage: 15.8+ KB


In [5]:
df.isnull().sum()

institute_id    0
name            0
tlr             0
rpc             0
go              0
oi              0
perception      0
city            0
state           0
rank            0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,tlr,rpc,go,oi,perception,rank
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,59.05015,20.30735,56.7036,51.8737,14.21465,100.475
std,10.152778,19.937026,11.10281,6.853437,19.262625,57.866936
min,35.51,0.46,13.06,33.8,0.0,1.0
25%,52.535,5.3975,50.1125,47.36,2.5575,50.75
50%,57.525,13.35,55.07,51.855,6.65,100.5
75%,64.3925,30.6425,63.095,56.0025,17.79,150.25
max,95.42,96.15,89.65,75.7,100.0,200.0


In [7]:
df.duplicated().sum()

0

In [8]:
df.corr()

  df.corr()


Unnamed: 0,tlr,rpc,go,oi,perception,rank
tlr,1.0,0.644496,0.433223,0.441925,0.685677,-0.743884
rpc,0.644496,1.0,0.610339,0.229635,0.812273,-0.798349
go,0.433223,0.610339,1.0,0.274123,0.670738,-0.655488
oi,0.441925,0.229635,0.274123,1.0,0.266696,-0.426902
perception,0.685677,0.812273,0.670738,0.266696,1.0,-0.693651
rank,-0.743884,-0.798349,-0.655488,-0.426902,-0.693651,1.0


In [9]:
df.corr()['rank']

  df.corr()['rank']


tlr          -0.743884
rpc          -0.798349
go           -0.655488
oi           -0.426902
perception   -0.693651
rank          1.000000
Name: rank, dtype: float64

In [10]:
clean_df=df.drop(["institute_id","name","city","state"],axis=True)
clean_df.sample(6)

Unnamed: 0,tlr,rpc,go,oi,perception,rank
149,53.05,6.12,52.87,46.82,5.21,150
163,57.13,2.86,49.94,45.99,2.16,164
196,46.11,7.36,50.73,48.76,0.55,197
48,69.35,19.73,52.14,48.69,51.77,49
193,41.95,12.14,53.25,43.08,0.55,194
143,41.84,24.72,48.59,39.99,0.0,144


### Split Data

In [11]:
X = clean_df.drop('rank', axis=1)
y = clean_df['rank']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (200, 5)
Shape of y =  (200,)


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (160, 5)
Shape of y_train =  (160,)
Shape of X_test =  (40, 5)
Shape of y_test =  (40,)


### Model Traning with Support Vector Regressor

In [13]:
from sklearn.svm import SVR

In [14]:
#Model Traning with ernel='linear'
svr_linear = SVR(kernel='linear')
svr_linear.fit(X_train, y_train)
svr_linear.score(X_test, y_test)

0.802427660287366

In [15]:
y_pred=svr_linear.predict(X_test)

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  691.1691682573924
RMSE =  26.290096391177276


### Model Traning with Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
regressorRFR = RandomForestRegressor(n_estimators=100, criterion='squared_error')
regressorRFR.fit(X_train, y_train)

In [20]:
regressorRFR.score(X_test, y_test)

0.9322706585949105

In [21]:
y_pred2=regressorRFR.predict(X_test)

In [22]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred2)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  236.93819
RMSE =  15.392796691959521


In [30]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(regressorRFR, X_train, y_train, cv=5, ).mean()

0.9229691852648199

In [23]:
int(regressorRFR.predict([X_test.iloc[18, :]])[0].round())



119

In [24]:
y_test.iloc[18]

129

### Save the Model

In [25]:
import joblib
#joblib.dump(regressorRFR, "college_rank_predictor.pkl")

In [26]:
model = joblib.load("college_rank_predictor.pkl")

In [27]:
model.predict([X_test.iloc[18, :]])[0]



120.02