In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Grades.csv')
df

Unnamed: 0,Seat No.,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
0,CS-97001,B-,D+,C-,C,C-,D+,D,C-,B-,...,C-,C-,C-,C-,A-,A,C-,B,A-,2.205
1,CS-97002,A,D,D+,D,B-,C,D,A,D+,...,D+,D,C,D,A-,B-,C,C,B,2.008
2,CS-97003,A,B,A,B-,B+,A,B-,B+,A-,...,B,B,A,C,A,A,A,A-,A,3.608
3,CS-97004,D,C+,D+,D,D,A-,D+,C-,D,...,D+,C,D+,C-,B-,B,C+,C+,C+,1.906
4,CS-97005,A-,A-,A-,B+,A,A,A-,B+,A,...,B-,B+,B+,B-,A-,A,A-,A-,A,3.448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,CS-97567,B,A,A,A-,A+,A,A-,A-,A+,...,A-,A-,A,A,A,B+,B+,B,A,3.798
567,CS-97568,A+,A,A,A,A,A,A,A-,A,...,B+,B+,A,A,A-,B,A-,C,A-,3.772
568,CS-97569,B,A,A-,B+,A,A,A,A,A,...,A-,B,A,B+,A,C,B+,A-,A-,3.470
569,CS-97570,A,B+,D,A,D,D+,B-,C-,B-,...,D,B,B,C-,D,C,B,B-,C,2.193


Looks like all are categorical columns except CGPA

In [3]:
# Let's confirm again
df.dtypes

Seat No.      object
PH-121        object
HS-101        object
CY-105        object
HS-105/12     object
MT-111        object
CS-105        object
CS-106        object
EL-102        object
EE-119        object
ME-107        object
CS-107        object
HS-205/20     object
MT-222        object
EE-222        object
MT-224        object
CS-210        object
CS-211        object
CS-203        object
CS-214        object
EE-217        object
CS-212        object
CS-215        object
MT-331        object
EF-303        object
HS-304        object
CS-301        object
CS-302        object
TC-383        object
MT-442        object
EL-332        object
CS-318        object
CS-306        object
CS-312        object
CS-317        object
CS-403        object
CS-421        object
CS-406        object
CS-414        object
CS-419        object
CS-423        object
CS-412        object
CGPA         float64
dtype: object

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 43 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seat No.   571 non-null    object 
 1   PH-121     571 non-null    object 
 2   HS-101     571 non-null    object 
 3   CY-105     570 non-null    object 
 4   HS-105/12  570 non-null    object 
 5   MT-111     569 non-null    object 
 6   CS-105     571 non-null    object 
 7   CS-106     569 non-null    object 
 8   EL-102     569 non-null    object 
 9   EE-119     569 non-null    object 
 10  ME-107     569 non-null    object 
 11  CS-107     569 non-null    object 
 12  HS-205/20  566 non-null    object 
 13  MT-222     566 non-null    object 
 14  EE-222     564 non-null    object 
 15  MT-224     564 non-null    object 
 16  CS-210     564 non-null    object 
 17  CS-211     566 non-null    object 
 18  CS-203     566 non-null    object 
 19  CS-214     565 non-null    object 
 20  EE-217    

In [5]:
#Dropping the nominal data
df.drop('Seat No.', axis = 1, inplace = True)

In [6]:
#check for nulls in the dataset
df.isnull().sum()

PH-121        0
HS-101        0
CY-105        1
HS-105/12     1
MT-111        2
CS-105        0
CS-106        2
EL-102        2
EE-119        2
ME-107        2
CS-107        2
HS-205/20     5
MT-222        5
EE-222        7
MT-224        7
CS-210        7
CS-211        5
CS-203        5
CS-214        6
EE-217        6
CS-212        6
CS-215        6
MT-331        9
EF-303       10
HS-304       10
CS-301       10
CS-302       10
TC-383       10
MT-442       10
EL-332        9
CS-318        9
CS-306        9
CS-312       10
CS-317       12
CS-403       12
CS-421       12
CS-406       85
CS-414       13
CS-419       13
CS-423       14
CS-412       79
CGPA          0
dtype: int64

CGPA has no null values and it is the target column. So seperate it from all features.

In [7]:
X = df.drop('CGPA', axis = 1)
y = df['CGPA']


In [8]:
X 

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
0,B-,D+,C-,C,C-,D+,D,C-,B-,C-,...,C,C-,C-,C-,C-,A-,A,C-,B,A-
1,A,D,D+,D,B-,C,D,A,D+,D,...,D,D+,D,C,D,A-,B-,C,C,B
2,A,B,A,B-,B+,A,B-,B+,A-,A-,...,A-,B,B,A,C,A,A,A,A-,A
3,D,C+,D+,D,D,A-,D+,C-,D,C+,...,C-,D+,C,D+,C-,B-,B,C+,C+,C+
4,A-,A-,A-,B+,A,A,A-,B+,A,A-,...,A-,B-,B+,B+,B-,A-,A,A-,A-,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,B,A,A,A-,A+,A,A-,A-,A+,B+,...,B+,A-,A-,A,A,A,B+,B+,B,A
567,A+,A,A,A,A,A,A,A-,A,A,...,A-,B+,B+,A,A,A-,B,A-,C,A-
568,B,A,A-,B+,A,A,A,A,A,B,...,B+,A-,B,A,B+,A,C,B+,A-,A-
569,A,B+,D,A,D,D+,B-,C-,B-,C-,...,B+,D,B,B,C-,D,C,B,B-,C


In [9]:
y

0      2.205
1      2.008
2      3.608
3      1.906
4      3.448
       ...  
566    3.798
567    3.772
568    3.470
569    2.193
570    1.753
Name: CGPA, Length: 571, dtype: float64

Let's fill the nulls using SimpleImputer

In [10]:
X.shape

(571, 41)

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
imputer = SimpleImputer(strategy="most_frequent")

imp_X = imputer.fit_transform(X)
imp_X

array([['B-', 'D+', 'C-', ..., 'C-', 'B', 'A-'],
       ['A', 'D', 'D+', ..., 'C', 'C', 'B'],
       ['A', 'B', 'A', ..., 'A', 'A-', 'A'],
       ...,
       ['B', 'A', 'A-', ..., 'B+', 'A-', 'A-'],
       ['A', 'B+', 'D', ..., 'B', 'B-', 'C'],
       ['C', 'D', 'D', ..., 'B+', 'D', 'C-']], dtype=object)

In [13]:
cols = X.columns.values.tolist()
cols

['PH-121',
 'HS-101',
 'CY-105',
 'HS-105/12',
 'MT-111',
 'CS-105',
 'CS-106',
 'EL-102',
 'EE-119',
 'ME-107',
 'CS-107',
 'HS-205/20',
 'MT-222',
 'EE-222',
 'MT-224',
 'CS-210',
 'CS-211',
 'CS-203',
 'CS-214',
 'EE-217',
 'CS-212',
 'CS-215',
 'MT-331',
 'EF-303',
 'HS-304',
 'CS-301',
 'CS-302',
 'TC-383',
 'MT-442',
 'EL-332',
 'CS-318',
 'CS-306',
 'CS-312',
 'CS-317',
 'CS-403',
 'CS-421',
 'CS-406',
 'CS-414',
 'CS-419',
 'CS-423',
 'CS-412']

In [14]:
X = pd.DataFrame(data = imp_X, columns = cols)
X

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
0,B-,D+,C-,C,C-,D+,D,C-,B-,C-,...,C,C-,C-,C-,C-,A-,A,C-,B,A-
1,A,D,D+,D,B-,C,D,A,D+,D,...,D,D+,D,C,D,A-,B-,C,C,B
2,A,B,A,B-,B+,A,B-,B+,A-,A-,...,A-,B,B,A,C,A,A,A,A-,A
3,D,C+,D+,D,D,A-,D+,C-,D,C+,...,C-,D+,C,D+,C-,B-,B,C+,C+,C+
4,A-,A-,A-,B+,A,A,A-,B+,A,A-,...,A-,B-,B+,B+,B-,A-,A,A-,A-,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,B,A,A,A-,A+,A,A-,A-,A+,B+,...,B+,A-,A-,A,A,A,B+,B+,B,A
567,A+,A,A,A,A,A,A,A-,A,A,...,A-,B+,B+,A,A,A-,B,A-,C,A-
568,B,A,A-,B+,A,A,A,A,A,B,...,B+,A-,B,A,B+,A,C,B+,A-,A-
569,A,B+,D,A,D,D+,B-,C-,B-,C-,...,B+,D,B,B,C-,D,C,B,B-,C


confirm if all the nulls are imputed 

In [15]:
X.isnull().sum()

PH-121       0
HS-101       0
CY-105       0
HS-105/12    0
MT-111       0
CS-105       0
CS-106       0
EL-102       0
EE-119       0
ME-107       0
CS-107       0
HS-205/20    0
MT-222       0
EE-222       0
MT-224       0
CS-210       0
CS-211       0
CS-203       0
CS-214       0
EE-217       0
CS-212       0
CS-215       0
MT-331       0
EF-303       0
HS-304       0
CS-301       0
CS-302       0
TC-383       0
MT-442       0
EL-332       0
CS-318       0
CS-306       0
CS-312       0
CS-317       0
CS-403       0
CS-421       0
CS-406       0
CS-414       0
CS-419       0
CS-423       0
CS-412       0
dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder
lab_enc = LabelEncoder() 

Let's encode all columns

In [17]:
X.columns

Index(['PH-121', 'HS-101', 'CY-105', 'HS-105/12', 'MT-111', 'CS-105', 'CS-106',
       'EL-102', 'EE-119', 'ME-107', 'CS-107', 'HS-205/20', 'MT-222', 'EE-222',
       'MT-224', 'CS-210', 'CS-211', 'CS-203', 'CS-214', 'EE-217', 'CS-212',
       'CS-215', 'MT-331', 'EF-303', 'HS-304', 'CS-301', 'CS-302', 'TC-383',
       'MT-442', 'EL-332', 'CS-318', 'CS-306', 'CS-312', 'CS-317', 'CS-403',
       'CS-421', 'CS-406', 'CS-414', 'CS-419', 'CS-423', 'CS-412'],
      dtype='object')

In [18]:
for i in X.columns:
    X[i]= lab_enc.fit_transform(X[i])
X    

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
0,5,10,8,6,8,10,9,8,5,8,...,6,8,8,8,8,2,0,8,3,2
1,0,9,10,9,5,6,9,0,10,9,...,9,10,9,6,9,2,5,6,6,3
2,0,3,0,5,4,0,5,4,2,2,...,2,3,3,0,6,0,0,0,2,0
3,9,7,10,9,9,2,10,8,9,7,...,8,10,6,10,8,5,3,7,7,7
4,2,2,2,4,0,0,2,4,0,2,...,2,5,4,4,5,2,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,3,0,0,2,1,0,2,2,1,4,...,4,2,2,0,0,0,4,4,3,0
567,1,0,0,0,0,0,0,2,0,0,...,2,4,4,0,0,2,3,2,6,2
568,3,0,2,4,0,0,0,0,0,3,...,4,2,3,0,4,0,6,4,2,2
569,0,4,9,0,9,10,5,8,5,8,...,4,9,3,3,8,9,6,3,5,6


In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.20,random_state=420)

In [20]:
X_train

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
453,4,9,0,0,5,0,5,5,0,2,...,9,3,7,3,12,12,8,6,5,12
320,9,10,9,6,6,9,3,9,9,10,...,7,0,6,6,5,4,0,2,3,2
281,12,10,11,11,11,9,2,9,11,11,...,2,1,5,0,3,2,0,2,2,2
131,2,8,4,5,10,2,8,4,2,2,...,3,8,5,6,6,2,0,5,5,2
108,2,6,0,2,4,2,10,2,0,2,...,4,8,3,5,5,2,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,2,2,0,4,5,0,2,3,6,6,...,0,0,2,2,5,0,2,0,2,0
154,2,2,2,5,3,2,5,2,4,9,...,2,10,6,10,7,4,0,8,5,3
185,0,4,2,0,2,2,2,5,5,2,...,0,2,0,0,5,2,2,2,2,2
390,6,6,3,5,4,5,5,4,6,8,...,7,1,6,2,5,2,0,3,2,2


In [21]:
y_train

453    2.935
320    2.266
281    1.092
131    2.823
108    3.195
       ...  
266    3.540
154    2.603
185    3.585
390    2.750
72     3.458
Name: CGPA, Length: 456, dtype: float64

In [22]:
X_test

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
439,4,7,0,2,4,2,5,2,2,3,...,6,3,6,0,10,5,5,3,6,5
264,0,8,2,1,2,1,0,0,2,2,...,0,2,0,0,2,0,0,2,2,2
121,7,3,5,8,10,5,7,7,5,4,...,8,8,9,10,9,2,4,6,8,6
96,0,3,1,3,0,1,0,0,0,0,...,0,2,0,0,2,0,1,0,0,0
250,0,2,2,2,2,1,2,2,5,0,...,3,3,3,6,5,2,2,2,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0,5,4,0,2,0,0,4,4,2,...,2,6,7,4,5,2,2,2,5,0
550,5,3,0,6,0,2,10,2,9,9,...,11,0,6,4,5,6,10,6,11,11
0,5,10,8,6,8,10,9,8,5,8,...,6,8,8,8,8,2,0,8,3,2
427,1,0,1,0,1,0,0,0,0,0,...,0,1,1,0,1,1,2,0,1,0


In [23]:
y_test

439    3.019
264    3.755
121    2.028
96     3.919
250    3.328
       ...  
177    3.369
550    2.359
0      2.205
427    3.985
381    2.800
Name: CGPA, Length: 115, dtype: float64

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [25]:
lr = LinearRegression() 

In [26]:
lr.fit(X_train, y_train)

In [27]:
lr.score(X_train, y_train)

0.9296336005488393

In [28]:
y_lr_pred = lr.predict(X_test)
y_lr_pred

array([3.07692974, 3.79040901, 2.00355206, 4.02175198, 3.35714238,
       2.24774271, 3.26059021, 2.84996546, 3.21453028, 3.55972606,
       2.84680234, 3.41421439, 3.53185643, 2.5521171 , 3.90072032,
       1.97711564, 3.09713581, 3.98776826, 3.71111935, 2.49392893,
       2.67759432, 2.96904649, 3.02436245, 2.44118947, 1.96942848,
       2.58251538, 3.59857067, 2.93674335, 2.23340758, 2.98653503,
       3.40717026, 2.5565389 , 2.01489466, 2.00470383, 3.79261911,
       2.46843534, 3.04200998, 2.53480921, 3.32757012, 2.96451154,
       2.78947051, 2.81150127, 3.05987132, 2.93573799, 2.5040355 ,
       1.97578558, 3.71317009, 2.94962576, 2.76994886, 3.26541046,
       2.08364714, 3.012055  , 2.35164032, 3.84116974, 2.89437858,
       3.41088229, 3.33245343, 3.17553155, 3.68900536, 2.74103121,
       3.77551664, 3.35028876, 3.79742859, 1.91774901, 3.00928948,
       2.54109901, 3.12633566, 2.53070292, 3.13471716, 3.3193351 ,
       3.13020421, 3.12125936, 3.58751786, 2.6415814 , 3.16644

Let's check if LinearRegression model overfitting or not using Lasso and Ridge regression

In [29]:
from sklearn.metrics import r2_score, mean_squared_error

In [30]:
print('R2 score for linear regression model:', r2_score(y_test, y_lr_pred))

R2 score for linear regression model: 0.9288691234090359


In [31]:
#lr.score() and r2_score() both are similar
lr.score(X_test,y_test)

0.9288691234090359

In [32]:
from sklearn.linear_model import Lasso, LassoCV,Ridge, RidgeCV  

In [33]:
LassCV = LassoCV()
LassCV

In [34]:
LassCV.fit(X_train, y_train)

In [35]:
alpha = LassCV.alpha_
alpha

0.016898424925371153

In [36]:
lasso = Lasso(alpha)

In [37]:
lasso.fit(X_train, y_train)


In [38]:
print('training lasso score:',lasso.score(X_train, y_train))
print('test lasso score:',lasso.score(X_test, y_test))

training lasso score: 0.9286226506436496
test lasso score: 0.9271819628605985


In [39]:
lr.score(X_test, y_test),lasso.score(X_test, y_test)

(0.9288691234090359, 0.9271819628605985)

Test score for linear regression and Lasso are closer to each other. So It can be said that Linear regression model is not overfitting.

In [40]:
ridgecv = RidgeCV(alphas = np.arange(0.01,0.11,0.01))
ridgecv

In [41]:
ridgecv.fit(X_train, y_train)

In [42]:
print('training ridge score:',ridgecv.score(X_train, y_train))
print('test ridge score:',ridgecv.score(X_test, y_test))

training ridge score: 0.929633600444302
test ridge score: 0.9288704010817636


In [43]:
lr.score(X_test, y_test),ridgecv.score(X_test, y_test)

(0.9288691234090359, 0.9288704010817636)

Test score for linear regression and ridge are closer to each other. So It can be said that Linear regression model is not overfitting. 

In [44]:
rfr = RandomForestRegressor()

In [45]:
rfr.fit(X_train, y_train)


In [46]:
y_rfr_pred = rfr.predict(X_test)
y_rfr_pred

array([2.96235, 3.70571, 2.07628, 3.86201, 3.42847, 2.21371, 3.21151,
       2.80869, 3.38305, 3.74307, 2.83949, 3.36554, 3.50764, 2.62112,
       3.84156, 2.13268, 2.87566, 3.85822, 3.66598, 2.49409, 2.72729,
       2.88744, 3.02217, 2.62823, 2.26639, 2.67786, 3.63222, 3.31961,
       2.33487, 3.15295, 3.40686, 2.69337, 2.07595, 2.07286, 3.74604,
       2.49916, 2.90248, 2.46744, 3.4368 , 3.01825, 2.78137, 2.78465,
       3.15939, 2.95216, 2.52832, 2.22933, 3.75726, 3.06437, 2.51808,
       3.20534, 2.24129, 2.96274, 2.36289, 3.80262, 3.0402 , 3.40952,
       3.36386, 3.17157, 3.60966, 2.62505, 3.73493, 3.35142, 3.8057 ,
       2.04759, 3.2172 , 2.47002, 3.13384, 2.4324 , 2.99423, 3.17425,
       3.1796 , 3.15402, 3.55056, 2.78051, 3.16483, 2.50482, 2.25575,
       1.57571, 2.90619, 2.03335, 3.27122, 2.56376, 2.67826, 2.79799,
       2.67842, 3.2696 , 3.68265, 2.52821, 3.22256, 2.88985, 3.63757,
       3.86937, 2.804  , 3.78679, 3.50208, 3.8157 , 2.4571 , 2.71958,
       2.1158 , 2.71

In [47]:
print('Train score for Random forest:',rfr.score(X_train, y_train))
print('Test score for Random forest:',rfr.score(X_test, y_test))

Train score for Random forest: 0.9851545868594451
Test score for Random forest: 0.9447419293577618


In [48]:
r2_score(y_test,y_rfr_pred)

0.9447419293577618

In [49]:
print('Mean squared error for Random forest regressor:', mean_squared_error(y_test,y_rfr_pred))
print('Mean squared error for Linear regressoin model:', mean_squared_error(y_test,y_lr_pred))

Mean squared error for Random forest regressor: 0.02069247381913044
Mean squared error for Linear regressoin model: 0.026636358897142737
