In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# loading data

df = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset4/main/Grades.csv')

In [3]:
df

Unnamed: 0,Seat No.,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
0,CS-97001,B-,D+,C-,C,C-,D+,D,C-,B-,...,C-,C-,C-,C-,A-,A,C-,B,A-,2.205
1,CS-97002,A,D,D+,D,B-,C,D,A,D+,...,D+,D,C,D,A-,B-,C,C,B,2.008
2,CS-97003,A,B,A,B-,B+,A,B-,B+,A-,...,B,B,A,C,A,A,A,A-,A,3.608
3,CS-97004,D,C+,D+,D,D,A-,D+,C-,D,...,D+,C,D+,C-,B-,B,C+,C+,C+,1.906
4,CS-97005,A-,A-,A-,B+,A,A,A-,B+,A,...,B-,B+,B+,B-,A-,A,A-,A-,A,3.448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,CS-97567,B,A,A,A-,A+,A,A-,A-,A+,...,A-,A-,A,A,A,B+,B+,B,A,3.798
567,CS-97568,A+,A,A,A,A,A,A,A-,A,...,B+,B+,A,A,A-,B,A-,C,A-,3.772
568,CS-97569,B,A,A-,B+,A,A,A,A,A,...,A-,B,A,B+,A,C,B+,A-,A-,3.470
569,CS-97570,A,B+,D,A,D,D+,B-,C-,B-,...,D,B,B,C-,D,C,B,B-,C,2.193


the dataset contains 571 rows and 43 columns 

In [4]:
# summary of problem statement

This is supervised machine learning problem based on regression algorithms

In [5]:
# info on data

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 43 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seat No.   571 non-null    object 
 1   PH-121     571 non-null    object 
 2   HS-101     571 non-null    object 
 3   CY-105     570 non-null    object 
 4   HS-105/12  570 non-null    object 
 5   MT-111     569 non-null    object 
 6   CS-105     571 non-null    object 
 7   CS-106     569 non-null    object 
 8   EL-102     569 non-null    object 
 9   EE-119     569 non-null    object 
 10  ME-107     569 non-null    object 
 11  CS-107     569 non-null    object 
 12  HS-205/20  566 non-null    object 
 13  MT-222     566 non-null    object 
 14  EE-222     564 non-null    object 
 15  MT-224     564 non-null    object 
 16  CS-210     564 non-null    object 
 17  CS-211     566 non-null    object 
 18  CS-203     566 non-null    object 
 19  CS-214     565 non-null    object 
 20  EE-217    

- we have two data types: float and object
- there are missing values

In [6]:
# cross-checking CGPA type

df['CGPA'].dtype

dtype('float64')

In [7]:
# cross-checking CGPA type
df['CGPA'].nunique()

491

In [8]:
# cross-checking CGPA type
df['CGPA'].value_counts()
    

3.019    5
3.058    3
2.793    3
3.443    3
2.206    3
        ..
2.555    1
2.042    1
2.634    1
2.053    1
1.753    1
Name: CGPA, Length: 491, dtype: int64

we confirm CGPA is float data, as the number of unique values is close to the number of observations

In [9]:
# checking for duplicate

df.loc[df.duplicated(),]

Unnamed: 0,Seat No.,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA


there is no duplicate

In [10]:
# checking missing values 

df.isna().sum()

Seat No.      0
PH-121        0
HS-101        0
CY-105        1
HS-105/12     1
MT-111        2
CS-105        0
CS-106        2
EL-102        2
EE-119        2
ME-107        2
CS-107        2
HS-205/20     5
MT-222        5
EE-222        7
MT-224        7
CS-210        7
CS-211        5
CS-203        5
CS-214        6
EE-217        6
CS-212        6
CS-215        6
MT-331        9
EF-303       10
HS-304       10
CS-301       10
CS-302       10
TC-383       10
MT-442       10
EL-332        9
CS-318        9
CS-306        9
CS-312       10
CS-317       12
CS-403       12
CS-421       12
CS-406       85
CS-414       13
CS-419       13
CS-423       14
CS-412       79
CGPA          0
dtype: int64

- CS-406 alone has 85 NaN, which is already more than 10% of the data. This means that removing NaN will cause a serious problem of data lost
- we will replace the NaN by mode since they are all categorical columns

In [11]:
# converting df to series and filling NaN
df.fillna(df.mode().iloc[0], inplace = True)

In [12]:
df.isna().sum()

Seat No.     0
PH-121       0
HS-101       0
CY-105       0
HS-105/12    0
MT-111       0
CS-105       0
CS-106       0
EL-102       0
EE-119       0
ME-107       0
CS-107       0
HS-205/20    0
MT-222       0
EE-222       0
MT-224       0
CS-210       0
CS-211       0
CS-203       0
CS-214       0
EE-217       0
CS-212       0
CS-215       0
MT-331       0
EF-303       0
HS-304       0
CS-301       0
CS-302       0
TC-383       0
MT-442       0
EL-332       0
CS-318       0
CS-306       0
CS-312       0
CS-317       0
CS-403       0
CS-421       0
CS-406       0
CS-414       0
CS-419       0
CS-423       0
CS-412       0
CGPA         0
dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 43 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seat No.   571 non-null    object 
 1   PH-121     571 non-null    object 
 2   HS-101     571 non-null    object 
 3   CY-105     571 non-null    object 
 4   HS-105/12  571 non-null    object 
 5   MT-111     571 non-null    object 
 6   CS-105     571 non-null    object 
 7   CS-106     571 non-null    object 
 8   EL-102     571 non-null    object 
 9   EE-119     571 non-null    object 
 10  ME-107     571 non-null    object 
 11  CS-107     571 non-null    object 
 12  HS-205/20  571 non-null    object 
 13  MT-222     571 non-null    object 
 14  EE-222     571 non-null    object 
 15  MT-224     571 non-null    object 
 16  CS-210     571 non-null    object 
 17  CS-211     571 non-null    object 
 18  CS-203     571 non-null    object 
 19  CS-214     571 non-null    object 
 20  EE-217    

In [14]:
df.head()

Unnamed: 0,Seat No.,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
0,CS-97001,B-,D+,C-,C,C-,D+,D,C-,B-,...,C-,C-,C-,C-,A-,A,C-,B,A-,2.205
1,CS-97002,A,D,D+,D,B-,C,D,A,D+,...,D+,D,C,D,A-,B-,C,C,B,2.008
2,CS-97003,A,B,A,B-,B+,A,B-,B+,A-,...,B,B,A,C,A,A,A,A-,A,3.608
3,CS-97004,D,C+,D+,D,D,A-,D+,C-,D,...,D+,C,D+,C-,B-,B,C+,C+,C+,1.906
4,CS-97005,A-,A-,A-,B+,A,A,A-,B+,A,...,B-,B+,B+,B-,A-,A,A-,A-,A,3.448


In [15]:
df.tail()

Unnamed: 0,Seat No.,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
566,CS-97567,B,A,A,A-,A+,A,A-,A-,A+,...,A-,A-,A,A,A,B+,B+,B,A,3.798
567,CS-97568,A+,A,A,A,A,A,A,A-,A,...,B+,B+,A,A,A-,B,A-,C,A-,3.772
568,CS-97569,B,A,A-,B+,A,A,A,A,A,...,A-,B,A,B+,A,C,B+,A-,A-,3.47
569,CS-97570,A,B+,D,A,D,D+,B-,C-,B-,...,D,B,B,C-,D,C,B,B-,C,2.193
570,CS-97571,C,D,D,C,C,D+,B,C+,C,...,C+,C,B-,D,F,C-,B+,D,C-,1.753


In [16]:
# columns with highest unique value in course code columns
df.nunique().nlargest(13)

Seat No.    571
CGPA        491
CS-107       14
MT-222       14
MT-224       14
CS-210       14
CS-211       14
CS-215       14
HS-304       14
CS-318       14
CS-312       14
CS-406       14
PH-121       13
dtype: int64

In [17]:
# listing the grade codes 

for i in df.columns:
    print(df[i].value_counts())


CS-97001    1
CS-97384    1
CS-97378    1
CS-97379    1
CS-97380    1
           ..
CS-97185    1
CS-97184    1
CS-97183    1
CS-97182    1
CS-97571    1
Name: Seat No., Length: 571, dtype: int64
A-    112
A     111
B+     61
B      57
B-     56
D      44
C      33
C+     31
D+     22
A+     22
C-     19
WU      2
F       1
Name: PH-121, dtype: int64
A-    82
B-    78
C     68
B     63
B+    59
C-    50
C+    47
D     45
A     38
D+    36
A+     4
F      1
Name: HS-101, dtype: int64
A     178
A-    120
B+     50
B      49
B-     42
D      31
A+     31
C      19
C+     17
C-     16
D+     14
WU      3
F       1
Name: CY-105, dtype: int64
A     97
A-    75
B+    70
B     57
D     45
C     41
B-    40
C+    39
C-    36
D+    34
A+    34
WU     2
F      1
Name: HS-105/12, dtype: int64
A-    107
A     100
B-     70
B+     62
B      55
C-     39
C+     33
C      30
D      26
A+     23
D+     21
WU      3
F       2
Name: MT-111, dtype: int64
A     151
A-    134
B+     60
B      51
A+     43
B

- grade = [A-, A, A+, B-, B, B+, C-, C, C+, D, D+, F, I, W, WU]


In [18]:
# setting Seat No as index

df.set_index("Seat No.", inplace = True)

In [19]:
df

Unnamed: 0_level_0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
Seat No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS-97001,B-,D+,C-,C,C-,D+,D,C-,B-,C-,...,C-,C-,C-,C-,A-,A,C-,B,A-,2.205
CS-97002,A,D,D+,D,B-,C,D,A,D+,D,...,D+,D,C,D,A-,B-,C,C,B,2.008
CS-97003,A,B,A,B-,B+,A,B-,B+,A-,A-,...,B,B,A,C,A,A,A,A-,A,3.608
CS-97004,D,C+,D+,D,D,A-,D+,C-,D,C+,...,D+,C,D+,C-,B-,B,C+,C+,C+,1.906
CS-97005,A-,A-,A-,B+,A,A,A-,B+,A,A-,...,B-,B+,B+,B-,A-,A,A-,A-,A,3.448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS-97567,B,A,A,A-,A+,A,A-,A-,A+,B+,...,A-,A-,A,A,A,B+,B+,B,A,3.798
CS-97568,A+,A,A,A,A,A,A,A-,A,A,...,B+,B+,A,A,A-,B,A-,C,A-,3.772
CS-97569,B,A,A-,B+,A,A,A,A,A,B,...,A-,B,A,B+,A,C,B+,A-,A-,3.470
CS-97570,A,B+,D,A,D,D+,B-,C-,B-,C-,...,D,B,B,C-,D,C,B,B-,C,2.193


In [20]:
# encoding the features

from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
for i in df.columns:
    if df[i].dtypes == 'object':
        df[i]= enc.fit_transform(df[i].values.reshape(-1,1))

In [21]:
df

Unnamed: 0_level_0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
Seat No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS-97001,5.0,10.0,8.0,6.0,8.0,10.0,9.0,8.0,5.0,8.0,...,8.0,8.0,8.0,8.0,2.0,0.0,8.0,3.0,2.0,2.205
CS-97002,0.0,9.0,10.0,9.0,5.0,6.0,9.0,0.0,10.0,9.0,...,10.0,9.0,6.0,9.0,2.0,5.0,6.0,6.0,3.0,2.008
CS-97003,0.0,3.0,0.0,5.0,4.0,0.0,5.0,4.0,2.0,2.0,...,3.0,3.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,3.608
CS-97004,9.0,7.0,10.0,9.0,9.0,2.0,10.0,8.0,9.0,7.0,...,10.0,6.0,10.0,8.0,5.0,3.0,7.0,7.0,7.0,1.906
CS-97005,2.0,2.0,2.0,4.0,0.0,0.0,2.0,4.0,0.0,2.0,...,5.0,4.0,4.0,5.0,2.0,0.0,2.0,2.0,0.0,3.448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS-97567,3.0,0.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,4.0,...,2.0,2.0,0.0,0.0,0.0,4.0,4.0,3.0,0.0,3.798
CS-97568,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,4.0,4.0,0.0,0.0,2.0,3.0,2.0,6.0,2.0,3.772
CS-97569,3.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,...,2.0,3.0,0.0,4.0,0.0,6.0,4.0,2.0,2.0,3.470
CS-97570,0.0,4.0,9.0,0.0,9.0,10.0,5.0,8.0,5.0,8.0,...,9.0,3.0,3.0,8.0,9.0,6.0,3.0,5.0,6.0,2.193


In [22]:
df.corr()

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412,CGPA
PH-121,1.0,0.382436,0.557843,0.498561,0.468968,0.60999,0.293963,0.423942,0.579964,0.533558,...,0.038184,0.464022,0.321264,0.290351,0.280166,0.152437,0.46277,0.094921,0.293815,-0.663135
HS-101,0.382436,1.0,0.335074,0.473132,0.432742,0.400713,0.312019,0.349412,0.309697,0.336545,...,0.21338,0.36425,0.229546,0.301934,0.265551,0.225977,0.284668,0.228103,0.226039,-0.53717
CY-105,0.557843,0.335074,1.0,0.510123,0.53838,0.578573,0.378223,0.447539,0.46725,0.464418,...,0.261555,0.472137,0.464575,0.349827,0.129957,0.008774,0.430053,0.120885,0.064912,-0.658282
HS-105/12,0.498561,0.473132,0.510123,1.0,0.494731,0.482406,0.499093,0.360304,0.400303,0.320137,...,0.263143,0.478816,0.451907,0.415946,0.16815,0.142806,0.465352,0.235029,0.153293,-0.613989
MT-111,0.468968,0.432742,0.53838,0.494731,1.0,0.393684,0.500729,0.555895,0.409161,0.471685,...,0.467946,0.555098,0.41054,0.511463,0.300391,0.228637,0.502473,0.400679,0.269792,-0.730095
CS-105,0.60999,0.400713,0.578573,0.482406,0.393684,1.0,0.286324,0.353949,0.507357,0.45547,...,0.059711,0.387639,0.279329,0.236345,0.237615,0.121965,0.306827,0.045134,0.179777,-0.613169
CS-106,0.293963,0.312019,0.378223,0.499093,0.500729,0.286324,1.0,0.338645,0.269895,0.281233,...,0.450752,0.416562,0.426402,0.397195,0.129554,0.108916,0.421989,0.339963,0.185782,-0.534995
EL-102,0.423942,0.349412,0.447539,0.360304,0.555895,0.353949,0.338645,1.0,0.448908,0.46169,...,0.301324,0.466556,0.30424,0.368959,0.253133,0.242234,0.379308,0.369173,0.265289,-0.642262
EE-119,0.579964,0.309697,0.46725,0.400303,0.409161,0.507357,0.269895,0.448908,1.0,0.572964,...,0.156809,0.445496,0.341988,0.35923,0.297664,0.214165,0.465913,0.244615,0.320746,-0.646829
ME-107,0.533558,0.336545,0.464418,0.320137,0.471685,0.45547,0.281233,0.46169,0.572964,1.0,...,0.199848,0.462568,0.336647,0.345832,0.300913,0.154184,0.440407,0.220977,0.293638,-0.663023


As long as the CGPA is calculated based on all the course marks, we should keep all the features

In [23]:
# Separating features and target

x = df.drop('CGPA', axis = 1)

y = df['CGPA']


In [24]:
x

Unnamed: 0_level_0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
Seat No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS-97001,5.0,10.0,8.0,6.0,8.0,10.0,9.0,8.0,5.0,8.0,...,6.0,8.0,8.0,8.0,8.0,2.0,0.0,8.0,3.0,2.0
CS-97002,0.0,9.0,10.0,9.0,5.0,6.0,9.0,0.0,10.0,9.0,...,9.0,10.0,9.0,6.0,9.0,2.0,5.0,6.0,6.0,3.0
CS-97003,0.0,3.0,0.0,5.0,4.0,0.0,5.0,4.0,2.0,2.0,...,2.0,3.0,3.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0
CS-97004,9.0,7.0,10.0,9.0,9.0,2.0,10.0,8.0,9.0,7.0,...,8.0,10.0,6.0,10.0,8.0,5.0,3.0,7.0,7.0,7.0
CS-97005,2.0,2.0,2.0,4.0,0.0,0.0,2.0,4.0,0.0,2.0,...,2.0,5.0,4.0,4.0,5.0,2.0,0.0,2.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS-97567,3.0,0.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,4.0,...,4.0,2.0,2.0,0.0,0.0,0.0,4.0,4.0,3.0,0.0
CS-97568,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,2.0,4.0,4.0,0.0,0.0,2.0,3.0,2.0,6.0,2.0
CS-97569,3.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,2.0,3.0,0.0,4.0,0.0,6.0,4.0,2.0,2.0
CS-97570,0.0,4.0,9.0,0.0,9.0,10.0,5.0,8.0,5.0,8.0,...,4.0,9.0,3.0,3.0,8.0,9.0,6.0,3.0,5.0,6.0


In [25]:
# standardise x
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns)
x

Unnamed: 0,PH-121,HS-101,CY-105,HS-105/12,MT-111,CS-105,CS-106,EL-102,EE-119,ME-107,...,CS-306,CS-312,CS-317,CS-403,CS-421,CS-406,CS-414,CS-419,CS-423,CS-412
0,0.400402,1.770900,1.722261,0.549869,1.374222,2.657832,1.790004,1.333963,0.419492,1.024565,...,0.663660,1.169315,1.155824,1.533685,0.958939,-0.399094,-0.960095,1.740513,-0.346121,-0.450178
1,-1.242052,1.411559,2.397448,1.488043,0.369509,1.173242,1.790004,-1.307365,2.302589,1.342687,...,1.715324,1.764660,1.521056,0.862110,1.325686,-0.399094,1.042149,0.900418,0.727983,-0.021043
2,-1.242052,-0.744483,-0.978490,0.237145,0.034605,-1.053643,0.322008,0.013299,-0.710366,-0.884168,...,-0.738559,-0.319046,-0.670339,-1.152616,0.225444,-1.210063,-0.960095,-1.619869,-0.704156,-1.308446
3,1.714365,0.692879,2.397448,1.488043,1.709126,-0.311348,2.157003,1.333963,1.925970,0.706443,...,1.364769,1.764660,0.425358,2.205260,0.958939,0.817361,0.241251,1.320466,1.086017,1.695494
4,-0.585070,-1.103823,-0.303302,-0.075580,-1.305012,-1.053643,-0.778989,0.013299,-1.463605,-0.884168,...,-0.738559,0.276298,-0.305107,0.190534,-0.141304,-0.399094,-0.960095,-0.779774,-0.704156,-1.308446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,-0.256579,-1.822504,-0.978490,-0.701029,-0.970108,-1.053643,-0.778989,-0.647033,-1.086986,-0.247924,...,-0.037450,-0.616719,-1.035572,-1.152616,-1.975041,-1.210063,0.641700,0.060322,-0.346121,-1.308446
567,-0.913561,-1.822504,-0.978490,-1.326478,-1.305012,-1.053643,-1.512987,-0.647033,-1.463605,-1.520412,...,-0.738559,-0.021374,-0.305107,-1.152616,-1.975041,-0.399094,0.241251,-0.779774,0.727983,-0.450178
568,-0.256579,-1.822504,-0.303302,-0.075580,-1.305012,-1.053643,-1.512987,-1.307365,-1.463605,-0.566046,...,-0.037450,-0.616719,-0.670339,-1.152616,-0.508051,-1.210063,1.442597,0.060322,-0.704156,-0.450178
569,-1.242052,-0.385142,2.059854,-1.326478,1.709126,2.657832,0.322008,1.333963,0.419492,1.024565,...,-0.037450,1.466987,-0.670339,-0.145253,0.958939,2.439300,1.442597,-0.359726,0.369948,1.266360


In [26]:
# finding variance inflation in each column
from statsmodels.stats.outliers_influence import variance_inflation_factor

z = []

for i in range(len(x.columns)):
    z.append(variance_inflation_factor(x.values, i))
Vif = pd.DataFrame(z)
Vif['Features'] = x.columns

Vif.rename({Vif.columns[0]: 'VIF Values'}, axis = 'columns', inplace = True)


In [27]:
Vif

Unnamed: 0,VIF Values,Features
0,2.827608,PH-121
1,1.771294,HS-101
2,2.514966,CY-105
3,2.293035,HS-105/12
4,2.638921,MT-111
5,2.366597,CS-105
6,1.927701,CS-106
7,1.968027,EL-102
8,2.190768,EE-119
9,2.294552,ME-107


# Modelling 

In [28]:
# import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn .metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import cross_val_score

In [29]:
# finding the best random state. Here, I tried my own code but was just getting error. this is why I copy-pasted.
# I will keep trying and will surely come through with my own code

maxAccu = 0
maxRs = 0
for i in range(1, 200):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = i)
    LR = LinearRegression()
    LR.fit(x_train,y_train)
    pred = LR.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc>maxAccu:
        maxAccu = acc
        maxRs = i
print('Max r2_score is: ', maxAccu, 'at Random_state: ', maxRs)


Max r2_score is:  0.9687445129503746 at Random_state:  72


In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 72)

In [31]:
# training the data on each algorithm and CV

L = LinearRegression()
R = RandomForestRegressor()
G = GradientBoostingRegressor()
D = DecisionTreeRegressor()
K = KNN()
l = Lasso()


Mod = [L,R,G,D,K,l]

for model in Mod:
    model.fit(x_train,y_train)
    pred_model = model.predict(x_test)
    score = cross_val_score(model,x,y, cv = 7, scoring = 'r2')
    print('R2_score: ', r2_score(y_test, pred_model))
    print('Mean absolute error: ', mean_absolute_error(y_test, pred_model))
    print('Mean squared error: ', mean_squared_error(y_test, pred_model))
    print('Root mean squared error: ', np.sqrt(mean_squared_error(y_test, pred_model)))
    print(score)
    print(score.mean())
    print('The difference between R2 score and cross validation score for ', model, 'is: ', (r2_score(y_test, pred_model) - score.mean()))
    print('\n')

R2_score:  0.9687445129503746
Mean absolute error:  0.0844631873050909
Mean squared error:  0.012406125791165888
Root mean squared error:  0.11138278947470245
[0.83889517 0.93661203 0.95420812 0.89623244 0.9221749  0.94676189
 0.8176672 ]
0.901793106938204
The difference between R2 score and cross validation score for  LinearRegression() is:  0.06695140601217053


R2_score:  0.9398958573555253
Mean absolute error:  0.10988167832167818
Mean squared error:  0.02385691680419573
Root mean squared error:  0.1544568444718321
[0.83568593 0.94500228 0.93167827 0.9465598  0.91042749 0.92794368
 0.83214946]
0.9042067001089552
The difference between R2 score and cross validation score for  RandomForestRegressor() is:  0.03568915724657007


R2_score:  0.9686244152549934
Mean absolute error:  0.08249529081650091
Mean squared error:  0.012453795728727925
Root mean squared error:  0.111596575793023
[0.86687075 0.96103427 0.96244554 0.95622914 0.91329991 0.95743911
 0.83129766]
0.9212309113312406
The 

Our best model is RandomForestRegressor(): it has the least difference. the accuracy is the closest to cross-validation score

In [32]:
# hypertuning

from sklearn.model_selection import GridSearchCV

param = { 'min_impurity_decrease':[0,0.1,0.3],
         'n_estimators':[50,100],
        'random_state':[2,5,10],
         'criterion':['friedman_mse', 'squared_error'],
         'n_jobs':[-1,0,1]
        }

GV = GridSearchCV(RandomForestRegressor(), param, cv=5)
GV.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['friedman_mse', 'squared_error'],
                         'min_impurity_decrease': [0, 0.1, 0.3],
                         'n_estimators': [50, 100], 'n_jobs': [-1, 0, 1],
                         'random_state': [2, 5, 10]})

In [33]:
GV.best_params_

{'criterion': 'friedman_mse',
 'min_impurity_decrease': 0,
 'n_estimators': 50,
 'n_jobs': -1,
 'random_state': 2}

In [34]:
my_model = RandomForestRegressor(criterion = 'friedman_mse', min_impurity_decrease = 0, n_estimators = 50, n_jobs = -1, 
random_state= 2)
my_model.fit(x_train, y_train)
pred = my_model.predict(x_test)
print('R2_score: ', r2_score(y_test, pred)*100)
print('Mean absolute error: ', mean_absolute_error(y_test, pred))
print('Mean squared error: ', mean_squared_error(y_test, pred))
print('Root mean squared error: ', np.sqrt(mean_squared_error(y_test, pred)))

R2_score:  94.15798163804138
Mean absolute error:  0.10935874125874136
Mean squared error:  0.02318850912727277
Root mean squared error:  0.1522777368076922


# Saving the model

In [35]:
import pickle

pickle.dump(my_model, open('CGPA_Romuald.pkl', 'wb'))

In [36]:
saved_model = pickle.load(open('CGPA_Romuald.pkl', 'rb'))

In [37]:
saved_model.predict(x_test)

array([2.14064, 3.4556 , 3.15962, 2.44114, 2.99068, 3.72322, 3.35618,
       3.87116, 3.43802, 3.84648, 3.90724, 3.62628, 3.68872, 2.11614,
       2.55394, 2.20982, 2.38646, 3.55722, 3.76538, 3.01162, 3.6653 ,
       2.4997 , 2.23894, 2.21344, 3.36968, 3.37012, 3.72298, 1.94028,
       2.64926, 2.56112, 2.36778, 2.30808, 2.05894, 2.7059 , 2.69804,
       2.29812, 3.72988, 2.59184, 2.66814, 2.4467 , 2.1477 , 3.43812,
       3.62336, 3.41482, 3.0547 , 3.34152, 3.89004, 3.09456, 3.39826,
       2.40062, 3.59186, 3.28724, 2.46636, 3.24654, 2.9537 , 2.50584,
       2.70968, 3.53956, 3.82614, 2.52624, 3.14358, 2.24948, 3.61314,
       3.8177 , 3.7507 , 3.28192, 2.57936, 3.26466, 2.9671 , 2.37944,
       2.955  , 3.59354, 1.9387 , 2.5981 , 3.77512, 3.48778, 2.2577 ,
       2.8681 , 2.75348, 2.6599 , 3.7452 , 2.93718, 2.61346, 2.64096,
       2.51546, 2.2065 , 2.97668, 2.41746, 3.90068, 2.55984, 2.70366,
       2.94194, 3.1805 , 2.8252 , 3.51862, 2.85706, 2.19518, 2.65876,
       3.35584, 3.94

In [38]:
y_test

Seat No.
CS-97513    2.288
CS-97532    3.566
CS-97488    3.114
CS-97345    2.384
CS-97184    2.311
            ...  
CS-97541    3.765
CS-97137    1.714
CS-97178    3.369
CS-97463    3.339
CS-97470    3.714
Name: CGPA, Length: 143, dtype: float64