## Importing the required libraries 

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### Loading the data set and investigating it .

In [3]:
data=pd.read_csv("My_data.csv")
data.head()

Unnamed: 0,0,Termid,RegdNo,Course,Grade,CA_100,MTT_50,ETT_100,ETP_100,Course_Att,...,CA_3,CA_4,Height,Weight,ScholarType,Direction,Gender,Medium,CourseType,ProgramType
0,392,418192,1198776,TJZ267,O,89.0,,,75.0,,...,1.0,3.0,171,67,Day Scholar,East,Male,Regional,Theory,UG
1,393,418192,1198776,TJZ268,A,87.0,,,56.0,99.0,...,11.0,6.0,171,67,Day Scholar,East,Male,Regional,Theory,UG
2,394,418192,1198776,TJZ269,B+,74.0,,,57.0,100.0,...,3.0,0.0,171,67,Day Scholar,East,Male,Regional,Theory,UG
3,593,218192,1273776,TJZ20,O,90.0,39.0,85.0,,92.0,...,1.0,1.0,165,76,Day Scholar,North,Female,English,Theory,UG
4,594,218192,1273776,TJZ21,B+,77.0,30.0,49.0,,85.0,...,2.0,1.0,165,76,Day Scholar,North,Female,English,Theory,UG


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3917 entries, 0 to 3916
Data columns (total 23 columns):
0              3917 non-null int64
Termid         3917 non-null int64
RegdNo         3917 non-null int64
Course         3917 non-null object
Grade          3917 non-null object
CA_100         3794 non-null float64
MTT_50         2574 non-null float64
ETT_100        2601 non-null float64
ETP_100        1316 non-null float64
Course_Att     3680 non-null float64
MHRDName       3917 non-null object
CA_1           3794 non-null float64
CA_2           3794 non-null float64
CA_3           3794 non-null float64
CA_4           3794 non-null float64
Height         3917 non-null int64
Weight         3917 non-null int64
ScholarType    3917 non-null object
Direction      3917 non-null object
Gender         3917 non-null object
Medium         3917 non-null object
CourseType     3917 non-null object
ProgramType    3917 non-null object
dtypes: float64(9), int64(5), object(9)
memory usage: 704.0+ 

### Finding and deleting duplicate rows 

In [5]:
sum(data.duplicated())

0

#### Since the data has no duplicates we will not drop the duplicates 

### We will see numbers of unique values in each columns and also see Correlation between the columns 

In [6]:
allcolumns=data.columns
for item in allcolumns:
    print(data[item].nunique())


3917
4
545
100
12
95
49
85
86
92
1
93
81
64
62
35
61
2
4
2
3
2
1


In [7]:
correl=data.corr()
print(correl)

                   0    Termid    RegdNo    CA_100    MTT_50   ETT_100  \
0           1.000000 -0.029304  0.999907 -0.038962 -0.024238 -0.047602   
Termid     -0.029304  1.000000 -0.028428  0.055018  0.071740 -0.002854   
RegdNo      0.999907 -0.028428  1.000000 -0.039575 -0.024678 -0.048274   
CA_100     -0.038962  0.055018 -0.039575  1.000000  0.542311  0.572094   
MTT_50     -0.024238  0.071740 -0.024678  0.542311  1.000000  0.566105   
ETT_100    -0.047602 -0.002854 -0.048274  0.572094  0.566105  1.000000   
ETP_100    -0.099069  0.242622 -0.100140  0.658175       NaN       NaN   
Course_Att -0.058904 -0.008296 -0.059951  0.589069  0.460177  0.472513   
CA_1        0.012194  0.021079  0.011808  0.479819  0.261051  0.306289   
CA_2       -0.022118  0.026489 -0.022269  0.328831  0.183610  0.173556   
CA_3       -0.046653  0.036336 -0.046710  0.264453  0.140331  0.147100   
CA_4       -0.025801 -0.009849 -0.025942  0.254163  0.140555  0.118932   
Height     -0.042665  0.037840 -0.0420

### We have rows with null values so we will drop the rows with nan values in them .

In [8]:
data.fillna(data.mean(),axis=0,inplace=True)

In [9]:
x=data.drop(["Grade","RegdNo","Termid","Course","MHRDName","Height","Weight","ScholarType","CA_1","CA_2","CA_3","CA_4","Direction","Gender","Medium","CourseType","ProgramType"],axis=1)
y=data["Grade"]
print(x)
print(y)

          0  CA_100    MTT_50   ETT_100    ETP_100  Course_Att
0       392    89.0  24.22028  47.85198  75.000000   81.747283
1       393    87.0  24.22028  47.85198  56.000000   99.000000
2       394    74.0  24.22028  47.85198  57.000000  100.000000
3       593    90.0  39.00000  85.00000  64.837386   92.000000
4       594    77.0  30.00000  49.00000  64.837386   85.000000
...     ...     ...       ...       ...        ...         ...
3912  65041    71.0  24.22028  47.85198  50.000000   74.000000
3913  65042    80.0  24.22028  47.85198  76.000000   61.000000
3914  65043    52.0  27.00000  58.00000  64.837386   90.000000
3915  65044    65.0  20.00000  45.00000  64.837386   90.000000
3916  65045    41.0  25.00000  36.00000  64.837386   80.000000

[3917 rows x 6 columns]
0        O
1        A
2       B+
3        O
4       B+
        ..
3912    B+
3913    B+
3914    B+
3915     C
3916     C
Name: Grade, Length: 3917, dtype: object


### The grades are categorial and are objects (string) so we need to convert them in labels to do so we will do Data Mapping

In [10]:
mark_mapping={'O':0,'A+':1,'A':2,'B+':3,'B':4,'C':5,'D':6,'E':7,'F':8,'M':9,'I':10,'R':11}
y=y.replace(mark_mapping)

In [11]:
print(y)

0       0
1       2
2       3
3       0
4       3
       ..
3912    3
3913    3
3914    3
3915    5
3916    5
Name: Grade, Length: 3917, dtype: int64


In [12]:
y = y.astype(np.uint)

### We now make run the predictive algorithm to predict the grade and if grade is below 5 the student is FAIL  and if the grade is 5 or above we will get PASS as a result

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=101)
scale=StandardScaler()
x_train=scale.fit_transform(x_train)
print(x_train)

[[-6.76307345e-01  2.52344416e-01  3.15437604e-01  1.24221681e-01
   7.54351514e-03 -8.46514177e-01]
 [-8.76160125e-03 -7.35389326e-01  6.54838727e-01  8.59557970e-01
   7.54351514e-03  1.60138032e-02]
 [ 1.65649391e-01 -5.87229265e-01 -1.38156801e+00 -1.06362925e+00
   7.54351514e-03 -1.27777817e+00]
 ...
 [-4.75879243e-01 -3.05656362e+00 -2.73917250e+00 -2.70399482e+00
   7.54351514e-03 -2.44835186e+00]
 [-3.81616819e-01  5.48664538e-01  9.57542103e-04  2.72036897e-03
   5.86525647e-01  1.12497835e+00]
 [-9.33020826e-01 -9.33623941e-02  9.57542103e-04  2.72036897e-03
   5.86525647e-01 -1.15455988e+00]]


In [14]:
x_test=scale.fit_transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
predictedvalues=model.predict(x_test)
print(accuracy_score(y_test,predictedvalues))

0.41496598639455784


### We have only used logistic regression in this predictor, we can also use other ones for better prediction values like 
#### 1.RandomForestClassifier
#### 2.AdaBoostClassifier
#### 3.GradientBoostingClassifier 

In [16]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
predictedvalues=model.predict(x_test)
print(accuracy_score(y_test,predictedvalues))



0.5790816326530612


In [17]:
model = AdaBoostClassifier()
model.fit(x_train, y_train)
predictedvalues=model.predict(x_test)
print(accuracy_score(y_test,predictedvalues))

0.3129251700680272


In [18]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
predictedvalues=model.predict(x_test)
print(accuracy_score(y_test,predictedvalues))

0.5986394557823129


### Let's check for any value whether it is pass or fail

In [26]:
score_mapping = {mark_mapping[i]:i for i in mark_mapping}
final=score_mapping[model.predict(x.values[0].reshape(-1,6))[0]]

In [24]:
print("Final grade is ")
print(final)

Pass_garde=['O','A','A+','B','B+','C','D']
if(final in Pass_garde):
    print("PASS")
else :
    print("FAIL")

Final grade is 
O
PASS
