# preprocessing data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = 'grades.csv'
df = pd.read_csv(data, header=None)

In [4]:
df.shape

(73, 9)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,PUPIL_SEX,PUPIL_CLASS,TEACHER_RIGHT,TEACHER_CHK,TEACHER_QUEST,TEACHER_CORR,PUPIL_CORR,PUPIL_STRIP,GRADE
1,F,8A,65,0,4,2,1,6,4-
2,F,8A,70,4,0,4,0,4,3
3,F,8A,85,0,0,4,3,5,4
4,M,8A,55,0,0,1,8,3,3


In [6]:
col_names = [df[i][0] for i in range(df.shape[1])]
col_names

['PUPIL_SEX',
 'PUPIL_CLASS',
 'TEACHER_RIGHT',
 'TEACHER_CHK',
 'TEACHER_QUEST',
 'TEACHER_CORR',
 'PUPIL_CORR',
 'PUPIL_STRIP',
 'GRADE']

In [7]:
df = df[1:]
df.columns = col_names
df

Unnamed: 0,PUPIL_SEX,PUPIL_CLASS,TEACHER_RIGHT,TEACHER_CHK,TEACHER_QUEST,TEACHER_CORR,PUPIL_CORR,PUPIL_STRIP,GRADE
1,F,8A,65,0,4,2,1,6,4-
2,F,8A,70,4,0,4,0,4,3
3,F,8A,85,0,0,4,3,5,4
4,M,8A,55,0,0,1,8,3,3
5,M,8A,40,1,2,0,3,4,2
...,...,...,...,...,...,...,...,...,...
68,F,8A,80,1,0,2,1,7,4-
69,F,8A,85,0,0,1,2,2,4
70,F,8A,90,0,0,0,1,1,4
71,F,8A,95,0,0,3,1,2,5-


#data analysis

In [8]:
for col in col_names:
    print(df[col].value_counts())

F    58
M    14
Name: PUPIL_SEX, dtype: int64
8A    54
8B    18
Name: PUPIL_CLASS, dtype: int64
100    17
60      7
80      6
40      6
0       5
50      4
30      4
90      4
85      3
70      3
20      2
65      2
55      2
95      2
87      1
75      1
86      1
67      1
15      1
Name: TEACHER_RIGHT, dtype: int64
0    56
1    13
4     1
3     1
2     1
Name: TEACHER_CHK, dtype: int64
0    51
1    11
2     6
3     3
4     1
Name: TEACHER_QUEST, dtype: int64
0    23
1    18
2    16
4     7
3     4
6     2
5     2
Name: TEACHER_CORR, dtype: int64
0     30
1     18
3      7
2      7
4      3
5      3
8      2
10     1
9      1
Name: PUPIL_CORR, dtype: int64
0     18
1     14
2     10
4      8
5      8
3      4
6      2
12     2
7      2
8      1
9      1
10     1
11     1
Name: PUPIL_STRIP, dtype: int64
4     20
3     13
2     11
5     11
4-     8
5-     7
3-     2
Name: GRADE, dtype: int64


In [9]:
df.isnull().sum()

PUPIL_SEX        0
PUPIL_CLASS      0
TEACHER_RIGHT    0
TEACHER_CHK      0
TEACHER_QUEST    0
TEACHER_CORR     0
PUPIL_CORR       0
PUPIL_STRIP      0
GRADE            0
dtype: int64

In [10]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.4.0-py2.py3-none-any.whl (86 kB)
[?25l[K     |███▉                            | 10 kB 20.2 MB/s eta 0:00:01[K     |███████▋                        | 20 kB 13.1 MB/s eta 0:00:01[K     |███████████▍                    | 30 kB 10.1 MB/s eta 0:00:01[K     |███████████████▏                | 40 kB 9.1 MB/s eta 0:00:01[K     |███████████████████             | 51 kB 4.4 MB/s eta 0:00:01[K     |██████████████████████▊         | 61 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████▌     | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 81 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████████| 86 kB 3.1 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.0


In [11]:
from sklearn.model_selection import train_test_split
import category_encoders as ce

# splitting the dataset function
def make_data(df, test_size_param):
  X = df.drop(['GRADE'], axis=1)
  y = df['GRADE']

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size_param, random_state = 42)
  encoder = ce.OrdinalEncoder(cols=['PUPIL_SEX','PUPIL_CLASS','TEACHER_RIGHT','TEACHER_CHK','TEACHER_QUEST',
                                  'TEACHER_CORR','PUPIL_CORR','PUPIL_STRIP'])
  X_train = encoder.fit_transform(X_train)
  X_test = encoder.transform(X_test)

  return X_train, X_test, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = make_data(df, test_size_param=0.15)
X_train

Unnamed: 0,PUPIL_SEX,PUPIL_CLASS,TEACHER_RIGHT,TEACHER_CHK,TEACHER_QUEST,TEACHER_CORR,PUPIL_CORR,PUPIL_STRIP
32,1,1,1,1,1,1,1,1
10,1,1,1,2,1,2,2,2
46,2,2,2,1,2,1,1,1
6,2,1,3,1,3,3,3,3
23,1,1,1,1,1,4,1,2
...,...,...,...,...,...,...,...,...
24,1,1,6,1,1,1,5,5
21,1,1,13,1,3,1,1,2
61,1,1,1,1,1,4,4,4
15,1,1,19,2,1,2,4,1


#modeling tree

In [20]:
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn import tree

In [31]:
def boostingClassification(df, n_estimators_=100, learning_rate_=1.0, max_depth_=1, random_state_=0, test_size_param=0.15):
  X_train, X_test, y_train, y_test = make_data(df, test_size_param)
  clf_en = GradientBoostingClassifier(n_estimators=n_estimators_, learning_rate=learning_rate_, max_depth=max_depth_, random_state=random_state_)
  clf_en.fit(X_train, y_train)
  y_pred_en = clf_en.predict(X_test)

  print(classification_report(y_test, clf_en.predict(X_test)))
  return clf_en

In [32]:
def baggingClassification(df, max_samples_=0.5, max_features_=0.5, test_size_param=0.15):
  X_train, X_test, y_train, y_test = make_data(df, test_size_param)
  clf_en = BaggingClassifier(KNeighborsClassifier(), max_samples=max_samples_, max_features=max_features_)
  clf_en.fit(X_train, y_train)
  y_pred_en = clf_en.predict(X_test)

  print(classification_report(y_test, clf_en.predict(X_test)))
  return clf_en

In [33]:
def randomForestsClassification(df, n_estimators_=100, test_size_param=0.15):
  X_train, X_test, y_train, y_test = make_data(df, test_size_param)
  clf_en = RandomForestClassifier(n_estimators=n_estimators_)
  clf_en.fit(X_train, y_train)
  y_pred_en = clf_en.predict(X_test)

  print(classification_report(y_test, clf_en.predict(X_test)))
  return clf_en

In [56]:
# creating report
def make_classification(df, clf, params={}, random_state=0, test_size_param=0.15):
  if clf == 'boosting':
    boostingClassification(df, params['n_estimators'], params['learning_rate'], params['max_depth'], random_state, test_size_param)
  elif clf == 'bagging':
    baggingClassification(df, params['max_samples'], params['max_features'], test_size_param)
  elif clf == 'rforests':
    randomForestsClassification(df, params['n_estimators'],test_size_param)

In [34]:
make_classification(df, clf="boosting", params={'n_estimators':100, 'learning_rate':0.1, 'max_depth': 4})

              precision    recall  f1-score   support

           2       1.00      0.67      0.80         3
           3       0.00      0.00      0.00         2
           4       0.20      1.00      0.33         1
          4-       1.00      0.50      0.67         2
           5       1.00      0.50      0.67         2
          5-       1.00      1.00      1.00         1

    accuracy                           0.55        11
   macro avg       0.70      0.61      0.58        11
weighted avg       0.75      0.55      0.58        11



In [67]:
make_classification(df, clf="bagging", params={'max_samples':1.0, 'max_features':1.0})

              precision    recall  f1-score   support

           2       0.67      0.67      0.67         3
           3       0.50      0.50      0.50         2
           4       0.20      1.00      0.33         1
          4-       0.00      0.00      0.00         2
           5       1.00      0.50      0.67         2
          5-       0.00      0.00      0.00         1

    accuracy                           0.45        11
   macro avg       0.39      0.44      0.36        11
weighted avg       0.47      0.45      0.42        11



In [68]:
make_classification(df, clf="rforests", params={'n_estimators':50})

              precision    recall  f1-score   support

           2       1.00      0.67      0.80         3
           3       0.25      0.50      0.33         2
           4       0.00      0.00      0.00         1
          4-       0.50      0.50      0.50         2
           5       0.67      1.00      0.80         2
          5-       0.00      0.00      0.00         1

    accuracy                           0.55        11
   macro avg       0.40      0.44      0.41        11
weighted avg       0.53      0.55      0.52        11



In [69]:
make_classification(df, clf="rforests", params={'n_estimators':60})

              precision    recall  f1-score   support

           2       0.67      0.67      0.67         3
           3       0.33      0.50      0.40         2
           4       1.00      1.00      1.00         1
          4-       1.00      0.50      0.67         2
           5       1.00      0.50      0.67         2
          5-       0.50      1.00      0.67         1

    accuracy                           0.64        11
   macro avg       0.75      0.69      0.68        11
weighted avg       0.74      0.64      0.65        11



In [70]:
make_classification(df, clf="rforests", params={'n_estimators':70})

              precision    recall  f1-score   support

           2       0.67      0.67      0.67         3
           3       0.00      0.00      0.00         2
           4       0.25      1.00      0.40         1
          4-       1.00      0.50      0.67         2
           5       1.00      0.50      0.67         2
          5-       1.00      1.00      1.00         1

    accuracy                           0.55        11
   macro avg       0.65      0.61      0.57        11
weighted avg       0.66      0.55      0.55        11



In [71]:
make_classification(df, clf="rforests", params={'n_estimators':80})

              precision    recall  f1-score   support

           2       1.00      0.67      0.80         3
           3       0.33      0.50      0.40         2
           4       0.33      1.00      0.50         1
          4-       0.00      0.00      0.00         2
           5       1.00      0.50      0.67         2
          5-       1.00      1.00      1.00         1

    accuracy                           0.55        11
   macro avg       0.61      0.61      0.56        11
weighted avg       0.64      0.55      0.55        11



In [72]:
make_classification(df, clf="rforests", params={'n_estimators':90})

              precision    recall  f1-score   support

           2       0.67      0.67      0.67         3
           3       1.00      0.50      0.67         2
           4       0.25      1.00      0.40         1
          4-       1.00      0.50      0.67         2
           5       1.00      0.50      0.67         2
          5-       1.00      1.00      1.00         1

    accuracy                           0.64        11
   macro avg       0.82      0.69      0.68        11
weighted avg       0.84      0.64      0.67        11



In [74]:
make_classification(df, clf="rforests", params={'n_estimators':100})

              precision    recall  f1-score   support

           2       1.00      0.67      0.80         3
           3       0.33      0.50      0.40         2
           4       0.50      1.00      0.67         1
          4-       0.50      0.50      0.50         2
           5       1.00      0.50      0.67         2
          5-       1.00      1.00      1.00         1

    accuracy                           0.64        11
   macro avg       0.72      0.69      0.67        11
weighted avg       0.74      0.64      0.65        11

