In [12]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
from sklearn.metrics import accuracy_score
from sklearn import linear_model
import joblib # for saving algorithm and preprocessing objects

In [13]:
data = pd.read_csv("student-mat.csv") 
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [14]:
x_cols = [c for c in data.columns if c != 'G3']
X = data[x_cols]
y = data['G3']
# show first rows of data

In [15]:
X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,4,3,4,1,1,3,6,5,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,no,5,3,3,1,1,3,4,5,5
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,no,4,3,2,2,3,3,10,7,8
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,3,2,2,1,1,5,2,15,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,4,3,2,1,2,5,4,6,10


In [16]:
y.head()

0     6
1     6
2    10
3    15
4    10
Name: G3, dtype: int64

In [17]:
print(len(y))

395


In [18]:
print(y)

0       6
1       6
2      10
3      15
4      10
       ..
390     9
391    16
392     7
393    10
394     9
Name: G3, Length: 395, dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [20]:
print(data.columns)


Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')


In [21]:
# convert categoricals
encoders = {}
for column in ['school', 'sex', 'address',
                'famsize', 'Pstatus', 'Mjob',
                'Fjob','reason','guardian','schoolsup','famsup','paid','activities','nursery','higher','internet','romantic',]:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    X_test[column] = categorical_convert.fit_transform(X_test[column])
    encoders[column] = categorical_convert

In [22]:
X_train.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
42,0,1,15,1,0,1,4,4,3,4,...,0,4,3,3,1,1,5,2,19,18
0,0,0,18,1,0,0,4,4,0,4,...,0,4,3,4,1,1,3,6,5,6
345,0,0,18,1,0,1,3,2,2,3,...,1,5,4,3,2,3,1,7,13,13
375,1,0,18,0,0,1,1,1,2,2,...,0,4,3,2,1,2,4,2,8,8
74,0,0,16,1,0,1,3,3,2,3,...,0,4,3,3,2,4,5,54,11,12


In [23]:
y_train.head()

42     18
0       6
345    14
375    10
74     11
Name: G3, dtype: int64

In [24]:
X_test.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
40,0,0,16,1,1,1,2,2,2,2,...,1,3,3,3,1,2,3,25,7,10
44,0,0,16,1,1,1,2,2,2,0,...,0,4,3,3,2,2,5,14,10,10
357,1,0,17,1,1,0,3,2,3,2,...,1,1,2,3,1,2,5,2,12,12
59,0,0,16,1,0,1,4,2,3,2,...,0,4,2,3,1,1,5,2,15,16
348,0,0,17,1,0,1,4,3,1,2,...,1,4,4,3,1,3,4,0,13,15


In [25]:
rf = RandomForestClassifier(n_estimators = 100)
rf = rf.fit(X_train, y_train)

In [26]:
y_pred = rf.predict(X_test)

In [27]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3697478991596639


In [28]:
linear = linear_model.LinearRegression()
linear = linear.fit(X_train, y_train) 
acc = linear.score(X_test, y_test) 
print("Accuracy:", acc)

Accuracy: 0.8131116858130827


In [29]:
joblib.dump(linear, "./linear_reg2.joblib", compress=True)

['./linear_reg2.joblib']

In [30]:
joblib.dump(encoders, "./encoders_lin2.joblib", compress=True)

['./encoders_lin2.joblib']

In [31]:
print(y_pred)

[10 10 11 15 15  6  0 11  6  0 15 11 15  8 11 15 16 10  0 11  8  9  8 15
  0 18 15 11  9 10 10 14 15 15  8 10 11 10 12 10 15 14  9  0  9  8  9  0
 11  9 11 15 10 10 15 11 13  9  0 13  9  0  0  9 11 10 10  9 15 15 11 12
  9 13 13 14 11  0 12 11 13  8  8 15  0 13 10  9  0  8  0  6  8 13 15  8
 13  8  0  0 15 11 11 14 15 15  8  9 13 10 10 10 13 11  0 11 11  0  0]


In [32]:
print(y_test)

40     11
44      9
357    11
59     16
348    15
       ..
144     0
302    14
133    11
264     0
387     0
Name: G3, Length: 119, dtype: int64


In [56]:
data = {
            "school": "GP",
            "sex": "F",
            "age": 18,
            "address": "U",
            "famsize": "GT3",
            "Pstatus": "A",
            "Medu": 4,
            "Fedu": 4,
            "Mjob": "at_home",
            "Fjob": "teacher",
            "reason": "course",
            "guardian": "mother",
            "traveltime": 2,
            "studytime": 2,
            "failures": 0,
            "schoolsup": "yes",
            "famsup": "no",
            "paid": "no",
            "activities": "no",
            "nursery": "yes",
            "higher": "yes",
            "internet": "no",
            "romantic": "no",
            "famrel": 4,
            "freetime": 3,
            "goout": 4,
            "Dalc": 1,
            "Walc": 1,
            "health": 3,
            "absences": 6,
            "G1": 15,
            "G2": 16,
        }

In [57]:
class RandomForest:
    def __init__(self):
        self.encoders = encoders
        self.model = rf

    def preprocessing(self, input_data):
        # JSON to pandas DataFrame
        input_data = pd.DataFrame(input_data, index=[0])
        # convert categoricals
        for column in [
            "school",
            "sex",
            "address",
            "famsize",
            "Pstatus",
            "Mjob",
            "Fjob",
            "reason",
            "guardian",
            "schoolsup",
            "famsup",
            "paid",
            "activities",
            "nursery",
            "higher",
            "internet",
            "romantic",
        ]:
            categorical_convert = self.encoders[column]
            input_data[column] = categorical_convert.transform(input_data[column])

        return input_data

    def predict(self, input_data):
        return self.model.predict(input_data)
        

    def postprocessing(self, input_data):
        label = "fail"
        if input_data > 13:
            label = "pass"
        return {"prediction": input_data, "label": label, "status": "OK"}

    def compute_prediction(self, input_data):
        try:
            input_data = self.preprocessing(input_data)
            print(input_data)
            prediction = self.predict(input_data)  # only one sample
            print(prediction)
            prediction = self.postprocessing(prediction)
        except Exception as e:
            return {"status": "Error", "message": str(e), "prediction": self.predict(input_data)[0]}

        return prediction


In [58]:
print(data)

{'school': 'GP', 'sex': 'F', 'age': 18, 'address': 'U', 'famsize': 'GT3', 'Pstatus': 'A', 'Medu': 4, 'Fedu': 4, 'Mjob': 'at_home', 'Fjob': 'teacher', 'reason': 'course', 'guardian': 'mother', 'traveltime': 2, 'studytime': 2, 'failures': 0, 'schoolsup': 'yes', 'famsup': 'no', 'paid': 'no', 'activities': 'no', 'nursery': 'yes', 'higher': 'yes', 'internet': 'no', 'romantic': 'no', 'famrel': 4, 'freetime': 3, 'goout': 4, 'Dalc': 1, 'Walc': 1, 'health': 3, 'absences': 6, 'G1': 15, 'G2': 16}


In [59]:
RandomForest().compute_prediction(data)

   school  sex  age  address  famsize  Pstatus  Medu  Fedu  Mjob  Fjob  ...  \
0       0    0   18        1        0        0     4     4     0     4  ...   

   romantic  famrel  freetime  goout  Dalc  Walc  health  absences  G1  G2  
0         0       4         3      4     1     1       3         6  15  16  

[1 rows x 32 columns]
[15]


{'prediction': array([15]), 'label': 'pass', 'status': 'OK'}