In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import keras
from tensorflow.keras import Model

%matplotlib inline
from termcolor import colored
plt.rcParams['axes.unicode_minus'] = False
pd.reset_option('display.float_format')
pd.set_option('display.max_columns', None)

color_scheme = px.colors.qualitative.Pastel

In [12]:
data= pd.read_csv("/content/students_adaptability_level_online_education.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               1205 non-null   object
 1   Age                  1205 non-null   object
 2   Education Level      1205 non-null   object
 3   Institution Type     1205 non-null   object
 4   IT Student           1205 non-null   object
 5   Location             1205 non-null   object
 6   Load-shedding        1205 non-null   object
 7   Financial Condition  1205 non-null   object
 8   Internet Type        1205 non-null   object
 9   Network Type         1205 non-null   object
 10  Class Duration       1205 non-null   object
 11  Self Lms             1205 non-null   object
 12  Device               1205 non-null   object
 13  Adaptivity Level     1205 non-null   object
dtypes: object(14)
memory usage: 131.9+ KB


In [13]:
data

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,Boy,21-25,University,Non Government,No,Yes,Low,Mid,Wifi,4G,3-6,No,Tab,Moderate
1,Girl,21-25,University,Non Government,No,Yes,High,Mid,Mobile Data,4G,1-3,Yes,Mobile,Moderate
2,Girl,16-20,College,Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Moderate
3,Girl,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
4,Girl,16-20,School,Non Government,No,Yes,Low,Poor,Mobile Data,3G,0,No,Mobile,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,Girl,16-20,College,Non Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Low
1201,Girl,16-20,College,Non Government,No,No,High,Mid,Wifi,4G,3-6,No,Mobile,Moderate
1202,Boy,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,3G,1-3,No,Mobile,Moderate
1203,Girl,16-20,College,Non Government,No,No,Low,Mid,Wifi,4G,1-3,No,Mobile,Low


### Label Encoding

In [14]:
columns = ['Gender','Age','Education Level','Institution Type','IT Student','Location',
           'Load-shedding','Financial Condition','Internet Type','Network Type',
           'Class Duration','Self Lms','Device','Adaptivity Level']

In [15]:
for column in columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

In [16]:
data.describe()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
count,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0,1205.0
mean,0.449793,2.122822,1.19668,0.682988,0.252282,0.775934,0.833195,0.341909,0.423237,1.627386,1.047303,0.174274,0.890456,1.435685
std,0.497679,1.210359,0.722437,0.465506,0.434503,0.417139,0.372956,0.605302,0.494277,0.515295,0.548559,0.379502,0.384003,0.642013
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
50%,0.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0
75%,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,2.0
max,1.0,5.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0


In [17]:
data

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,0,3,2,1,0,1,1,0,1,2,2,0,2,2
1,1,3,2,1,0,1,0,0,0,2,1,1,1,2
2,1,2,0,0,0,1,1,0,1,2,1,0,1,2
3,1,1,1,1,0,1,1,0,0,2,1,0,1,2
4,1,2,1,1,0,1,1,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,1,2,0,1,0,1,1,0,1,2,1,0,1,1
1201,1,2,0,1,0,0,0,0,1,2,2,0,1,2
1202,0,1,1,1,0,1,1,0,0,1,1,0,1,2
1203,1,2,0,1,0,0,1,0,1,2,1,0,1,1


### Train-Test-Split

In [18]:
y = data['Adaptivity Level']
X = data.drop('Adaptivity Level', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [19]:
print(colored('Shape of X_train: ','blue'), X_train.shape, '\n\n')

print(colored('Shape of X_test: ', 'red'), X_test.shape)

Shape of X_train:  (964, 13) 


Shape of X_test:  (241, 13)


In [30]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [44]:
from sklearn.model_selection import GridSearchCV
from xgboost                 import XGBClassifier


In [31]:
X

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device
0,0,3,2,1,0,1,1,0,1,2,2,0,2
1,1,3,2,1,0,1,0,0,0,2,1,1,1
2,1,2,0,0,0,1,1,0,1,2,1,0,1
3,1,1,1,1,0,1,1,0,0,2,1,0,1
4,1,2,1,1,0,1,1,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,1,2,0,1,0,1,1,0,1,2,1,0,1
1201,1,2,0,1,0,0,0,0,1,2,2,0,1
1202,0,1,1,1,0,1,1,0,0,1,1,0,1
1203,1,2,0,1,0,0,1,0,1,2,1,0,1


In [32]:
#Re-scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
X_train_scaled

array([[ 1.12143267, -1.76894005, -0.26449442, ..., -0.08673072,
         2.23328853,  0.27451836],
       [-0.89171648, -0.93670081, -0.26449442, ..., -0.08673072,
        -0.44777018,  0.27451836],
       [-0.89171648, -0.93670081, -0.26449442, ..., -0.08673072,
        -0.44777018,  0.27451836],
       ...,
       [-0.89171648, -0.93670081, -0.26449442, ..., -0.08673072,
        -0.44777018,  0.27451836],
       [-0.89171648,  0.72777768,  1.11373598, ..., -1.904305  ,
        -0.44777018,  0.27451836],
       [-0.89171648,  0.72777768,  1.11373598, ...,  1.73084355,
        -0.44777018, -2.34563704]])

In [34]:
X_test_scaled

array([[ 1.12143267, -0.93670081, -0.26449442, ..., -0.08673072,
        -0.44777018, -2.34563704],
       [-0.89171648, -0.10446156, -1.64272482, ..., -0.08673072,
        -0.44777018,  0.27451836],
       [-0.89171648, -0.10446156, -1.64272482, ..., -0.08673072,
         2.23328853,  2.89467375],
       ...,
       [ 1.12143267,  2.39225617, -0.26449442, ..., -0.08673072,
        -0.44777018,  0.27451836],
       [-0.89171648, -0.93670081, -0.26449442, ..., -0.08673072,
        -0.44777018,  0.27451836],
       [ 1.12143267,  0.72777768,  1.11373598, ...,  1.73084355,
        -0.44777018, -2.34563704]])

Model Implementation

In [67]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier()
}

In [68]:
#applying each model and storing results
results = {}
for model_name, model in models.items():
    # Training the model
    model.fit(X_train_scaled, y_train)

Make Predictions

In [47]:
y_pred = model.predict(X_test_scaled)

Model Evaluation

In [50]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [69]:
#Storing the results
results[model_name] = {"accuracy": accuracy, "report": report}

results

{'XGBClassifier': {'accuracy': 0.7136929460580913,
  'report': '              precision    recall  f1-score   support\n\n           0       0.50      0.61      0.55        23\n           1       0.85      0.61      0.71       103\n           2       0.68      0.83      0.75       115\n\n    accuracy                           0.71       241\n   macro avg       0.68      0.68      0.67       241\nweighted avg       0.74      0.71      0.71       241\n'}}

In [70]:
def evaluate_metrics(actual, predict):

    r2  = r2_score(actual, predict )
    f1  = f1_score(actual, predict, average = 'micro')
    accuracy = accuracy_score(actual, predict)



    return (r2, f1, accuracy)

HyperParamater Tuning

In [71]:
# Define each model and parameters for hyperparameter tuning

models = {
    "DecisionTreeClassifier"     :{
        'model' :DecisionTreeClassifier(),
        'param' :{
            'criterion': ['gini', 'entropy'], #'log_loss
            'splitter' : ['best', 'random'],
            'max_depth': [1,2,3,4],
            'min_samples_split': [2,3,4],
            'min_samples_leaf': [1,2,3],
        }
    },
    "LogisticRegression"         : {
        'model' : LogisticRegression(),
        'param' : {
            'C'      : [1, 0.1, 0.01, 0.001],
            'penalty': ['l1', 'l2'], #'elasticnet'
            'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
            'multi_class': ['ovr'],
            'max_iter' : [1000]
            # 'l1_ratio'   : [0.1,0.3,0.5,0.7,0.9]
        }
    },


    "RandomForestClassifier"     : {
        'model' : RandomForestClassifier(),
        'param' : {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [1,2,3,4],
            'min_samples_split': [2,3,4],
            'min_samples_leaf': [1,2,3],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample',None]
        }
    },
    "XGBClassifier"              : {
        'model' : XGBClassifier(),
        'param' : {
            'booster' : [ 'gbtree', 'gblinear' ,'dart'],
            'eta'     : [0.1 ,0.3 , 0.5, 0.7],
            'gamma'   : [0.1, 1, 10],
            'min_child_weight' : [ 1 ,2 ,3 ],
            'lambda'  : [0, 0.1, 0.4, 0.7, 1],
            'verbosity': [0]
        }
    },

  }

In [73]:
from tqdm import tqdm

final_model = {
    'model_name': [],
    'models':[],
    'r2_score':[],
    'f1_score':[],
    'accuracy_score':[],

}
for model_name , model_info in tqdm(models.items()):

    grid = GridSearchCV(model_info["model"],model_info["param"],refit=True, verbose=0)

    grid.fit(X_train,y_train)

    y_pred = grid.predict(X_test)

    r2, f1, accuracy = evaluate_metrics(actual = y_test, predict= y_pred)

    final_model['model_name'        ].append(model_name)
    final_model['models'             ].append(grid.best_estimator_)
    final_model['r2_score'          ].append(r2)
    final_model['f1_score'          ].append(f1)
    final_model['accuracy_score'].append(accuracy)

60 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

-------------------------------------

In [74]:
final_model = pd.DataFrame(final_model)
final_model.sort_values('f1_score', ascending= False , inplace= True)
final_model

Unnamed: 0,model_name,models,r2_score,f1_score,accuracy_score
3,XGBClassifier,"XGBClassifier(base_score=None, booster='gbtree...",0.387634,0.875519,0.875519
2,RandomForestClassifier,"(DecisionTreeClassifier(criterion='log_loss', ...",-0.156691,0.717842,0.717842
1,LogisticRegression,"LogisticRegression(C=0.1, max_iter=1000, multi...",-0.08865,0.684647,0.684647
0,DecisionTreeClassifier,"DecisionTreeClassifier(criterion='entropy', ma...",-0.467734,0.609959,0.609959
