In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
df = pd.read_csv('/content/students_adaptability_level_online_education.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               1205 non-null   object
 1   Age                  1205 non-null   object
 2   Education Level      1205 non-null   object
 3   Institution Type     1205 non-null   object
 4   IT Student           1205 non-null   object
 5   Location             1205 non-null   object
 6   Load-shedding        1205 non-null   object
 7   Financial Condition  1205 non-null   object
 8   Internet Type        1205 non-null   object
 9   Network Type         1205 non-null   object
 10  Class Duration       1205 non-null   object
 11  Self Lms             1205 non-null   object
 12  Device               1205 non-null   object
 13  Adaptivity Level     1205 non-null   object
dtypes: object(14)
memory usage: 131.9+ KB


In [None]:
df.head()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,Boy,21-25,University,Non Government,No,Yes,Low,Mid,Wifi,4G,3-6,No,Tab,Moderate
1,Girl,21-25,University,Non Government,No,Yes,High,Mid,Mobile Data,4G,1-3,Yes,Mobile,Moderate
2,Girl,16-20,College,Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Moderate
3,Girl,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
4,Girl,16-20,School,Non Government,No,Yes,Low,Poor,Mobile Data,3G,0,No,Mobile,Low


In [None]:
df.nunique()

Unnamed: 0,0
Gender,2
Age,6
Education Level,3
Institution Type,2
IT Student,2
Location,2
Load-shedding,2
Financial Condition,3
Internet Type,2
Network Type,3


In [None]:
LE = LabelEncoder()
label_cols = ['Gender', 'Institution Type', 'IT Student', 'Location', 'Load-shedding', 'Internet Type', 'Self Lms']
for cols in label_cols:
  df[cols] = LE.fit_transform(df[cols])

In [None]:
df = pd.get_dummies(df, columns=['Device'], drop_first=True)

In [None]:
OE = OrdinalEncoder()
ordinal_cols = ['Age', 'Education Level', 'Financial Condition', 'Network Type', 'Class Duration', 'Adaptivity Level']
df[ordinal_cols] = OE.fit_transform(df[ordinal_cols])

In [None]:
df.head()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Adaptivity Level,Device_Mobile,Device_Tab
0,0,3.0,2.0,1,0,1,1,0.0,1,2.0,2.0,0,2.0,False,True
1,1,3.0,2.0,1,0,1,0,0.0,0,2.0,1.0,1,2.0,True,False
2,1,2.0,0.0,0,0,1,1,0.0,1,2.0,1.0,0,2.0,True,False
3,1,1.0,1.0,1,0,1,1,0.0,0,2.0,1.0,0,2.0,True,False
4,1,2.0,1.0,1,0,1,1,1.0,0,1.0,0.0,0,1.0,True,False


In [None]:
X = df.drop(columns=['Adaptivity Level'])
y = df['Adaptivity Level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [None]:
knn_y_pred = knn.predict(X_test)

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [None]:
dt_y_pred = dt.predict(X_test)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
gnb_y_pred = gnb.predict(X_test)

In [None]:
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

In [None]:
rf_y_pred = rf.predict(X_test)

In [None]:
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
svc_y_pred = svc.predict(X_test)

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
lr_y_pred = lr.predict(X_test)

In [None]:
models = {
    'KNN' : knn_y_pred,
    'DecisionTree' : dt_y_pred,
    'GaussianNB' : gnb_y_pred,
    'RandomForest' : rf_y_pred,
    'SVC' : svc_y_pred,
    'LogisticRegression' : lr_y_pred
}

for name, y_pred in models.items():
  print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
  print('\n', classification_report(y_test, y_pred))

KNN Accuracy: 0.7925

               precision    recall  f1-score   support

         0.0       0.69      0.48      0.56        23
         1.0       0.81      0.84      0.83       103
         2.0       0.79      0.81      0.80       115

    accuracy                           0.79       241
   macro avg       0.76      0.71      0.73       241
weighted avg       0.79      0.79      0.79       241

DecisionTree Accuracy: 0.9004

               precision    recall  f1-score   support

         0.0       0.83      0.65      0.73        23
         1.0       0.93      0.92      0.93       103
         2.0       0.88      0.93      0.91       115

    accuracy                           0.90       241
   macro avg       0.88      0.83      0.86       241
weighted avg       0.90      0.90      0.90       241

GaussianNB Accuracy: 0.6307

               precision    recall  f1-score   support

         0.0       0.62      0.57      0.59        23
         1.0       0.58      0.81      0.67 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
y_train_pred = dt.predict(X_train)
print(accuracy_score(y_train, y_train_pred))

0.9346473029045643


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],  # splitting criteria
    "max_depth": [None, 5, 10, 15, 20],            # depth of tree
    "min_samples_split": [2, 5, 10],               # min samples to split a node
    "min_samples_leaf": [1, 2, 5],                 # min samples at leaf node
    "max_features": [None, "sqrt", "log2"]         # number of features to consider
}

# Grid Search
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1            # use all processors
)

# Fit on training data
grid_search.fit(X_train, y_train)

# Best params and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Evaluate on test set
best_dt = grid_search.best_estimator_
test_acc = best_dt.score(X_test, y_test)
print("Test Accuracy after tuning:", test_acc)

Best Parameters: {'criterion': 'log_loss', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV Score: 0.9128562176165802
Test Accuracy after tuning: 0.8796680497925311


In [None]:
print(best_dt.score(X_train, y_train))

0.9346473029045643
