# Import Library

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [104]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import time

# Improved Data

## Load Data

In [107]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
path = '/content/drive/MyDrive/Nhóm 1/2 Bài tập trên lớp/Thực hành Final/Data processed/final_dataset_improved.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,id,name,num_users,total_cmt,positive,negative,neutral,average_completion_rate,number of resources,rank_binned,course_classification
0,C_655852,series of courses-completed courses without re...,48,0.0,0.0,0.0,0.0,0.648872,162,9.0,Dissatisfied
1,C_655850,series of classes - class has not started yet,47,0.0,0.0,0.0,0.0,0.648872,91,1.0,Neutral
2,C_654554,series of courses - now starting,47,0.0,0.0,0.0,0.0,0.648872,99,4.481596,Dissatisfied
3,C_654506,series of courses - the course has ended and t...,47,0.0,0.0,0.0,0.0,0.648872,3,4.481596,Dissatisfied
4,C_629558,medical immunology and pathogen biology,48,0.0,0.0,0.0,0.0,0.648872,91,4.481596,Dissatisfied


In [109]:
df['course_classification'] = df['course_classification'].map({'Very dissatisfied': 0, 'Dissatisfied': 1, 'Neutral': 2, 'Satisfied': 3, 'Very satisfied': 4})

In [110]:
df['course_classification'].value_counts().sort_index()

Unnamed: 0_level_0,count
course_classification,Unnamed: 1_level_1
0,461
1,876
2,585
3,526
4,374


## Data Preparation

In [111]:
X = df.drop(columns=['id',
                     'name',
                     'course_classification',])
y = df['course_classification']

X.shape

(2822, 8)

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
scaler = PowerTransformer()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Raw Data

## Load Data

In [114]:
path = '/content/drive/MyDrive/Nhóm 1/2 Bài tập trên lớp/Thực hành Final/Data processed/final_dataset.csv'

df_raw = pd.read_csv(path)
df_raw.head()

Unnamed: 0,id,name,num_users,total_cmt,positive,negative,neutral,average_completion_rate,number of resources,rank_binned,course_classification
0,C_584313,"introduction to ""zi zhi tong jian""",3,,,,,,91,1.0,
1,C_584329,calculus - limit theory and functions of one v...,6,,,,,,170,1.0,
2,C_584329,calculus - limit theory and functions of one v...,6,,,,,,170,1.0,
3,C_584381,photojournalism,5,,,,,,127,1.0,
4,C_597208,data mining: theory and algorithms,10,,,,,,125,1.0,


In [115]:
df_raw['course_classification'] = df_raw['course_classification'].map({'Very dissatisfied': 0, 'Dissatisfied': 1, 'Neutral': 2, 'Satisfied': 3, 'Very satisfied': 4})

In [116]:
df_raw['course_classification'].value_counts().sort_index()

Unnamed: 0_level_0,count
course_classification,Unnamed: 1_level_1
0.0,577
1.0,597
2.0,546
3.0,571
4.0,570


## Data Preparation

In [117]:
df_raw = df_raw.dropna(subset=['course_classification'])
df_raw = df_raw.drop_duplicates(subset='id')

In [118]:
Xr = df_raw.drop(columns=['id',
                     'name',
                     'course_classification',])
yr = df_raw['course_classification']

Xr.shape

(312, 8)

Bộ dữ liệu được chia thành 3 tập là train, validation, test theo tỷ lệ 8:1:1

In [119]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)

In [120]:
scaler = PowerTransformer()
Xr_train = scaler.fit_transform(Xr_train)
Xr_test = scaler.transform(Xr_test)

# Support Vector Machine

## Improved data

In [121]:
model_svm = SVC(random_state=42)
model_svm.fit(X_train, y_train)

y_pred_svm = model_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80        94
           1       0.94      0.93      0.94       165
           2       0.82      0.79      0.81       111
           3       0.69      0.75      0.72       104
           4       0.75      0.70      0.73        91

    accuracy                           0.81       565
   macro avg       0.80      0.80      0.80       565
weighted avg       0.82      0.81      0.81       565



## Raw data

In [122]:
modelr_svm = SVC(random_state=42)
modelr_svm.fit(Xr_train, yr_train)

y_pred_svm_raw = modelr_svm.predict(Xr_test)
print(classification_report(yr_test, y_pred_svm_raw))

              precision    recall  f1-score   support

         0.0       0.75      0.43      0.55        14
         1.0       0.53      0.67      0.59        12
         2.0       0.72      0.76      0.74        17
         3.0       0.38      0.38      0.38         8
         4.0       0.57      0.67      0.62        12

    accuracy                           0.60        63
   macro avg       0.59      0.58      0.57        63
weighted avg       0.62      0.60      0.60        63



# Decision Tree Classifier

## Improved data

In [123]:
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X_train, y_train)

y_pred_tree = model_tree.predict(X_test)
print(classification_report(y_test, y_pred_tree))

              precision    recall  f1-score   support

           0       0.84      0.83      0.83        94
           1       0.93      0.92      0.93       165
           2       0.85      0.81      0.83       111
           3       0.65      0.73      0.69       104
           4       0.65      0.62      0.63        91

    accuracy                           0.80       565
   macro avg       0.78      0.78      0.78       565
weighted avg       0.80      0.80      0.80       565



## Raw data

In [124]:
modelr_tree = DecisionTreeClassifier(random_state=42)
modelr_tree.fit(Xr_train, yr_train)

y_pred_tree_raw = modelr_tree.predict(Xr_test)
print(classification_report(yr_test, y_pred_tree_raw))

              precision    recall  f1-score   support

         0.0       0.80      0.29      0.42        14
         1.0       0.45      0.75      0.56        12
         2.0       0.61      0.65      0.63        17
         3.0       0.11      0.12      0.12         8
         4.0       0.55      0.50      0.52        12

    accuracy                           0.49        63
   macro avg       0.50      0.46      0.45        63
weighted avg       0.55      0.49      0.48        63



# KNN

## Improved data

In [125]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

y_pred_knn = model_knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69        94
           1       0.92      0.94      0.93       165
           2       0.75      0.77      0.76       111
           3       0.55      0.60      0.57       104
           4       0.63      0.53      0.57        91

    accuracy                           0.74       565
   macro avg       0.71      0.71      0.71       565
weighted avg       0.73      0.74      0.73       565



## Raw data

In [126]:
modelr_knn = KNeighborsClassifier()
modelr_knn.fit(Xr_train, yr_train)

y_pred_knn_raw = modelr_knn.predict(Xr_test)
print(classification_report(yr_test, y_pred_knn_raw))

              precision    recall  f1-score   support

         0.0       0.50      0.36      0.42        14
         1.0       0.44      0.58      0.50        12
         2.0       0.63      0.71      0.67        17
         3.0       0.14      0.12      0.13         8
         4.0       0.45      0.42      0.43        12

    accuracy                           0.48        63
   macro avg       0.43      0.44      0.43        63
weighted avg       0.47      0.48      0.47        63



# Random Forest

## Improved data

In [127]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91        94
           1       0.95      0.93      0.94       165
           2       0.86      0.81      0.83       111
           3       0.65      0.84      0.73       104
           4       0.82      0.68      0.74        91

    accuracy                           0.84       565
   macro avg       0.84      0.83      0.83       565
weighted avg       0.85      0.84      0.84       565



## Raw data

In [128]:
modelr_rf = RandomForestClassifier(random_state=42)
modelr_rf.fit(Xr_train, yr_train)

y_pred_rf_raw = modelr_rf.predict(Xr_test)
print(classification_report(yr_test, y_pred_rf_raw))

              precision    recall  f1-score   support

         0.0       0.67      0.43      0.52        14
         1.0       0.47      0.58      0.52        12
         2.0       0.65      0.76      0.70        17
         3.0       0.25      0.25      0.25         8
         4.0       0.45      0.42      0.43        12

    accuracy                           0.52        63
   macro avg       0.50      0.49      0.49        63
weighted avg       0.53      0.52      0.52        63



# LightGBM

## Improved data

In [129]:
model_lgbm = LGBMClassifier(random_state=42)
model_lgbm.fit(X_train, y_train)

y_pred_lgbm = model_lgbm.predict(X_test)
print(classification_report(y_test, y_pred_lgbm))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1539
[LightGBM] [Info] Number of data points in the train set: 2257, number of used features: 8
[LightGBM] [Info] Start training from score -1.816430
[LightGBM] [Info] Start training from score -1.155119
[LightGBM] [Info] Start training from score -1.560584
[LightGBM] [Info] Start training from score -1.676786
[LightGBM] [Info] Start training from score -2.076345
              precision    recall  f1-score   support

           0       0.92      0.91      0.92        94
           1       0.93      0.95      0.94       165
           2       0.89      0.86      0.88       111
           3       0.77      0.82      0.79       104
           4       0.84      0.79      0.81        91

    accuracy                           0.88       565
   macro avg       0.87      0.87      0.87       565
weighted 

## Raw data

In [130]:
modelr_lgbm = LGBMClassifier(random_state=42)
modelr_lgbm.fit(Xr_train, yr_train)

y_pred_lgbm_raw = modelr_lgbm.predict(Xr_test)
print(classification_report(yr_test, y_pred_lgbm_raw))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 402
[LightGBM] [Info] Number of data points in the train set: 249, number of used features: 8
[LightGBM] [Info] Start training from score -1.605430
[LightGBM] [Info] Start training from score -1.528469
[LightGBM] [Info] Start training from score -1.457010
[LightGBM] [Info] Start training from score -1.667305
[LightGBM] [Info] Start training from score -1.828573
              precision    recall  f1-score   support

         0.0       0.67      0.43      0.52        14
         1.0       0.50      0.58      0.54        12
         2.0       0.65      0.76      0.70        17
         3.0       0.33      0.38      0.35         8
         4.0       0.55      0.50      0.52        12

    accuracy                           0.56        63
   macro avg       0.54      0.53      0.53        63
weighted av

# CatBoost

## Improved data

In [131]:
model_cb = CatBoostClassifier(verbose=0)
model_cb.fit(X_train, y_train)

y_pred_cb = model_cb.predict(X_test)
print(classification_report(y_test, y_pred_cb))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87        94
           1       0.97      0.94      0.95       165
           2       0.85      0.90      0.88       111
           3       0.78      0.81      0.79       104
           4       0.84      0.81      0.83        91

    accuracy                           0.87       565
   macro avg       0.86      0.86      0.86       565
weighted avg       0.88      0.87      0.87       565



## Raw data

In [132]:
modelr_cb = CatBoostClassifier(verbose=0)
modelr_cb.fit(Xr_train, yr_train)

y_pred_cb_raw = modelr_cb.predict(Xr_test)
print(classification_report(yr_test, y_pred_cb_raw))

              precision    recall  f1-score   support

         0.0       0.75      0.43      0.55        14
         1.0       0.50      0.67      0.57        12
         2.0       0.82      0.82      0.82        17
         3.0       0.57      0.50      0.53         8
         4.0       0.53      0.67      0.59        12

    accuracy                           0.63        63
   macro avg       0.64      0.62      0.61        63
weighted avg       0.66      0.63      0.63        63

