In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report

In [74]:
df=pd.read_csv("online_course_engagement_data.csv")
df

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.802640,1,5,62.615970,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0
...,...,...,...,...,...,...,...,...,...
8995,8757,Health,37.445225,14,4,54.469359,32.990704,1,0
8996,894,Science,48.631443,7,7,59.413257,0.254625,0,0
8997,6323,Health,38.212512,3,3,69.508297,70.188159,1,0
8998,3652,Health,70.048665,13,10,79.655182,72.975225,1,1


In [75]:
num_var=[features for features in df.columns if df[features].dtype!="O"]
cat_var=[features for features in df.columns if df[features].dtype=="O"]
print(f"Numerical Variable : {num_var}\nCategorical Variable : {cat_var}")

Numerical Variable : ['UserID', 'TimeSpentOnCourse', 'NumberOfVideosWatched', 'NumberOfQuizzesTaken', 'QuizScores', 'CompletionRate', 'DeviceType', 'CourseCompletion']
Categorical Variable : ['CourseCategory']


In [76]:
le=LabelEncoder()
df["CourseCategory"]=le.fit_transform(df["CourseCategory"])

In [77]:
correlation=df.corr()
correlation

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
UserID,1.0,-0.013553,0.009134,0.018368,-0.001164,-0.010824,-0.006239,0.006026,0.021093
CourseCategory,-0.013553,1.0,0.003126,0.00375,0.008966,-0.016746,-0.008146,0.012397,-0.006694
TimeSpentOnCourse,0.009134,0.003126,1.0,0.019833,-0.012562,-0.006152,0.020934,0.004144,0.182553
NumberOfVideosWatched,0.018368,0.00375,0.019833,1.0,-0.020059,-0.017497,0.016257,0.007403,0.23436
NumberOfQuizzesTaken,-0.001164,0.008966,-0.012562,-0.020059,1.0,-0.012295,0.006892,0.015973,0.284504
QuizScores,-0.010824,-0.016746,-0.006152,-0.017497,-0.012295,1.0,-0.012303,0.004225,0.299983
CompletionRate,-0.006239,-0.008146,0.020934,0.016257,0.006892,-0.012303,1.0,-0.004767,0.326139
DeviceType,0.006026,0.012397,0.004144,0.007403,0.015973,0.004225,-0.004767,1.0,0.007551
CourseCompletion,0.021093,-0.006694,0.182553,0.23436,0.284504,0.299983,0.326139,0.007551,1.0


In [78]:
category_counts=df["CourseCompletion"].value_counts()

In [79]:
percentaged_of_0=(category_counts[0]/len(df["CourseCompletion"]))*100
percentaged_of_1=(category_counts[1]/len(df["CourseCompletion"]))*100
percentaged_of_0,percentaged_of_1

(60.355555555555554, 39.644444444444446)

In [80]:
y=np.array(df["CourseCompletion"])

In [81]:
x=df.drop(columns=["CourseCompletion","UserID"])


In [82]:
smote=SMOTE(sampling_strategy="minority")
x,y=smote.fit_resample(x,y)

In [83]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [85]:
clf=RandomForestClassifier(n_estimators=100,random_state=42)
clf.fit(x_train,y_train)

In [86]:
y_pred=clf.predict(x_test)

In [87]:
print(y_test.dtype,y_pred.dtype)

int64 int64


In [88]:
precision=precision_score(y_test,y_pred)
cm=confusion_matrix(y_test,y_pred)
report=classification_report(y_test,y_pred)
score=clf.score(x_test,y_test)
print(f"precision score : {precision}\nscore : {score}")
print(f"confusion matrix : \n{cm}\nReport :\n {report}")

precision score : 0.9760306807286673
score : 0.9572020248504371
confusion matrix : 
[[1062   25]
 [  68 1018]]
Report :
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      1087
           1       0.98      0.94      0.96      1086

    accuracy                           0.96      2173
   macro avg       0.96      0.96      0.96      2173
weighted avg       0.96      0.96      0.96      2173



In [89]:
import joblib
joblib.dump(clf,"course_engagement_clf_model.pkl")

['course_engagement_clf_model.pkl']

In [90]:
model=joblib.load("course_engagement_clf_model.pkl")