<a href="https://colab.research.google.com/github/Nourelimanehed/user_classification_comp/blob/main/UserClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the relevant libraries

In [1]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np

# Data Preprocessing

### Importing the Dataset

In [None]:
# Read the source CSV from the working directory and preview the first rows.
raw_data = pd.read_csv(filepath_or_buffer='ml_datasource.csv')
raw_data.head(n=5)

In [None]:
# Work on an independent copy so raw_data stays untouched for reference.
data = raw_data.copy(deep=True)

### Removing Outliers

In [None]:

# Draw a kernel density estimate for each numerical feature on a 3x2 grid
# to inspect the distributions before any rows are filtered out.
sns.reset_orig()
sns.set(font_scale=1.5)
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))

kde_columns = ['days_on_platform',
               'minutes_watched',
               'courses_started',
               'practice_exams_started',
               'practice_exams_passed',
               'minutes_spent_on_exams']

# axes.flat walks the grid row by row, so each column lands on the same
# subplot as in a manual [row, col] assignment.
for column_name, axis in zip(kde_columns, axes.flat):
    sns.kdeplot(data=data[column_name], ax=axis)

plt.show()

In [None]:
# Keep only the rows whose engagement metrics are at or below the chosen
# cut-offs; rows failing any one of the four conditions are discarded.
within_limits = (
    data['minutes_watched'].le(1000)
    & data['courses_started'].le(10)
    & data['practice_exams_started'].le(10)
    & data['minutes_spent_on_exams'].le(40)
)
data_no_outliers = data[within_limits]

In [None]:
# Reset seaborn's plotting context, then redraw the same 3x2 grid of density
# plots on the filtered data to see the effect of the outlier removal.
sns.reset_orig()
sns.set(font_scale=1.5)
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))

kde_columns = ['days_on_platform',
               'minutes_watched',
               'courses_started',
               'practice_exams_started',
               'practice_exams_passed',
               'minutes_spent_on_exams']

# axes.flat iterates the subplots row by row, matching the column order above.
for column_name, axis in zip(kde_columns, axes.flat):
    sns.kdeplot(data=data_no_outliers[column_name], ax=axis)

plt.show()

### Checking for Multicollinearity

In [None]:
# List the column names as a plain array.
data_no_outliers.columns.values

In [None]:
# Numerical predictors whose collinearity is measured with the
# Variance Inflation Factor (VIF).
vif_columns = ['days_on_platform',
               'minutes_watched',
               'courses_started',
               'practice_exams_started',
               'practice_exams_passed',
               'minutes_spent_on_exams']
variables = data_no_outliers[vif_columns]

# NOTE(review): no constant column is added to the matrix passed to
# variance_inflation_factor — confirm that this is intended.
design_matrix = variables.to_numpy()

# One VIF per feature, listed in the same order as the columns.
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])],
    'features': variables.columns,
})

vif

In [None]:

# Drop 'practice_exams_started' from the feature set and preview the result.
data_no_mult = data_no_outliers.drop(columns=['practice_exams_started'])
data_no_mult.head(n=5)

In [None]:
# Recompute the VIFs after 'practice_exams_started' has been removed.
# The columns are taken from data_no_mult — the frame the notebook carries
# forward — so the check is run on the data that is actually used later.
variables = data_no_mult[['days_on_platform',
                          'minutes_watched',
                          'courses_started',
                          'practice_exams_passed',
                          'minutes_spent_on_exams']]

# Convert once; the same matrix is reused for every feature index.
design_matrix = variables.to_numpy()

# Computing the new VIF values for each selected feature
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])]
vif["features"] = variables.columns
vif

### Dealing with NaN Values

In [None]:
# Count the missing values in every column.
data_no_mult.isna().sum()

In [None]:
# Show the rows in which 'student_country' is missing.
data_no_mult[data_no_mult['student_country'].isnull()]

In [None]:
# Replace missing country codes with the placeholder 'NAM'.
# The fill is restricted to 'student_country': a scalar fill over the whole
# frame would also write the string 'NAM' into any numeric column that has
# a missing value, which would later break the conversion to float arrays.
# NOTE(review): presumably these are the code 'NA', which read_csv parses as
# missing by default — confirm against the source data.
data_no_nulls = data_no_mult.fillna({'student_country': 'NAM'})

In [None]:
# Show the country values that now hold the 'NAM' placeholder.
data_no_nulls.loc[data_no_nulls['student_country'].eq('NAM'), 'student_country']

In [None]:
# Re-count the missing values per column after the fill.
data_no_nulls.isna().sum()

### Splitting the Data

In [None]:
# Separate the label column 'purchased' from the predictor columns.
target = data_no_nulls['purchased']
inputs = data_no_nulls.drop(columns=['purchased'])

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the
# target and seeded so that it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=365,
    stratify=target,
)

In [None]:
# Preview the training predictors.
x_train.head(n=5)

### Encoding the Data

In [None]:
# Ordinal encoder for the country column. Categories not seen during fit are
# mapped to the fixed code 170 instead of raising an error.
# NOTE(review): the unknown code must differ from every code assigned during
# fit, so 170 presumably assumes fewer than 171 distinct countries in the
# training split — TODO confirm.
enc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=170,
)

In [None]:

# Encode 'student_country' as ordinal codes. The encoder is fitted on the
# training split only; the test split is transformed with that fitted
# encoder, so countries unseen in training get its configured unknown_value.
country_train = x_train[['student_country']].to_numpy()
country_test = x_test[['student_country']].to_numpy()

# .assign/.drop build new frames rather than writing a column into the frames
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
# The original 'student_country' column is dropped once it has been encoded.
x_train = (
    x_train
    .assign(student_country_enc=enc.fit_transform(country_train).ravel())
    .drop(columns='student_country')
)
x_test = (
    x_test
    .assign(student_country_enc=enc.transform(country_test).ravel())
    .drop(columns='student_country')
)

x_train.head()

In [None]:

# Convert the splits to NumPy arrays: predictors as floating-point values,
# targets as integers.
x_train_array, x_test_array = (
    np.asarray(frame, dtype='float') for frame in (x_train, x_test)
)
y_train_array, y_test_array = (
    np.asarray(labels, dtype='int') for labels in (y_train, y_test)
)

# Creating a Logistic Regression Model

In [None]:
# Fit a logistic regression with statsmodels and display the fit summary.
# NOTE(review): the design matrix is passed as-is, with no constant column
# added, so the model is fitted without an intercept — confirm this is intended.
log_reg = sm.Logit(endog=y_train_array, exog=x_train_array)
log_reg_results = log_reg.fit()
log_reg_results.summary()

In [None]:
# Predict the purchase probability for the whole test set in one call, then
# round to the nearest integer to obtain hard 0/1 labels.
# Calling predict once replaces re-running it on the full test set for every
# single element. np.round rounds halves to even, like the built-in round().
# The result is an integer array, matching the dtype of y_test_array.
y_test_pred_log_reg = np.round(log_reg_results.predict(x_test_array)).astype(int)

In [None]:
# Restore the default matplotlib styling, then plot the test-set confusion
# matrix for the logistic regression predictions.
sns.reset_orig()
ConfusionMatrixDisplay.from_predictions(y_true=y_test_array,
                                        y_pred=y_test_pred_log_reg,
                                        cmap='magma')

plt.show()

# Creating a K-Nearest Neighbors Model

In [None]:
# Hyper-parameter grid for KNN: neighbour counts 1 to 50 under both
# weighting schemes.
parameters_knn = dict(
    n_neighbors=range(1, 51),
    weights=['uniform', 'distance'],
)

In [None]:
# Exhaustive search over the KNN grid, ranking candidates by accuracy under
# the default cross-validation settings.
grid_search_knn = GridSearchCV(
    KNeighborsClassifier(),
    parameters_knn,
    scoring='accuracy',
)

In [None]:
# Run the grid search on the training arrays.
# NOTE(review): KNN is fitted on the unscaled features, whereas the SVC
# section rescales them first — confirm that this is intended.
grid_search_knn.fit(X=x_train_array, y=y_train_array)

In [None]:
# Best parameter combination and its mean cross-validated score.
(grid_search_knn.best_params_, grid_search_knn.best_score_)

In [None]:
# Store the best estimator (the model with the best-scoring parameters found
# by the grid search) in knn_clf.
knn_clf = grid_search_knn.best_estimator_

# Display the selected estimator and its parameters.
knn_clf

In [None]:

# Predict the test labels with the tuned KNN model.
y_test_pred_knn = knn_clf.predict(X=x_test_array)

# Restore the default matplotlib styling, then plot the confusion matrix.
sns.reset_orig()
ConfusionMatrixDisplay.from_predictions(y_true=y_test_array,
                                        y_pred=y_test_pred_knn,
                                        labels=knn_clf.classes_,
                                        cmap='magma')

plt.show()

In [None]:

# Per-class precision, recall and F1 for the KNN predictions.
print(
    classification_report(
        y_true=y_test_array,
        y_pred=y_test_pred_knn,
        target_names=['0', '1'],
    )
)

# Creating a Support Vector Machines Model

In [None]:

# Rescale every feature to the range [-1, 1] for the SVC. The scaler learns
# its ranges from the training set only and is then applied to both splits.
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(x_train_array)
x_train_array_svc = scaling.transform(x_train_array)
x_test_array_svc = scaling.transform(x_test_array)

In [None]:

# Hyper-parameter grid for the SVC: three kernels, C from 1 to 10, and both
# built-in gamma heuristics.
parameters_svc = dict(
    kernel=['linear', 'poly', 'rbf'],
    C=range(1, 11),
    gamma=['scale', 'auto'],
)

In [None]:

# Exhaustive search over the SVC grid, ranking candidates by accuracy under
# the default cross-validation settings.
grid_search_svc = GridSearchCV(
    SVC(),
    parameters_svc,
    scoring='accuracy',
)

In [None]:

# Run the grid search on the rescaled training features.
grid_search_svc.fit(X=x_train_array_svc, y=y_train_array)

In [None]:

# Display the SVC refitted with the best-scoring parameter combination.
grid_search_svc.best_estimator_

In [None]:

# Keep the best estimator found by the grid search as the final SVC model.
svc_clf = grid_search_svc.best_estimator_

In [None]:

# Predict the test labels from the rescaled test features.
y_test_pred_svc = svc_clf.predict(X=x_test_array_svc)

# Restore the default matplotlib styling, then plot the confusion matrix.
sns.reset_orig()
ConfusionMatrixDisplay.from_predictions(y_true=y_test_array,
                                        y_pred=y_test_pred_svc,
                                        labels=svc_clf.classes_,
                                        cmap='magma')
plt.show()

In [None]:

# Per-class precision, recall and F1 for the SVC predictions.
print(
    classification_report(
        y_true=y_test_array,
        y_pred=y_test_pred_svc,
        target_names=['0', '1'],
    )
)

# Creating a Decision Tree Model

In [None]:
# Hyper-parameter grid for the decision tree: candidate values of the
# cost-complexity pruning parameter ccp_alpha.
parameters_dt = {
    'ccp_alpha': [0, 0.001, 0.002, 0.003, 0.004, 0.005],
}

In [None]:
# Exhaustive search over the pruning grid with a seeded decision tree,
# ranking candidates by accuracy.
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=365),
    parameters_dt,
    scoring='accuracy',
)

In [None]:
# Run the grid search on the training arrays.
grid_search_dt.fit(X=x_train_array, y=y_train_array)

In [None]:
# Display the decision tree refitted with the best-scoring ccp_alpha.
grid_search_dt.best_estimator_

In [None]:
# Keep the best estimator found by the grid search as the final tree model.
dt_clf = grid_search_dt.best_estimator_

In [None]:

# Human-readable labels for the tree plot.
# NOTE(review): the feature labels are hard-coded and presumably follow the
# column order of x_train — verify if the feature set changes.
tree_feature_labels = ['Days on platform',
                       'Minutes watched',
                       'Courses started',
                       'Practice exams passed',
                       'Time spent on exams',
                       'Student country encoded']
tree_class_labels = ['Will not purchase',
                     'Will purchase']

# Draw the fitted decision tree with filled nodes.
plt.figure(figsize=(15, 10))
plot_tree(dt_clf,
          filled=True,
          feature_names=tree_feature_labels,
          class_names=tree_class_labels)

plt.show()

In [None]:
# Predict the test labels with the tuned decision tree.
y_test_pred_dt = dt_clf.predict(X=x_test_array)

In [None]:

# Restore the default matplotlib styling, then plot the test-set confusion
# matrix for the decision tree predictions.
sns.reset_orig()
ConfusionMatrixDisplay.from_predictions(y_true=y_test_array,
                                        y_pred=y_test_pred_dt,
                                        labels=dt_clf.classes_,
                                        cmap='magma')

plt.show()

In [None]:
# Per-class precision, recall and F1 for the decision tree predictions.
print(
    classification_report(y_true=y_test_array, y_pred=y_test_pred_dt)
)

# Creating a Random Forest Model

In [None]:

# Random forest with light cost-complexity pruning and a fixed seed.
rf_clf = RandomForestClassifier(
    ccp_alpha=0.0001,
    random_state=365,
)

In [None]:

# Fit the forest on the training arrays.
rf_clf.fit(X=x_train_array, y=y_train_array)

In [None]:

# Predict the test labels with the fitted random forest.
y_test_pred_rf = rf_clf.predict(X=x_test_array)

In [None]:

# Restore the default matplotlib styling, then plot the test-set confusion
# matrix for the random forest predictions.
sns.reset_orig()
ConfusionMatrixDisplay.from_predictions(y_true=y_test_array,
                                        y_pred=y_test_pred_rf,
                                        labels=rf_clf.classes_,
                                        cmap='magma')

plt.show()

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
print(
    classification_report(y_true=y_test_array, y_pred=y_test_pred_rf)
)