# Student Performance in E-Learning Courses

## Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
import category_encoders as ce

## Loading the Dataset

In [None]:
# Load the e-learning dataset from 'elearn.csv' (path is relative to the working directory).
viewData = pd.read_csv('elearn.csv')

In [None]:
# Show the loaded DataFrame (a bare last expression is rendered as the cell output).
viewData

In [None]:
# Discard every row that has at least one missing value.
viewData = viewData.dropna()

## Encoding Values

In [None]:
# List the distinct raw labels found in the target (GPA) column before encoding.
viewData['Your cumulative average (GPA)'].unique()

In [None]:
# Ordinal codes for the GPA-band labels (0 = lowest band, 4 = highest).
# Several spellings of the same band are listed so that they share one code.
# NOTE(review): '70-79 / 2.5-299' looks like a typo for '2.5-2.99' — confirm it
# matches the label actually present in the data.
gpa_codes = {
    'Below 60 / Below 2.0': 0,
    '60-69 / 2-2.49': 1,
    '60-69 / 2-2.9': 1,
    '70-79 / 2.5-299': 2,
    '80-89 / 3-3.49': 3,
    '+90 / +3.5': 4,
    '60-69': 1,
    '80-89': 3,
    '+90': 4,
    '70-79': 2,
    'Below 60': 0,
}

# Only the GPA column is encoded; every other column passes through unchanged.
encoder = ce.OrdinalEncoder(
    cols=['Your cumulative average (GPA)'],
    return_df=True,
    mapping=[{'col': 'Your cumulative average (GPA)', 'mapping': gpa_codes}],
)

In [None]:
# Fit the encoder and replace the GPA labels in viewData with their ordinal codes.
viewData = encoder.fit_transform(viewData)

In [None]:
# Inspect the distinct values of 'Gender' before encoding it.
viewData['Gender'].unique()

In [None]:
# Encode 'Gender' as a binary 'Male' indicator (Male -> 1, Female -> 0) and
# drop the original text column. Any other value becomes NaN via Series.map.
gender_codes = {'Male': 1, 'Female': 0}
viewData['Male'] = viewData['Gender'].map(gender_codes)
viewData = viewData.drop(columns=['Gender'])

In [None]:
# Inspect the distinct values of 'Level/Year' before normalising them.
viewData['Level/Year'].unique()

In [None]:
# Normalise the short 'Level/Year' labels to their long forms (e.g. 'Third/Junior').
# The nested dict limits the replacement to the 'Level/Year' column, so cells in
# other columns that happen to contain 'First', 'Second', ... are left untouched.
roles = {
    'Fourth': 'Fourth/Senior',
    'Third': 'Third/Junior',
    'Second': 'Second/ Sophomore',
    'First': 'First/Freshman',
}
viewData = viewData.replace({'Level/Year': roles})

In [None]:
# One-hot encode 'Level/Year'. Cast to int so the indicators are 0/1, matching
# the 'before_'/'after_' dummy columns (recent pandas returns booleans by default).
dummy = pd.get_dummies(data=viewData['Level/Year']).astype(int)

In [None]:
# Append the 'Level/Year' indicator columns, then drop the original column.
viewData = pd.concat([viewData, dummy], axis=1).drop(columns=['Level/Year'])

In [None]:
# Inspect the distinct values of 'Age' before one-hot encoding it.
viewData['Age'].unique()

In [None]:
# One-hot encode 'Age' and drop the original column. Cast to int so the
# indicators are 0/1, matching the 'before_'/'after_' dummy columns
# (recent pandas returns booleans by default).
ageDummy = pd.get_dummies(data=viewData['Age']).astype(int)
viewData = pd.concat([viewData, ageDummy], axis=1)
viewData = viewData.drop(['Age'], axis=1)

In [None]:
# Inspect the distinct answers to the pre-COVID digital-tools question.
viewData['Before COVID-19: Which of the following digital tools do you usually use?'].unique()

In [None]:
# One-hot encode the pre-COVID digital-tools answer as integer 'before_*'
# columns, then drop the original question column.
before_tools_col = 'Before COVID-19: Which of the following digital tools do you usually use?'
beforeDummy = pd.get_dummies(viewData[before_tools_col], prefix='before').astype(int)
viewData = pd.concat([viewData, beforeDummy], axis=1).drop(columns=before_tools_col)

In [None]:
# One-hot encode the post-COVID digital-tools answer as integer 'after_*'
# columns, then drop the original question column.
after_tools_col = 'After COVID-19: Which of the following digital tools do you usually use?'
afterDummy = pd.get_dummies(viewData[after_tools_col], prefix='after').astype(int)
viewData = pd.concat([viewData, afterDummy], axis=1).drop(columns=[after_tools_col])

In [None]:
# Show the DataFrame in its current state (rendered as the cell output).
viewData

In [None]:
# Inspect the distinct time-spent answers; these are the labels that `timeval` encodes.
viewData['Before COVID-19: How much time do you spend using the digital tools in learning?'].unique()

In [None]:
# Inspect the distinct answers of one opinion question; these are the labels that `opinval` encodes.
viewData['The distance learning system, caused by the COVID-19 epidemic, resulted in social distancing.'].unique()

In [None]:
# Numeric codes for the agreement-scale answers. Spelling variants of the same
# answer (different capitalisation, trailing tab) share one code.
opinval = {
    'Strongly Disagree': 0, 'Strongly disagree': 0,
    'Disagree': 1,
    'Uncertain': 2, 'Uncertain\t': 2,
    'Agree': 3, 'Agree\t': 3,
    'Strongly Agree': 4, 'Strongly agree': 4,
}

# Numeric codes for the time-spent ranges; '1-3' also occurs with a trailing space.
timeval = {
    '1-3': 0, '1-3 ': 0,
    '3-6': 1,
    '6-9': 2,
    '9-12': 3,
    '+12': 4,
}

In [None]:
# Swap the opinion answers and the time-spent ranges for their numeric codes
# wherever they occur in the frame.
viewData = viewData.replace(opinval).replace(timeval)

In [None]:
# Drop the rows that still contain a missing value, then preview the first rows.
viewData = viewData.dropna()
viewData.head()

# Data Analysis

In [None]:
# Pie chart of how often each (encoded) time-spent category occurs before COVID-19.
before_time_col = 'Before COVID-19: How much time do you spend using the digital tools in learning?'
plt.figure(figsize=(15, 15))
grouped_data = viewData[before_time_col].value_counts()
grouped_data.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    title='Before COVID-19: Time spent using digital tools in learning',
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.legend(labels=grouped_data.index, loc='best')
plt.show()

In [None]:
# Pie chart of how often each (encoded) time-spent category occurs after COVID-19.
after_time_col = 'After COVID-19: How much time do you spend using the digital tools in learning?'
plt.figure(figsize=(8, 8))
grouped_data = viewData[after_time_col].value_counts()
grouped_data.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    title='After COVID-19: Time spent using digital tools in learning',
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.legend(labels=grouped_data.index, loc='best')
plt.show()

## Kernel Density Estimation (KDE)

In [None]:
# Second name for the current DataFrame, used by the plots below.
# This binds the same object (no copy is made).
testview = viewData

In [None]:
# KDE of the columns at positions 1 and 2, one subplot each, split by GPA class.
fig = plt.figure(figsize=[15, 5])
for i in range(2):
    fig.add_subplot(1, 2, i + 1)
    # columns[i + 1] skips position 0 — presumably the GPA target; confirm column order.
    sns.kdeplot(data=testview, x=testview.columns[i + 1], hue='Your cumulative average (GPA)')
    sns.despine()
# Arrange the layout once the subplots exist; on an empty figure there is
# nothing to arrange. (The former `if i == 16` x-limit branch could never run
# inside range(2) and has been removed.)
plt.tight_layout()

In [None]:
# Bar chart of the number of rows in each encoded GPA class, one colour per class.
custom_palette = sns.color_palette(['red', 'blue', 'green', 'purple', 'orange'])

plt.figure(figsize=(10, 10))

# The GPA column is passed as both x and hue so the palette is applied per bar;
# the legend that would otherwise duplicate the x axis is switched off.
gpa_col = 'Your cumulative average (GPA)'
gplot = sns.countplot(data=testview, x=gpa_col, hue=gpa_col, palette=custom_palette, legend=False)

# Write each bar's height slightly above its top edge.
for bar in gplot.patches:
    height = bar.get_height()
    gplot.annotate(
        f'{height:.1f}',
        (bar.get_x() + bar.get_width() / 2., height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.show()

## Dropping certain columns

In [None]:
# Drop two features chosen from their correlation coefficients with the target:
# the closer a coefficient is to 0, the weaker its linear relationship with GPA.
# Removing them is intended to reduce overfitting.
viewData = viewData.drop(columns=['after_Laptop', 'Third/Junior'])

In [None]:
# Remove any rows with missing values and display the resulting frame.
viewData = viewData.dropna()
viewData

## Correlation Coefficient / Regression Data

In [None]:
# Pairwise correlation matrix of every column.
viewDataCorr = viewData.corr()

# Names of the 10 columns with the largest (signed) correlation to GPA.
# The GPA column itself is among them, since it correlates perfectly with itself.
top_corr_features = viewDataCorr.nlargest(10, 'Your cumulative average (GPA)').index

# Correlation matrix restricted to those columns.
top_corr = viewData[top_corr_features].corr()

# Annotated heatmap of the restricted matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(top_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Top 10 Correlations with GPA')
plt.show()

In [None]:
# Pairwise correlation matrix of every column.
viewDataCorr = viewData.corr()

# Names of the 20 columns with the largest (signed) correlation to GPA.
# The GPA column itself is among them, since it correlates perfectly with itself.
top_corr_features = viewDataCorr.nlargest(20, 'Your cumulative average (GPA)').index

# Correlation matrix restricted to those 20 columns.
top_corr = viewData[top_corr_features].corr()

# Annotated heatmap of the restricted matrix.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(top_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Top 20 Correlations with GPA')
plt.show()

## Test and Train Data

In [None]:
# Keep only rows whose encoded GPA class is above 0. Class 0 (below 60) is
# excluded because, per the original note, it has only 4 rows.
gpa_above_zero = viewData['Your cumulative average (GPA)'] > 0
testData = viewData.loc[gpa_above_zero]
testData

In [None]:
# Split the dataset into the feature columns (X) and the target (y).
# The target is selected by name rather than by position so the split stays
# correct even if the column order changes. y is kept as a one-column
# DataFrame because later cells call y_train.values.ravel().
target_col = 'Your cumulative average (GPA)'
X = testData.drop(columns=[target_col])
y = testData[[target_col]]

In [None]:
# Split first, then oversample ONLY the training portion with SMOTE.
# Resampling before the split lets synthetic samples interpolated from test rows
# end up in the training set, which leaks test information and inflates every
# score reported below.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

# Train test split; stratify keeps every GPA class represented on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y
)

# SMOTE needs at least k_neighbors + 1 samples in the smallest class, so shrink
# k (default 5) when the training split leaves fewer than that.
# NOTE(review): a class with a single training row still cannot be oversampled.
smallest_class = int(np.unique(np.ravel(y_train), return_counts=True)[1].min())
oversample = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_train, y_train = oversample.fit_resample(X_train, y_train)

## Classification methods

### SVM (Support Vector Machine)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, wrapped in a one-vs-one
# multiclass scheme.
model = SVC()
ovo = OneVsOneClassifier(estimator=model)
# Flatten the labels to the 1-D array that fit() expects.
ovo.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Predict the GPA class of every test row with the fitted one-vs-one classifier.
prd=ovo.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# scikit-learn's signature is (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing them swapped prints the transposed matrix.
print(confusion_matrix(y_test, prd))

In [None]:
# Results may differ between machines, depending on the scikit-learn version
# and on any unseeded randomness earlier in the pipeline.
# Argument order is (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_test, prd))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree=DecisionTreeClassifier()

# Wrap the decision tree itself. Previously the SVC `model` was passed here, so
# this section silently re-trained and re-scored the SVM.
ovo = OneVsOneClassifier(dtree)
ovo.fit(X_train, y_train.values.ravel())

In [None]:
# Predict on the test split with the classifier fitted in the Decision Tree
# section and print the per-class metrics.
pred = ovo.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rd=RandomForestClassifier(n_estimators=200) # Can change value

# Wrap the random forest itself. Previously the SVC `model` was passed here, so
# this section silently re-trained and re-scored the SVM.
ovo = OneVsOneClassifier(rd)
ovo.fit(X_train, y_train.values.ravel())

In [None]:
# Predict on the test split with the classifier fitted in the Random Forest section.
rd_pred=ovo.predict(X_test)
# Argument order is (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_test, rd_pred))

### KNN (K-Nearest Neighbors)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that uses the 15 nearest training samples.
neigh = KNeighborsClassifier(n_neighbors=15)

In [None]:
# One-vs-one wrapper around the k-NN estimator, trained on the training split.
ovo = OneVsOneClassifier(estimator=neigh)
# Flatten the labels to the 1-D array that fit() expects.
ovo.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Predict on the test split with the fitted K-Nearest Neighbors classifier.
y_pred=ovo.predict(X_test)
# Argument order is (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(y_test, y_pred))