

# > **Importing necessary dependencies**



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import requests

In [2]:
# Lift pandas' display limits so frames are rendered with every column and the
# full contents of each cell instead of being truncated.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

**Loading the dataset**

In [3]:
# Download the dataset as JSON and load it into a DataFrame.
url='https://api.jsonserve.com/XgAgFJ'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error body as data.
response.raise_for_status()
data = response.json()
df = pd.DataFrame(data)

**Eliminating the features that are not relevant to analysing student performance**

In [4]:
# Identifier, bookkeeping-timestamp and raw-response columns are not used by
# the analysis below, so remove them up front.
unused_columns = [
    'id', 'user_id', 'quiz_id',
    'submitted_at', 'created_at', 'updated_at',
    'type', 'response_map',
]
df = df.drop(columns=unused_columns)

# **FEATURE EXTRACTION**

**Converting the start and end times into the time taken by the student to complete the test, and creating a new feature for the time consumed as a percentage of the total time**

In [5]:
# NOTE(review): `datetime` is not used in this cell; the import is kept in case
# other cells rely on it.
from datetime import datetime

# Total time available for the test, in minutes. Used as the denominator when
# expressing the time a student spent as a percentage.
QUIZ_DURATION_MIN = 15

# Parse the timestamps; values that cannot be parsed become NaT (and therefore
# a NaN `time_taken`) instead of raising.
started = pd.to_datetime(df['started_at'], errors='coerce')
ended = pd.to_datetime(df['ended_at'], errors='coerce')

# Elapsed time in minutes between start and end of the attempt.
elapsed_min = (ended - started).dt.total_seconds() / 60

# Elapsed time as a percentage of the total time.
df['time_taken'] = (elapsed_min / QUIZ_DURATION_MIN) * 100

# The raw timestamps are no longer needed once `time_taken` exists.
# NOTE(review): `duration` is dropped without being used; if it holds the
# per-quiz time limit it should presumably replace QUIZ_DURATION_MIN — confirm.
df = df.drop(columns=['started_at', 'ended_at', 'duration'])

**The given accuracy is not relevant to the final score of the students, so the accuracy column is recomputed using the final score and score details provided**

In [6]:
# Discard the accuracy supplied with the data and derive it again as
# final_score / score, expressed as a percentage.
df = df.drop(columns=['accuracy'])

# final_score is parsed to a nullable integer; unparseable values become <NA>.
final_score_int = pd.to_numeric(df['final_score'], errors='coerce').astype('Int64')
df['accuracy'] = final_score_int.div(df['score']).mul(100)

# The two score columns are only needed to build `accuracy`.
df = df.drop(columns=['final_score', 'score'])

**The rank is converted to numerical format**

In [7]:
# Take the run of digits/hyphens that follows '#' in rank_text, cast it to int
# and use its absolute value as the numeric rank.
rank_digits = df['rank_text'].str.extract(r'#([-\d]+)', expand=False)
df['rank'] = rank_digits.astype(int).abs()
df = df.drop(columns=['rank_text'])

In [8]:
def _quiz_field(quiz, key):
    """Return ``quiz[key]`` when ``quiz`` is a dict, otherwise ``None``."""
    if isinstance(quiz, dict):
        return quiz[key]
    return None


# Flatten the nested `quiz` record into plain `title` and `topic` columns,
# then discard the nested column.
df['title'] = df['quiz'].apply(_quiz_field, args=('title',))
df['topic'] = df['quiz'].apply(_quiz_field, args=('topic',))
df = df.drop(columns=['quiz'])

**Processing model for weak-area analysis of students**

In [9]:
from sklearn.cluster import KMeans

# Answer-count features used to group students by weak areas. `.copy()` makes
# this an independent frame, so adding the `cluster` column below does not
# write into a slice of `df`.
weak_areas = df[['correct_answers', 'incorrect_answers', 'mistakes_corrected']].copy()

# Initialize and fit KMeans; the labels are computed from the three feature
# columns before `cluster` is attached to the frame.
kmeans_weak = KMeans(n_clusters=3, random_state=42)
weak_areas['cluster'] = kmeans_weak.fit_predict(weak_areas)

# Analyze clusters: per-cluster mean of every feature.
print(weak_areas.groupby('cluster').mean())


         correct_answers  incorrect_answers  mistakes_corrected
cluster                                                        
0               8.333333               12.0            0.000000
1              15.400000                0.8            1.600000
2              28.000000                2.0           10.666667


**Processing model for areas-of-improvement analysis of students**

In [10]:
from sklearn.mixture import GaussianMixture

# Accuracy and rank are the features used to group students by room for
# improvement. `.copy()` makes this an independent frame, so adding the
# `cluster` column below does not write into a slice of `df`.
improvements = df[['accuracy', 'rank']].copy()

# Fit Gaussian Mixture Model; the labels are computed from the two feature
# columns before `cluster` is attached to the frame.
gmm_improvements = GaussianMixture(n_components=3, random_state=42)
improvements['cluster'] = gmm_improvements.fit_predict(improvements)

# Analyze clusters: per-cluster mean of every feature.
print(improvements.groupby('cluster').mean())

          accuracy         rank
cluster                        
0        72.261905  2051.000000
1        98.381696  7149.500000
2        87.889282   296.666667


**Processing model for finding performance gaps of students**

In [11]:
# Features used to group students by performance gap. `.copy()` makes this an
# independent frame, so the column assignments below do not write into a slice
# of `df`.
performance_gaps = df[['negative_score', 'initial_mistake_count', 'total_questions', 'rank']].copy()

# negative_score is parsed to a nullable integer; unparseable values become <NA>.
performance_gaps['negative_score'] = pd.to_numeric(performance_gaps['negative_score'], errors='coerce').astype('Int64')

# Fit KMeans; the labels are computed from the four feature columns before
# `cluster` is attached to the frame.
kmeans_pg = KMeans(n_clusters=2, random_state=42)
performance_gaps['cluster'] = kmeans_pg.fit_predict(performance_gaps)

# Analyze clusters: per-cluster mean of every feature.
print(performance_gaps.groupby('cluster').mean())

         negative_score  initial_mistake_count  total_questions    rank
cluster                                                                
0                   7.7                    9.8             67.1  1524.7
1                  1.25                    6.0             26.5  7149.5


**Processing model for recommendations for students**

In [12]:
# Select features. `.copy()` makes this an independent frame, so adding the
# `cluster` column below does not write into a slice of `df`.
recommendations = df[['better_than', 'trophy_level']].copy()

# Fit a Gaussian Mixture Model; the labels are computed from the two feature
# columns before `cluster` is attached to the frame.
model_recommendations = GaussianMixture(n_components=3, random_state=42)
recommendations['cluster'] = model_recommendations.fit_predict(recommendations)

# Analyze clusters: per-cluster mean of every feature.
print(recommendations.groupby('cluster').mean())


         better_than  trophy_level
cluster                           
0              29.75          2.75
1             331.50          2.00
2             132.50          2.00


**Processing model for student personas**

In [13]:
# Features used to group students into personas. `.copy()` makes this an
# independent frame, so the column assignments below do not write into a slice
# of `df`.
student_persona = df[['speed', 'rank', 'source']].copy()

# Encode the categorical `source` column as integer category codes.
student_persona['source'] = student_persona['source'].astype('category').cat.codes
# speed is parsed to a nullable integer; unparseable values become <NA>.
student_persona['speed'] = pd.to_numeric(student_persona['speed'], errors='coerce').astype('Int64')

# Fit KMeans; the labels are computed from the three feature columns before
# `cluster` is attached to the frame.
kmeans_persona = KMeans(n_clusters=3, random_state=42)
student_persona['cluster'] = kmeans_persona.fit_predict(student_persona)

# Analyze clusters: per-cluster mean of every feature.
print(student_persona.groupby('cluster').mean())

             speed         rank    source
cluster                                  
0        92.857143  2051.000000  0.428571
1            95.25  7149.500000  0.250000
2            100.0   296.666667  0.333333


**Saving the trained models for deployment**

In [14]:
import pickle

# Fitted clustering models from the cells above, keyed by output file stem.
models = {
    "weak_areas_model": kmeans_weak,                 # KMeans (weak areas)
    "improvements_model": gmm_improvements,          # GaussianMixture (improvements)
    "performance_gaps_model": kmeans_pg,             # KMeans (performance gaps)
    "recommendations_model": model_recommendations,  # GaussianMixture (recommendations)
    "student_persona_model": kmeans_persona,         # KMeans (student persona)
}

# Serialize every model to "<name>.pt" in the current working directory.
# Despite the .pt extension, the files are written with pickle.dump.
for stem, fitted_model in models.items():
    with open(f"{stem}.pt", 'wb') as handle:
        pickle.dump(fitted_model, handle)

print("Models saved successfully in .pt format!")

Models saved successfully in .pt format!
