## Grouping students based on their performance using K Means Clustering

In [700]:
# import all the required libraries
import numpy as np
import pandas as pd
import random

# Setting a seed
random.seed(42)

In [701]:
# obtain the raw csv 
data = "../assets/data/student_marksheet.csv"
marksheet_df = pd.read_csv(data)

In [702]:
print(marksheet_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       250 non-null    int64 
 1   Name     250 non-null    object
 2   Gender   250 non-null    object
 3   Age      250 non-null    int64 
 4   Section  250 non-null    object
 5   Science  250 non-null    int64 
 6   English  250 non-null    int64 
 7   History  250 non-null    int64 
 8   Maths    250 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 17.7+ KB
None


In [703]:
print(marksheet_df.head())

   id     Name  Gender  Age Section  Science  English  History  Maths
0   1  Bronnie  Female   13       C       21       81       62     49
1   2   Lemmie    Male   15       B       29       41       17     40
2   3    Danya  Female   14       C       12       87       16     96
3   4    Denna  Female   14       B       15       53       82     33
4   5  Jocelin    Male   14       A       43        6        3     21


In [704]:
# Identifying categorical features
print(marksheet_df["Gender"].unique())
print(marksheet_df["Section"].unique())
print(marksheet_df["Age"].unique())

['Female' 'Male']
['C' 'B' 'A']
[13 15 14]


In [None]:
# Removing gender and sec column because it will not be used to group students
marksheet_df.drop(["Gender"], axis=1, inplace=True)

In [692]:
marksheet_df["Gender"].replace({"Female": 0, "Male": 1}, inplace=True)
marksheet_df["Section"].replace({"A": 0, "B": 1, "C": 2}, inplace=True)
marksheet_df["Age"].replace({"13": 0, "14": 1, "15": 2}, inplace=True)

In [693]:
print(marksheet_df.shape)
print(marksheet_df.head())

(250, 8)
   id     Name  Age  Section  Science  English  History  Maths
0   1  Bronnie   13        2       21       81       62     49
1   2   Lemmie   15        1       29       41       17     40
2   3    Danya   14        2       12       87       16     96
3   4    Denna   14        1       15       53       82     33
4   5  Jocelin   14        0       43        6        3     21


In [694]:
# # Normalizing the data
# normalizable_cols = ["Age", "Section", "Science", "English", "History", "Maths"]
# X_mean = []
# X_std = []

# for col in normalizable_cols:
#     mean = np.mean(marksheet_df[col])
#     std = np.std(marksheet_df[col])

#     marksheet_df[col] = (marksheet_df[col] - mean)/std

#     X_mean.append(mean)
#     X_std.append(std)

#     print(f'Mean value of column {col}: {mean}')
#     print(f'Standard deviation of column {col}: {std}')

In [695]:
marksheet_df.head()

Unnamed: 0,id,Name,Age,Section,Science,English,History,Maths
0,1,Bronnie,13,2,21,81,62,49
1,2,Lemmie,15,1,29,41,17,40
2,3,Danya,14,2,12,87,16,96
3,4,Denna,14,1,15,53,82,33
4,5,Jocelin,14,0,43,6,3,21


In [696]:
# In this scenario, we aim to create groups of students according to their performance.
K = 50

In [697]:
# Initialize the centroids randomly
centroids = set()

while(len(centroids) != K):
    random_index = random.randint(0, len(marksheet_df)-1)
    z = tuple(marksheet_df.iloc[random_index, -6:])
    centroids.add(z)

# Convert the set to a list
centroids = list(centroids)

# print("Initial Centroids: ")
# for i in range(K):
#     print(centroids[i])

Initial Centroids: 
(13, 0, 7, 29, 30, 23)
(14, 0, 50, 22, 50, 78)
(13, 0, 17, 99, 29, 85)
(14, 2, 22, 26, 1, 28)
(14, 2, 58, 60, 7, 90)
(13, 1, 55, 99, 8, 72)
(14, 0, 82, 4, 56, 22)
(14, 2, 95, 56, 4, 100)
(13, 0, 76, 96, 67, 17)
(15, 2, 54, 89, 59, 88)
(13, 1, 81, 25, 99, 13)
(15, 1, 34, 74, 83, 81)
(15, 0, 22, 86, 9, 38)
(15, 0, 37, 57, 48, 75)
(15, 2, 95, 32, 91, 48)
(14, 2, 72, 51, 50, 11)
(14, 1, 61, 89, 70, 68)
(13, 1, 1, 4, 68, 65)
(13, 2, 36, 95, 55, 46)
(14, 2, 12, 87, 16, 96)
(15, 1, 43, 4, 40, 93)
(13, 0, 81, 45, 22, 36)
(14, 1, 53, 53, 49, 43)
(15, 0, 16, 59, 18, 64)
(15, 2, 91, 33, 71, 81)
(14, 2, 38, 66, 53, 32)
(15, 1, 16, 4, 49, 1)
(15, 0, 66, 20, 76, 30)
(14, 0, 80, 34, 85, 71)
(15, 0, 1, 90, 42, 94)
(14, 0, 64, 1, 74, 6)
(15, 2, 52, 30, 14, 24)
(15, 0, 48, 2, 63, 91)
(14, 2, 59, 94, 94, 47)
(13, 2, 31, 9, 79, 54)
(14, 1, 65, 32, 67, 35)
(15, 2, 13, 35, 71, 30)
(15, 1, 44, 47, 43, 55)
(14, 0, 40, 61, 63, 80)
(15, 2, 3, 49, 67, 96)
(14, 0, 17, 87, 16, 50)
(15, 2, 84, 8

In [698]:
def fit(K, marksheet_df, centroids, num_iterations=50):
    # Initialize clusters
    clusters = [[] for _ in range(K)]
    prev_clusters = [[] for _ in range(K)]

    for m in range(num_iterations):
        # Reset clusters
        clusters = [[] for _ in range(K)]
        
        # Assign each data point to the closest centroid
        for i in range(len(marksheet_df)):
            vec = np.array(marksheet_df.iloc[i, -6:])
            min_dist = float("inf")
            idx = 0
            for j in range(len(centroids)):
                dist = np.sum(np.abs(centroids[j] - vec))
                if dist < min_dist:
                    min_dist = dist
                    idx = j
            clusters[idx].append(i)  # Append index of data point
        
        # Update centroids
        for i in range(len(clusters)):
            if clusters[i]:
                cluster_points = marksheet_df.iloc[clusters[i], -6:]
                centroids[i] = np.mean(cluster_points, axis=0)

        if (m % 10 == 0):
            print(f'Completed iteration {m}')

        # Check for convergence
        if np.array_equal(prev_clusters, clusters):
            break
        else:
            prev_clusters = clusters.copy()

    return clusters, centroids

In [699]:
clusters, centroids = fit(K, marksheet_df, centroids)

Completed iteration 0
Completed iteration 10


KeyboardInterrupt: 

In [None]:
# Print all groups
for i in range(len(clusters)):
    print(f'Cluster {i+1}: ')
    for j in range(len(clusters[i])):
        vec = marksheet_df.iloc[clusters[i][j]]
        print(f'{vec[1]}: {vec[2]} {vec[3]} {vec[4]} {vec[5]} {vec[6]} {vec[7]}')
    print("")

Cluster 1: 
Danya: 14 2 12 87 16 96
Gracie: 13 1 93 38 39 93
Benoite: 14 1 3 55 14 90
Livia: 13 2 95 65 25 91
Richie: 15 1 14 83 9 90
Josh: 14 0 54 23 19 94
Brandie: 15 1 43 2 17 78
Everett: 14 0 84 57 25 57
Rollie: 13 0 17 99 29 85
Yardley: 14 2 41 37 26 65
Fanny: 13 2 49 90 13 98
Shannon: 15 1 78 74 31 85
Christan: 14 2 85 6 10 85
Lorrie: 13 0 72 78 40 78
Agace: 15 2 17 75 8 71
Sheffie: 15 2 44 55 51 100
Adolphus: 13 0 19 98 13 86
Silvia: 14 0 40 61 63 80
Woodie: 15 0 37 57 48 75
Rina: 14 2 57 68 13 74
Virgie: 14 1 56 75 5 40
Kath: 14 0 63 74 46 76
Orin: 15 1 70 75 21 99
Fredric: 14 1 68 20 21 81
Alis: 15 0 76 53 32 69
Sarah: 13 1 55 99 8 72
Alard: 13 0 91 12 15 70
Ilaire: 14 2 95 56 4 100
Desmund: 14 0 86 92 8 62
Davidde: 13 2 98 100 23 84
Goldarina: 14 0 27 11 18 92
Ruperto: 14 1 60 55 30 83
Nita: 14 0 92 57 31 75
Baudoin: 14 2 89 59 33 91
Zack: 13 1 72 46 40 90
Buffy: 14 2 58 60 7 90
Tammy: 13 1 81 63 16 62
Stillman: 15 1 50 70 31 98
Ashlin: 14 0 49 47 19 55

Cluster 2: 
Jocelin: 