In [4]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

def euclidean_dist(a, b):
    return np.linalg.norm(a - b)

def single_linkage_dist(cluster_one, cluster_two):
    min_dist = float('inf')
    for pt_one in cluster_one:
        for pt_two in cluster_two:
            dist = euclidean_dist(pt_one, pt_two)
            if dist < min_dist:
                min_dist = dist
    return min_dist

def complete_linkage_dist(cluster_one, cluster_two):
    max_dist = 0.0
    for pt_one in cluster_one:
        for pt_two in cluster_two:
            dist = euclidean_dist(pt_one, pt_two)
            if dist > max_dist:
                max_dist = dist
    return max_dist

def average_linkage_dist(cluster_one, cluster_two):
    total_dist = 0.0
    for pt_one in cluster_one:
        for pt_two in cluster_two:
            total_dist += euclidean_dist(pt_one, pt_two)
    return total_dist / (len(cluster_one) * len(cluster_two))

def ward_linkage_dist(cluster_one, cluster_two):
    mean_cluster_one = np.mean(cluster_one, axis=0)
    mean_cluster_two = np.mean(cluster_two, axis=0)
    return euclidean_dist(mean_cluster_one, mean_cluster_two)

def calculate_dist_matrix(data, linkage_method):
    n_samples = len(data)
    dist_matrix = np.zeros((n_samples, n_samples))

    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            if linkage_method == 'single':
                dist_matrix[i, j] = single_linkage_dist(data[i], data[j])
            elif linkage_method == 'complete':
                dist_matrix[i, j] = complete_linkage_dist(data[i], data[j])
            elif linkage_method == 'average':
                dist_matrix[i, j] = average_linkage_dist(data[i], data[j])
            elif linkage_method == 'ward':
                dist_matrix[i, j] = ward_linkage_dist(data[i], data[j])

    dist_matrix += dist_matrix.T
    return dist_matrix

def hierarchical_clustering(data, linkage_method):
    n_samples = len(data)
    dist_matrix = calculate_dist_matrix(data, linkage_method)

    # Initialize clusters, each containing a single sample
    clusters = [[i] for i in range(n_samples)]
    active_indices = list(range(n_samples))

    while len(active_indices) > 1:
        min_dist = float('inf')
        merge_indices = (0, 0)

        for i in range(len(active_indices)):
            for j in range(i + 1, len(active_indices)):
                if min_dist > dist_matrix[active_indices[i], active_indices[j]]:
                    min_dist = dist_matrix[active_indices[i], active_indices[j]]
                    merge_indices = (i, j)

        i, j = merge_indices
        clusters[active_indices[i]].extend(clusters[active_indices[j]])
        del clusters[active_indices[j]]

        active_indices.remove(active_indices[j])

        new_row = []
        for k in range(len(active_indices) - 1):
            if linkage_method == 'single':
                new_row.append(single_linkage_dist(data[active_indices[k]], data[active_indices[i]]))
            elif linkage_method == 'complete':
                new_row.append(complete_linkage_dist(data[active_indices[k]], data[active_indices[i]]))
            elif linkage_method == 'average':
                new_row.append(average_linkage_dist(data[active_indices[k]], data[active_indices[i]]))
            elif linkage_method == 'ward':
                new_row.append(ward_linkage_dist(data[active_indices[k]], data[active_indices[i]]))
        
        new_row.append(0)  # Add the distance between the merged clusters

        dist_matrix = np.delete(dist_matrix, j, axis=0)
        dist_matrix = np.delete(dist_matrix, j, axis=0)

        dist_matrix = np.insert(dist_matrix, i, new_row, axis=0)
        dist_matrix = np.insert(dist_matrix, i, np.array(new_row + [0]), axis=1)

    if len(clusters) == 1:
        linkage_matrix = np.array(clusters[0], dtype=float).reshape((1, n_samples))
    else:
        linkage_matrix = dist_matrix

    return linkage_matrix

def plot_dendrogram(linkage_matrix):
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix.T)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Sample Index')
    plt.ylabel('Distance')
    plt.show()

if __name__ == "__main__":
    np.random.seed(42)
    n_samples = 10
    n_features = 2
    data = np.random.rand(n_samples, n_features)
    linkage_matrix = hierarchical_clustering(data, 'single')
    plot_dendrogram(linkage_matrix)

ValueError: could not broadcast input array from shape (1,9) into shape (1,10)

In [47]:
import pandas as pd
dictone = {'student':['A','B','A','C','D','C','E','H'], 'subject':['M','M','FM','M','M','FM','FM','G'],'grade':[2,3.5,4,5,6,7,8,11]}
df = pd.DataFrame(dictone)
df

Unnamed: 0,student,subject,grade
0,A,M,2.0
1,B,M,3.5
2,A,FM,4.0
3,C,M,5.0
4,D,M,6.0
5,C,FM,7.0
6,E,FM,8.0
7,H,G,11.0


In [50]:
df[df['subject'].isin(['M','FM'])].groupby('student')['grade'].sum().reset_index()

Unnamed: 0,student,grade
0,A,6.0
1,B,3.5
2,C,12.0
3,D,6.0
4,E,8.0


In [46]:
df[df.groupby('student')['grade'].transform('count')]

Unnamed: 0,student,subject,grade
0,A,M,2.0
2,A,FM,4.0
3,C,M,5.0
5,C,FM,7.0


In [46]:
treshold = 1
df = df[df.groupby('student')['grade'].transform('count') > treshold]
df

Unnamed: 0,student,subject,grade
0,A,M,2.0
2,A,FM,4.0
3,C,M,5.0
5,C,FM,7.0


In [45]:
df.groupby('student')['grade'].sum().reset_index()

Unnamed: 0,student,grade
0,A,6.0
1,C,12.0
