
# K-Means Algorithm: Parallel Implementation (MPI)

In [86]:
%%file exercise3.py

import numpy as np
import pandas as pd
from mpi4py import MPI
import matplotlib.pyplot as plt
from collections import Counter

# --- MPI topology -----------------------------------------------------------
# Rank 0 is treated as the master by the script below; every other rank is a worker.
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
workers = size - 1

# --- Clustering configuration -----------------------------------------------
K = 3                                        #<-----------------INITIALIZE NUMBER OF CLUSTERS
# Running sum of the workers' local centroid arrays (K rows x 20 columns).
gg = np.zeros((K, 20))

from sklearn.metrics import pairwise_distances_argmin

def find_clusters(X, n_clusters, rseed=2, max_iter=300): #SOURCECode= https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html
    """Cluster the rows of ``X`` with k-means and return the final centers.

    The algorithm alternates between assigning every row to its nearest
    center (Euclidean distance) and moving each center to the mean of the
    rows assigned to it, until the centers stop changing.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data to cluster, one sample per row.
    n_clusters : int
        Number of centers to fit; must satisfy ``1 <= n_clusters <= n_samples``.
    rseed : int, default 2
        Seed for the random choice of the initial centers.
    max_iter : int, default 300
        Upper bound on assignment/update rounds, so the loop always terminates.

    Returns
    -------
    ndarray of shape (n_clusters, n_features)
        Cluster centers as floats.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 1 or larger than the number of rows.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            "n_clusters must be between 1 and the number of samples "
            "(%d), got %r" % (n_samples, n_clusters)
        )

    # 1. Randomly choose distinct rows as the initial centers
    rng = np.random.RandomState(rseed)
    i = rng.permutation(n_samples)[:n_clusters]
    # Float copy so that means can be written into it without truncation.
    centers = X[i].astype(float)

    for _ in range(max_iter):
        # 2a. Assign labels based on closest center
        labels = pairwise_distances_argmin(X, centers, metric='euclidean')

        # 2b. Find new centers from means of points
        new_centers = centers.copy()
        for k in range(n_clusters):
            members = X[labels == k]
            # An empty cluster (e.g. duplicate initial centers) has no mean;
            # averaging it would yield NaN and the convergence test below
            # could never succeed. Keep the previous center instead.
            if len(members):
                new_centers[k] = members.mean(0)

        # 2c. Check for convergence
        if np.array_equal(centers, new_centers):
            break
        centers = new_centers

    return centers


if rank == 0:
    # ---- MASTER: load the data, scatter row chunks, average the local centroids ----
    if workers < 1:
        # With a single process there is nobody to cluster the data;
        # np.array_split would otherwise fail with an unrelated-looking error.
        raise RuntimeError(
            "exercise3.py needs at least 2 MPI processes (1 master + 1 worker); "
            "run e.g. 'mpiexec -n 4 python exercise3.py'"
        )

    data = pd.read_csv("Absenteeism_at_work.csv", sep=';')       #LOADING DATA
    data = data.drop(columns=['ID'])
    data1 = data.to_numpy(dtype=float)

    # One contiguous block of rows per worker; chunk i-1 goes to rank i.
    split = np.array_split(data1, workers, axis=0)
    for i in range(1, size):
        comm.send(split[i-1], dest=i)

    # Sum the (K x n_features) centroid arrays returned by the workers.
    # The accumulator is sized from the loaded data rather than a fixed column count.
    gg = np.zeros((K, data1.shape[1]))
    for i in range(1, size):
        A = comm.recv(source=i)
        gg = gg + np.asarray(A, dtype=float)

    # NOTE(review): each worker numbers its clusters independently, so
    # averaging row k across workers assumes row k denotes the same cluster
    # on every worker -- confirm this is acceptable for the exercise.
    pp = gg / workers                       #<----------Averages of the Local Centroids = Global Centroids
    Centroid_Attributes = pd.DataFrame(pp, columns=data.columns)
    pd.set_option("display.max_columns", 500)
    print('\nGlobal Centroid features calculated at MASTER are\n',Centroid_Attributes)

else:
    # ---- WORKER: cluster the received chunk and return its local centroids ----
    A = comm.recv(source=0)
    cen = find_clusters(A, K)                #<---------- USING THE DEFINED FUNCTION
    comm.send(cen, dest=0)
    print('\nLocal Centroids sent from WORKER',rank, 'are',cen,'\n')
    

Overwriting exercise3.py


### Parallel execution: set the desired number of processes with `-n`

In [89]:
# Launch the script on 4 MPI processes: rank 0 acts as the master, ranks 1-3 as k-means workers.
!mpiexec -n 4 python exercise3.py  


Local Centroids sent from WORKER 2 are [[2.15072464e+01 6.60869565e+00 3.72463768e+00 2.47826087e+00
  1.77942029e+02 4.27101449e+01 1.54347826e+01 3.60000000e+01
  2.57550130e+02 9.39420290e+01 2.89855072e-02 1.42028986e+00
  1.73913043e-01 7.24637681e-01 1.30434783e-01 2.89855072e-02
  7.98260870e+01 1.70956522e+02 2.75072464e+01 5.28985507e+00]
 [2.01428571e+01 6.42857143e+00 3.57142857e+00 2.68571429e+00
  1.21171429e+02 1.16285714e+01 1.42285714e+01 4.30571429e+01
  2.62471886e+02 9.46285714e+01 5.71428571e-02 1.00000000e+00
  6.57142857e-01 5.71428571e-01 0.00000000e+00 0.00000000e+00
  9.13142857e+01 1.76971429e+02 2.92000000e+01 1.06571429e+01]
 [1.84195804e+01 7.24475524e+00 3.89510490e+00 2.37062937e+00
  2.70559441e+02 2.87692308e+01 1.20559441e+01 3.72377622e+01
  2.60338406e+02 9.34895105e+01 1.11888112e-01 1.27272727e+00
  1.50349650e+00 6.92307692e-01 6.99300699e-02 7.90209790e-01
  7.91118881e+01 1.70629371e+02 2.70489510e+01 6.69930070e+00]] 


Local Centroids sent fr