### 1.a.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Location of the raw CSV, relative to the notebook's working directory
DATASET_PATH = 'dataset.csv'

# Read the file into a DataFrame and preview its first five rows
df = pd.read_csv(DATASET_PATH)
df.head(n=5)

Unnamed: 0,id,age,income,occupation,region,education_level,satisfaction
0,1,34,52345.67,Engineer,North,Master,High
1,2,27,31234.89,Teacher,South,Bachelor,Medium
2,3,45,67890.12,Doctor,West,PhD,Very High
3,4,19,24567.34,Student,East,High School,Low
4,5,52,78901.45,Manager,North,Bachelor,Medium


### 1.b.

In [3]:
from sklearn.preprocessing import StandardScaler

# Replace the two numeric columns with their StandardScaler-transformed
# values; the scaler is fitted on these same columns of df.
numeric_columns = ['age', 'income']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df.head()

Unnamed: 0,id,age,income,occupation,region,education_level,satisfaction
0,1,-0.384233,-0.149155,Engineer,North,Master,High
1,2,-1.237335,-1.357507,Teacher,South,Bachelor,Medium
2,3,0.956357,0.740588,Doctor,West,PhD,Very High
3,4,-2.212309,-1.739148,Student,East,High School,Low
4,5,1.809459,1.370862,Manager,North,Bachelor,Medium


### 1.c.

In [4]:
# List the distinct values of each categorical column, one line per column
categorical_columns = ['occupation', 'region', 'education_level', 'satisfaction']
for column in categorical_columns:
    print(f"Distinct values in '{column}':", df[column].unique())

Distinct values in 'occupation': ['Engineer' 'Teacher' 'Doctor' 'Student' 'Manager']
Distinct values in 'region': ['North' 'South' 'West' 'East']
Distinct values in 'education_level': ['Master' 'Bachelor' 'PhD' 'High School']
Distinct values in 'satisfaction': ['High' 'Medium' 'Very High' 'Low']


In [5]:
# Debugging aid: display the current column labels of df.
# This expression only reads df; it does not modify anything.
df.columns

Index(['id', 'age', 'income', 'occupation', 'region', 'education_level',
       'satisfaction'],
      dtype='object')

In [6]:
# 'occupation' and 'region' are replaced by one indicator column per
# distinct value (one-hot encoding).
df = pd.get_dummies(df, columns=['occupation', 'region'])

# 'education_level' and 'satisfaction' are label-encoded: each label is
# mapped to its position (0, 1, 2, 3) in the ordered lists below.
education_mapping = {
    label: rank
    for rank, label in enumerate(['High School', 'Bachelor', 'Master', 'PhD'])
}
satisfaction_mapping = {
    label: rank
    for rank, label in enumerate(['Low', 'Medium', 'High', 'Very High'])
}

# Overwrite both columns with their integer codes; existing columns keep
# their position in the frame.
df = df.assign(
    education_level=df['education_level'].map(education_mapping),
    satisfaction=df['satisfaction'].map(satisfaction_mapping),
)
df.head()

Unnamed: 0,id,age,income,education_level,satisfaction,occupation_Doctor,occupation_Engineer,occupation_Manager,occupation_Student,occupation_Teacher,region_East,region_North,region_South,region_West
0,1,-0.384233,-0.149155,2,2,False,True,False,False,False,False,True,False,False
1,2,-1.237335,-1.357507,1,1,False,False,False,False,True,False,False,True,False
2,3,0.956357,0.740588,3,3,True,False,False,False,False,False,False,False,True
3,4,-2.212309,-1.739148,0,0,False,False,False,True,False,True,False,False,False
4,5,1.809459,1.370862,1,1,False,False,True,False,False,False,True,False,False


In [7]:
# Cast every column except the scaled 'age' and 'income' to int, which
# turns the boolean indicator columns into 1 (True) / 0 (False).
int_dtypes = {
    name: int
    for name in df.columns
    if name not in ['age', 'income']
}
df = df.astype(int_dtypes)
df.head()

Unnamed: 0,id,age,income,education_level,satisfaction,occupation_Doctor,occupation_Engineer,occupation_Manager,occupation_Student,occupation_Teacher,region_East,region_North,region_South,region_West
0,1,-0.384233,-0.149155,2,2,0,1,0,0,0,0,1,0,0
1,2,-1.237335,-1.357507,1,1,0,0,0,0,1,0,0,1,0
2,3,0.956357,0.740588,3,3,1,0,0,0,0,0,0,0,1
3,4,-2.212309,-1.739148,0,0,0,0,0,1,0,1,0,0,0
4,5,1.809459,1.370862,1,1,0,0,1,0,0,0,1,0,0


### 1.d.

In [8]:
def find_closest_centroids(X, centroids):
    """
    Assign every example to its nearest centroid.

    Args:
        X (ndarray): (m, n) data points, one example per row
        centroids (ndarray): K centroids, one per row

    Returns:
        idx (ndarray): (m,) integer array; idx[i] is the row index in
            `centroids` of the centroid with the smallest Euclidean
            distance to X[i]. On ties the lowest index is chosen
            (np.argmin returns the first minimum).
    """
    K = centroids.shape[0]
    m = X.shape[0]

    idx = np.zeros(m, dtype=int)

    for row in range(m):
        # Euclidean distance from this example to each of the K centroids
        dists = [np.linalg.norm(X[row] - centroids[c]) for c in range(K)]
        idx[row] = np.argmin(dists)

    return idx

In [9]:
def compute_centroids(X, idx, K, previous_centroids=None):
    """
    Returns the new centroids by computing the means of the
    data points assigned to each centroid.

    Args:
        X (ndarray):   (m, n) Data points
        idx (ndarray): (m,) Array containing index of closest centroid for each
                       example in X. Concretely, idx[i] contains the index of
                       the centroid closest to example i
        K (int):       number of centroids
        previous_centroids (ndarray, optional): (K, n) centroids from the
                       previous iteration. When given, a centroid that has
                       no points assigned keeps its previous position.

    Returns:
        centroids (ndarray): (K, n) New centroids computed. A centroid with
                       no assigned points is set to `previous_centroids[k]`
                       if that argument was supplied, otherwise to NaN.
    """

    # Unpacking into two names keeps the requirement that X is 2-D.
    _, n = X.shape
    idx = np.asarray(idx)

    centroids = np.zeros((K, n))

    for k in range(K):
        points = X[idx == k]
        if points.shape[0] > 0:
            centroids[k] = np.mean(points, axis=0)
        elif previous_centroids is not None:
            # Empty cluster: keep the centroid where it was.
            centroids[k] = previous_centroids[k]
        else:
            # Empty cluster and no fallback: mark the row as undefined
            # explicitly, without taking the mean of an empty slice
            # (which would emit a RuntimeWarning).
            centroids[k] = np.nan

    return centroids

In [10]:
def kMeans_init_centroids(X, K, random_state=None):
    """
    This function initializes K centroids that are to be
    used in K-Means on the dataset X, by picking K distinct
    rows of X at random.

    Args:
        X (ndarray): Data points, one example per row
        K (int):     number of centroids/clusters
        random_state (optional): seed (or numpy Generator) passed to
                     np.random.default_rng for a reproducible choice.
                     When None, numpy's global random state is used,
                     so np.random.seed(...) still controls the result.

    Returns:
        centroids (ndarray): Initialized centroids (K rows of X)

    Raises:
        ValueError: if K is negative or larger than the number of rows in X
    """

    m = X.shape[0]
    if not 0 <= K <= m:
        # Slicing the permutation would otherwise silently return the
        # wrong number of centroids.
        raise ValueError(
            "K must be between 0 and the number of examples (%d), got %d" % (m, K)
        )

    # Randomly reorder the indices of examples
    if random_state is None:
        randidx = np.random.permutation(m)
    else:
        randidx = np.random.default_rng(random_state).permutation(m)

    # Take the first K examples as centroids
    centroids = X[randidx[:K]]

    return centroids

In [11]:
def run_kMeans(X, initial_centroids, max_iters=10):
    """
    Runs the K-Means algorithm on data matrix X, where each row of X
    is a single example

    Args:
        X (ndarray): (m, n) data points
        initial_centroids (ndarray): (K, n) starting centroids; the number
            of rows determines K
        max_iters (int): number of assignment/update iterations to run

    Returns:
        centroids (ndarray): (K, n) centroids after the last iteration
            (`initial_centroids` itself when max_iters is 0)
        idx (ndarray): (m,) index of the closest centroid for each example,
            as computed in the last iteration (all zeros when max_iters is 0)
    """

    # Initialize values (unpacking into two names keeps the 2-D requirement on X)
    m, _ = X.shape
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    # Integer placeholder so the dtype matches what find_closest_centroids returns
    idx = np.zeros(m, dtype=int)

    # Run K-Means
    for i in range(max_iters):

        # Output progress
        print("K-Means iteration %d/%d" % (i, max_iters-1))

        # For each example in X, assign it to the closest centroid
        idx = find_closest_centroids(X, centroids)

        # Given the memberships, compute new centroids
        new_centroids = compute_centroids(X, idx, K)

        # The mean of an empty cluster is NaN, and np.argmin would then
        # select that NaN centroid for every example on the next pass.
        # Any recomputed centroid containing NaN keeps its previous position.
        undefined = np.isnan(new_centroids).any(axis=1)
        if undefined.any():
            new_centroids[undefined] = centroids[undefined]

        centroids = new_centroids

    return centroids, idx