In [1]:
import pandas as pd
import numpy as np
""" Customer Segmentation using Similarity Measures
    CSE444 Ravan SADIGLI 20160807005 
    """
class customer_segmentation(object):

  def __init__(self,var1):
    self.var1 = var1

  """
  First, we need to clear the data. So we need to use the pandas library to read the dataset
  (Note that I only use 3rd libraries for data preprocessing, pandas to read the dataset,
  NumPy to convert to the array).  The below method provides us with ready-to-use data
  for calculating both Mahalanobis and Minkowski.
  """  
  def similarity_measure(self):
    # Firstly, we read .xlsx file using pandas library
    data_xls = pd.read_excel('CustomerGroups.xlsx', dtype=str, index_col=None)
    # Then converting .csv and reading .csv to use pandas dataFrame
    data_xls.to_csv('CustomerGroups.csv', encoding='utf-8', index=False)
    df = pd.read_csv("CustomerGroups.csv")
    # In the CustomerGroups.csv file, we have missing data, so we need to clear these data. 
    # To do this, we fill this missing data with the value nan and then drop these rows.
    df=df.replace(r'^\s*$', np.nan, regex=True).dropna()

    # Since, our dataset contains string values, we need to take care of that. 
    # We will convert these string values to integer or float numbers step by step.
    # By using the map() function, we convert string values to 0 or 1, respectively, for Gender column
    df["Gender"] = df["Gender"].map({'Male': 0, 'Female':1})
    # Converting Graduated column 1 or 0, respectively
    df["Graduated"] = df["Graduated"].map({'Yes': 1, 'No':0})
    # In Spending_score column, we have three values low, average and high. We convert these string to integers, low for 0, average 1, high 2 
    df["Spending_Score"] = df["Spending_Score"].map({'Low': 0, 'Average':1,'High':2})
    # We do the same thing for ever_married column
    df["Ever_Married"] = df["Ever_Married"].map({'Yes': 1, 'No':0})


    """
    In the profession, the column describes the job of customers and has a great
    impact on customer segmentation. We cannot do to transform what we did above.
    Because it is not something that is desirable or recommended. Since mapping 
    is not recommended for such a column, we need to apply one-hot encoding for 
    this column. So we create a column for each job, for example, if the client's
    job is engineer we put 1, otherwise we put 0. After applying one hot encoding 
    we have 16 columns (we didn't use the ID column for calculation because 
    it doesn't make sense for distance measurement).
    """
    for col in ["Profession"]:
      df[col] = df[col].astype('category')
    df = pd.get_dummies(data=df,columns=['Profession'])
    
    # Using get_customer_by_ID, we get data about the customers by using ID column
    customer1 = self.get_customer_by_ID(df,"ID",459264)
    customer2 = self.get_customer_by_ID(df,"ID",464608)

    # Converting dataFrame to integer type array 
    customer1np = (customer1.to_numpy()).astype(int)
    customer2np = (customer2.to_numpy()).astype(int)

    print("Minkowski distance: ",self.calculateMinkowski(customer1np[0],customer2np[0],df,4))
    print("Mahalonobis distance: ",self.calculateMahalonobis(customer1np[0],customer2np[0],df))


  def calculateMinkowski(self,customer1, customer2, df, R):
    # Formula: (sum of (customer1 - customer2)^R)^R
    # R - order of the norm
    sum = 0
    n = len(df.columns) # dimension

    for i in range (1,n):
      sum += abs(customer1[i] - customer2[i])**R

    # by using the above formula, it gives us the desired result
    return sum**(1/R)


  """
  To calculate mahalonobis distance, we need covariance matrix. We can achieve 
  this using our dataset.
  """
  def calculateMahalonobis(self,customer1, customer2, df):
    #Formula: (customer1 - customer2) * S ^-1 * (customer1 -customer2)^T
    # Using the formula, we need to calculate 3 things: customer points difference, and its transpose matrix,
    # and covariance matrix. We will do this step by step.

    if len(customer1) == len(customer2):
      d = len(customer1) - 2 # dimension
    else:
      print("ERROR OCCURED")
      return

    # initialization matrix for (customer1 - customer2).
    # it will create matrix 1x15 dimensions
    x_y = [[0 for i in range(d)] for j in range(1)]

    # (customer1 - customer2)
    for i in range (1,d):
      x_y[0][i] = customer1[i] - customer2[i]

    # transpose of customer1 - customer2 matrix with the 15x1 dimensions
    x_yTranspose = [[0 for i in range(1)] for j in range(d)]

    # (customer1 -customer2)^T
    for i in range(len(x_y)):
      for j in range(len(x_y[0])):
          x_yTranspose[j][i] = x_y[i][j]


    # Using the findDatasetCovariance, we can access covariance matrix with 15x15 matrix
    S = self.findDatasetCovariance(df)

    # Using the inverse method, we get inverse of the covariance matrix 
    S_inverse = self.inverse(S)

    # converting (customer1 - customer2), inverse of covariance matrix, and tranpose(customer1 - customer2)
    x_y = self.listToArray(x_y)
    x_yTranspose = self.listToArray(x_yTranspose)
    S_inverse =  self.listToArray(S_inverse)

    # variable for the holding multiplication of covariance matrix and
    # transpose of customers' point differences. it will have 1x15 dimensions
    tempMatrix = [[0 for i in range(1)] for j in range(d)]

    # nested loop for the  multiplication of covariance matrix and
    # transpose of customers' point differences. 15x15 dimension * 15x1 dimensions respectively
    for i in range(len(S_inverse[0])):
      for j in range(len(x_yTranspose[0][0])):
          for k in range(len(S_inverse[0])):
              tempMatrix[i][j] += S_inverse[0][i][k] * x_yTranspose[0][k][j]

    # multiplication of (customer1 - customer2) * tempMatrix. 1x15 dimension * 15x1 dimension
    # and, result of the multiplication will give scalar value. 
    finalResult = 0
    for i in range(0,d):
      finalResult += x_y[0][0][i] * tempMatrix[i][0]
    
    # (1x15) * (15x15) * (15x1) = scalar value = mahalonobis distance
    return finalResult**(1.0/2)

  """ This method provides us to generate covariance matrix like desribed below with 15x15 dimensions:

                      Gender      Age       Graduated     . . . . . .
  Gender              var(G,G)    cov(G,A)  cov(G,G)      . . . . . .
  Age                 cov(E,G)    var(E,A)  cov(E,G)      . . . . . .  
  Graduated           cov(G,G)    cov(G,A)  var(G,G)      . . . . . . 
  .                    .            .         .                 
  .                    .            .         .   
  .                    .            .         .
  Profession_Lawyer   cov(P,G)    cov(P,A)  cov(P,G)      . . . . . . 
  """
  def findDatasetCovariance(self, df):
    # the array that holds column names to access data  
    dataSetNames = ["Gender", "Ever_Married", "Age", "Graduated", "Work_Experience",
                    "Spending_Score","Family_Size","Profession_Artist", "Profession_Doctor",
                   "Profession_Engineer", "Profession_Entertainment", "Profession_Executive",
                    "Profession_Homemaker", "Profession_Lawyer", "Profession_Marketing"]

    #initialization of the covariance matrix
    cov_of_dataset = [[0 for i in range(len(dataSetNames))] for j in range(len(dataSetNames))]

    # Using the nested for loop, we create covariance matrix.
    for i in range(len(dataSetNames)):
      for j in range(len(dataSetNames)):
       cov_of_dataset[i][j] = self.cov((df[dataSetNames[i]].to_numpy()).astype(int)
       ,(df[dataSetNames[j]].to_numpy()).astype(int)) 

    return cov_of_dataset

  # This method calculate covariance for each attribute using covariance matrix formula
  def cov(self, a, b):
    # Formula: (sum of((column1 - mean of column1)* (column2 -mean of column2)))/n
      if len(a) != len(b):
          return

      # calculating mean for each column using method
      b_mean = self.mean(b)
      a_mean = self.mean(a)

      sum = 0

      # Implementation of the formula
      for i in range(0, len(a)):
          sum += ((a[i] - a_mean) * (b[i] - b_mean))

      return sum/(len(a))

  # finding mean of the dataset
  def mean(self, a):
    sum = 0 

    for i in range(0, len(a)):
      sum += a[i] 

    return sum/(len(a))

  # This methods provides us to calculate inverse of 15x15 
  # covariance matrix using Gauss Jordan method
  def inverse(self, covariance):

    n = len(covariance)
    # initializing to zeros for storing augmented matrix with 15x30 dimensions
    inverse = [[0 for i in range(2*len(covariance))] for j in range(len(covariance))]

    for i in range(n):
      for j in range(n):
        inverse[i][j] = covariance[i][j]

    # creating augmented matrix
    for i in range(n):
      for j in range(n):
        if i == j:
          inverse[i][j+n] = 1

    # Applying Gauss Jordan method
    for i in range(n):
      if inverse[i][i] == 0:
        print("ERROR OCCURED")
      for j in range(n):
        if i != j:
          ratio = inverse[j][i]/inverse[i][i]
          for k in range(2*n):
            inverse[j][k] = inverse[j][k] - ratio * inverse[i][k]

    # Row operation to make diagonal element to 1
    for i in range(n):
      divisor = inverse[i][i]
      for j in range(n,2*n):
        inverse[i][j] = inverse[i][j]/divisor
    inverseList = []

    # converting to list
    for i in range(n):
        for j in range(n, 2*n):
          inverseList.append(inverse[i][j])

    # converting list to array using listToMatrix method
    inverseMatrix = self.listToArray(inverseList)
    return inverseMatrix

  def listToArray(self, a):
    m = []
    while a != []:
        m.append(a[:15])
        a = a[15:]
    return m

  # gettting customer's data from the csv file 
  def get_customer_by_ID(self, df, col, val):
    return df[df[col]==val]

In [7]:
hw = customer_segmentation(0)
hw.similarity_measure()

Minkowski distance:  7.0021855651307225
Mahalonobis distance:  16.177404391705053
