# <span style="font-size:3.5rem">Convergent Clusters</span>

#### imports

In [None]:
!pip install wNMF
!pip install gspread_dataframe



In [1]:
from scipy import stats # stats
from scipy.stats import binom
import numpy as np # math
import matplotlib.pyplot as plt # graph
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from wNMF import wNMF
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.decomposition import NMF
from sklearn.mixture import GaussianMixture

from IPython.display import display, Markdown, Latex # to display formatted output

#from google.colab import files
import io

def generate_responses_binom(num_samples, p=0.5, scale_min=1, scale_max=10, mean=5.5, seed=0):
  n = 1000
  noise_percent = 0 # percent of people answering randomly

  distribution = stats.binom(n=scale_max-scale_min,loc=scale_min,p=p)
  np.random.seed(seed)
  noise = np.random.choice(np.arange(scale_min,scale_max+1),int(n*noise_percent))

  sample = np.append(distribution.rvs(size=n-len(noise),
                  random_state=1), noise)
    
  return sample

def generate_responses_norm(num_samples, desired_mean, desired_std_dev, round=False, force_positive=False, seed=0):
  np.random.seed(seed)
  samples = np.random.normal(loc=0.0, scale=desired_std_dev, size=num_samples)

  actual_mean = np.mean(samples)
  actual_std = np.std(samples)

  zero_mean_samples = samples - (actual_mean)
  zero_mean_mean = np.mean(zero_mean_samples)
  zero_mean_std = np.std(zero_mean_samples)

  scaled_samples = zero_mean_samples * (desired_std_dev/zero_mean_std)
  scaled_mean = np.mean(scaled_samples)
  scaled_std = np.std(scaled_samples)

  final_samples = scaled_samples + desired_mean
  if round:
    final_samples = final_samples.round()
  if force_positive:
    final_samples[np.where(final_samples <= 1)] = 1
    final_samples[np.where(final_samples >= 10)] = 10
  final_mean = np.mean(final_samples)
  final_std = np.std(final_samples)
  return final_samples

def generate_group_data_norm(group, desired_std_dev, round=False, force_positive=False, seed=0):
  all_responses = [generate_responses_norm(group['size'], desired_mean, 
                                           desired_std_dev, round=round, force_positive=force_positive, seed=seed)
                   for desired_mean in group['means']]
  return np.array(all_responses).T


def generate_survey_data_norm(groups, desired_std_dev, round=False, force_positive=False, seed=0):
  all_data = []
  for group in groups:
    group_data = generate_group_data_norm(groups[group], desired_std_dev, round=round, force_positive=force_positive, seed=seed)
    all_data += group_data.tolist()
  return pd.DataFrame(np.array(all_data))


def get_group_data(test, group):
  group_info = test['group_info']
  cum_sum_of_sizes = np.cumsum([0]+[group_info[group]['size'] for group in group_info])
  data = test['data']
  group_start = cum_sum_of_sizes[group-1]
  group_end = cum_sum_of_sizes[group]
  return data.iloc[group_start:group_end]


def describe_groups(data, group_info):
  cum_sum_of_sizes = np.cumsum([0]+[group_info[group]['size'] for group in group_info])
  all_stats = []
  index = []
  for group in group_info:
    start = cum_sum_of_sizes[group-1]
    end = cum_sum_of_sizes[group]
    all_stats += data.iloc[start:end].describe().loc[['mean', 'std']].values.tolist()
    index += [f'group{group} mean', f'group{group} std']
  df = pd.DataFrame(all_stats, index = index)
  new_order = []
  for metric in ['mean', 'std']:
    for group in group_info:
      new_order += [f'group{group} {metric}']
  return df.loc[new_order]

def get_iloc_max(df):
  return np.unravel_index(np.argmax(df.values), np.array(df.values).shape)
  
def get_hit_rates(crosstab):
    # for each true segment, finds the predicted segment with greatest hitrate
    # allows overlap
    # hit_rates=[]
    # for true_seg in crosstab.index:
    #     max_loaded = crosstab.loc[true_seg].idxmax()
    #     true = crosstab.loc[true_seg, max_loaded]
    #     predicted = crosstab[max_loaded].sum()
    #     hit_rate = true/predicted
    #     hit_rates.append(hit_rate.round(3))
    # hit_rates = pd.Series(hit_rates, index = crosstab.index)

    # same as above, however doesn't allow overlap.
    # highest hitrate matchup is locked in
    total_predicted = crosstab.sum()
    possible_hitrates = crosstab / total_predicted
    hitrates = [0 for _ in range(len(possible_hitrates.index))]

    for _ in range(len(hitrates)): # for every true segment
      next_best_hitrate = get_iloc_max(possible_hitrates)

      # capture hitrate
      hitrate = possible_hitrates.iloc[next_best_hitrate]
      hitrates[next_best_hitrate[0]] = hitrate

      # zero out true and predicted segments (so they don't get picked again)
      possible_hitrates.iloc[next_best_hitrate[0], :] = 0
      possible_hitrates.iloc[:, next_best_hitrate[1]] = 0
    return hitrates


def get_solution_hit_rate(true, predicted):
    """
    Returns maximum possible hitrate for a solution
    by matching up predicted and true segments based on
    highest number of resps predicted
    """
    crosstab = pd.crosstab(true_group_list, predicted)
    # true predictions / all predictions
    total_predicted = crosstab.sum()
    possible_hitrates = crosstab / total_predicted
    true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
    n = crosstab.sum().sum()

    for _ in range(len(true_predictions)): # for every true segment
        next_best_hitrate = get_iloc_max(crosstab)

        num_correct = crosstab.iloc[next_best_hitrate]
        if num_correct != 0:
             # capture true predictions
            true_predictions[next_best_hitrate[0]] = num_correct

        # zero out true and predicted segments (so they don't get picked again)
        crosstab.iloc[next_best_hitrate[0], :] = 0
        crosstab.iloc[:, next_best_hitrate[1]] = 0

    return sum(true_predictions) / n



def display_hitrates(true_group_list, solutions, display_matrix = True):
  cmap = sns.light_palette("green", as_cmap=True) #sns.diverging_palette(220, 20, sep=100, as_cmap=True).reversed()
  number_of_segments_in_original = len(true_group_list.unique())

  for solution in solutions:
      number_of_segments_in_solution = int(solution.split('_')[1])#len(solutions[solution].unique())
      if number_of_segments_in_original != number_of_segments_in_solution:
        #print(f'{solution} -- Hit Rate not shown. (Does not have {number_of_segments_in_original} segments)')
        continue
      predicted_segments = solutions[solution]
      hit_rate = get_solution_hit_rate(true_group_list, predicted_segments)

      # hit_rates = get_hit_rates(crosstab)
      # avg_hit_rate = np.mean(hit_rates)
      crosstab = pd.crosstab(true_group_list, solutions[solution])
      
      display(Markdown(f'### {solution} -- Hit Rate = {hit_rate:.1%}'))
      if display_matrix:
        display(crosstab.style.background_gradient(cmap=cmap))

def recode(solutions):
  coded_solutions = pd.DataFrame(index=solutions.index)

  for solution in solutions:
      segments = sorted(solutions[solution].unique())
      for segment in segments:
          is_segment = (solutions[solution] == segment).astype(int)
          coded_solutions[f'{solution}_{segment}'] = is_segment

  return coded_solutions

def rmse(actual_group, predicted_group):
  errors = predicted_group.mean() - actual_group.mean();
  squared_errors = errors ** 2
  mean_squared_error = squared_errors.mean()
  root_mean_squared_error = np.sqrt(mean_squared_error)
  return root_mean_squared_error

def get_predicted_group_data(test, group, solution):
  true_group_list = test['true_group_list']
  actual_group_data = get_group_data(test, 1)
  all_data = tests['test1']['data']

  crosstab = pd.crosstab(true_group_list, solution);crosstab
  corresponding_predicted_segment = crosstab.loc[group].idxmax()

  predicted_group_data = all_data[solution == corresponding_predicted_segment]
  return predicted_group_data

def cluster(data, methods=['kmeans', 'hierarchical', 'NMF'], n_cluster_start=2, n_cluster_end=5, seed=0):
  solutions = pd.DataFrame()
  for i in range(n_cluster_start, n_cluster_end+1):
    if 'kmeans' in methods:
      kmeans = KMeans(n_clusters=i,random_state=seed).fit(data)
      solutions[f'kmeans_{i}'] = kmeans.labels_
      solutions[f'kmeans_{i}'] = solutions[f'kmeans_{i}'] + 1
      #print(solutions['kmeans'].value_counts().sort_index())

    if 'hierarchical' in methods:
      heirachical = AgglomerativeClustering(n_clusters=i).fit(data)
      solutions[f'hierarchical_{i}'] = heirachical.labels_
      #solutions[f'heirarchical{i}'].value_counts().sort_index()

    if 'wNMF' in methods:
      data_positive = data - data.min(axis=0).min()
      wnmf = wNMF(n_components=i, max_iter=2000, beta_loss='kullback-leibler', random_state=seed, verbose=0).fit(data_positive.values.astype('float'), W=np.ones(data.shape),n_run=5)
      solutions[f'wNMF_{i}'] = pd.DataFrame(wnmf.components_).idxmax(axis=1)
      solutions[f'wNMF_{i}'] = solutions[f'wNMF_{i}'] + 1
    
    if 'tunedNMF' in methods:
      data_positive = data - data.min(axis=0).min()
      tunedNMF = NMF(n_components=i, max_iter=100000, beta_loss='kullback-leibler', solver='mu', init="random", verbose=0, random_state=seed).fit(data_positive.values.astype('float').T)
      solutions[f'tunedNMF_{i}'] = pd.DataFrame(tunedNMF.components_).idxmax(axis=0)
      solutions[f'tunedNMF_{i}'] = solutions[f'tunedNMF_{i}'] + 1
      #solutions['NMF'].value_counts().sort_index()
      #display(Markdown(f'# {test}'))
      #display_hitrates(true_group_list, solutions)
    
    if 'NMF' in methods:
      data_positive = data - data.min(axis=0).min()
      nmf = NMF(n_components=i, max_iter=100000,  verbose=0, random_state=seed).fit(data_positive.values.astype('float').T)
      solutions[f'NMF_{i}'] = pd.DataFrame(nmf.components_).idxmax(axis=0)
      solutions[f'NMF_{i}'] = solutions[f'NMF_{i}'] + 1
        
        
    if 'LC' in methods:
      lc = GaussianMixture(n_components=i)
      lc.fit(data)

      #predictions from gmm
      solutions[f'LC_{i}'] = lc.predict(data)
      solutions[f'LC_{i}'] = solutions[f'LC_{i}'] + 1
  return solutions

def generate_true_group_list(group_info):
  true_group_list = []
  for group in group_info:
    true_group_list += [group] * group_info[group]['size']
  true_group_list = pd.Series(true_group_list, name='TRUE')
  return true_group_list

In [2]:
tests = {}
round = False
force_positive = False
seed = 0

#### Test #1:  extreme group sizes, standard deviation of error=1.5

In [3]:
tests['test1'] = {
      'group_info': {
        1: {
            'means': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1],
            'size': 100
          },
        2: {
            'means': [2, 3, 1, 2, 3, 1, 2, 3, 1, 2],
            'size': 300
          },
        3: {
            'means': [3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
            'size': 600
          },
  }
}

std_dev = 1.5

tests['test1']['data'] = generate_survey_data_norm(tests['test1']['group_info'], std_dev, round=round, force_positive=force_positive, seed=seed)
describe_groups(tests['test1']['data'], tests['test1']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0
group3 mean,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0
group1 std,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557
group2 std,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506
group3 std,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252


#### Test #2:  moderately different-sized groups, standard deviation of error=2

In [4]:
std_dev = 2
new_sizes = [200, 300, 500]
tests['test2'] = {'group_info': {
                        group: {
                        'means': tests['test1']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test1']['group_info']}
                  }

tests['test2']['data'] = generate_survey_data_norm(tests['test2']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test2']['data'], tests['test2']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0
group3 mean,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0
group1 std,2.005019,2.005019,2.005019,2.005019,2.005019,2.005019,2.005019,2.005019,2.005019,2.005019
group2 std,2.003342,2.003342,2.003342,2.003342,2.003342,2.003342,2.003342,2.003342,2.003342,2.003342
group3 std,2.002003,2.002003,2.002003,2.002003,2.002003,2.002003,2.002003,2.002003,2.002003,2.002003


#### Test #3:  equal groups, standard deviation of error=2

In [5]:
std_dev = 2
new_sizes = [333, 333, 334]
tests['test3'] = {'group_info': {
                        group: {
                        'means': tests['test1']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test1']['group_info']}
                }

tests['test3']['data'] = generate_survey_data_norm(tests['test3']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test3']['data'], tests['test3']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0
group3 mean,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0
group1 std,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301
group2 std,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301,2.00301
group3 std,2.003001,2.003001,2.003001,2.003001,2.003001,2.003001,2.003001,2.003001,2.003001,2.003001


#### Test #4:  group 3 now overlaps with 1 and 2

In [6]:
tests['test4'] = tests['test1']
tests['test4']['group_info'][3] = {
            'means': [2, 2, 1, 1, 3, 3, 2, 2, 1, 1],
            'size': 600
}

std_dev = 1.5

tests['test4']['data'] = generate_survey_data_norm(tests['test4']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test4']['data'], tests['test4']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0
group3 mean,2.0,2.0,1.0,1.0,3.0,3.0,2.0,2.0,1.0,1.0
group1 std,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557
group2 std,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506
group3 std,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252


#### Test #5:  test 4 but groups 1 and 3 switch sizes

In [7]:
new_sizes = [600, 300, 100]
tests['test5'] = {'group_info': {
                        group: {
                        'means': tests['test4']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test4']['group_info']}
                  }
std_dev = 1.5

tests['test5']['data'] = generate_survey_data_norm(tests['test5']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test5']['data'], tests['test5']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0
group3 mean,2.0,2.0,1.0,1.0,3.0,3.0,2.0,2.0,1.0,1.0
group1 std,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252,1.501252
group2 std,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506
group3 std,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557


In [8]:
tests['test7'] = {
      'group_info': {
        1: {
            'means': [6, 4, 4, 1, 10, 4, 6, 1, 7, 1],
            'size': 300
          },
        2: {
            'means': [4, 5, 8, 5, 5, 8, 7, 3, 5, 2],
            'size': 50
          },
        3: {
            'means': [10, 4, 4, 2, 5, 10, 7, 3, 4, 8],
            'size': 100
          },
        4: {
            'means': [5, 2, 2, 8, 8, 5, 2, 4, 3, 1],
            'size': 200
          },
        5: {
            'means': [2, 3, 4, 9, 2, 5, 5, 10, 4, 10],
            'size': 150
          },
        6: {
            'means': [2, 5, 10, 6, 7, 10, 9, 9, 3, 4],
            'size': 200
          },

  }
}

std_dev = 1.5

tests['test7']['data'] = generate_survey_data_norm(tests['test7']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test7']['data'], tests['test7']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,6.0,4.0,4.0,1.0,10.0,4.0,6.0,1.0,7.0,1.0
group2 mean,4.0,5.0,8.0,5.0,5.0,8.0,7.0,3.0,5.0,2.0
group3 mean,10.0,4.0,4.0,2.0,5.0,10.0,7.0,3.0,4.0,8.0
group4 mean,5.0,2.0,2.0,8.0,8.0,5.0,2.0,4.0,3.0,1.0
group5 mean,2.0,3.0,4.0,9.0,2.0,5.0,5.0,10.0,4.0,10.0
group6 mean,2.0,5.0,10.0,6.0,7.0,10.0,9.0,9.0,3.0,4.0
group1 std,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506,1.502506
group2 std,1.515229,1.515229,1.515229,1.515229,1.515229,1.515229,1.515229,1.515229,1.515229,1.515229
group3 std,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557,1.507557
group4 std,1.503764,1.503764,1.503764,1.503764,1.503764,1.503764,1.503764,1.503764,1.503764,1.503764


In [9]:
tests['test8'] = {
      'group_info': {
        1: {
            'means': [7,8,8,4,4,6,7,8,4,4],
            'size': 300
          },
        2: {
            'means': [5,6,5,5,5,8,8,7,8,8],
            'size': 50
          },
        3: {
            'means': [4,4,4,4,5,6,6,4,6,6],
            'size': 100
          },
        4: {
            'means': [5,4,4,6,7,6,7,5,4,4],
            'size': 200
          },
        5: {
            'means': [8,6,6,5,5,5,5,7,6,5],
            'size': 150
          },
        6: {
            'means': [4,5,4,6,5,8,8,6,7,6],
            'size': 200
          },
          

  }
}

std_dev = 2.2360679775

tests['test8']['data'] = generate_survey_data_norm(tests['test8']['group_info'], std_dev, round=round, force_positive=force_positive, seed=seed)
describe_groups(tests['test8']['data'], tests['test8']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,7.0,8.0,8.0,4.0,4.0,6.0,7.0,8.0,4.0,4.0
group2 mean,5.0,6.0,5.0,5.0,5.0,8.0,8.0,7.0,8.0,8.0
group3 mean,4.0,4.0,4.0,4.0,5.0,6.0,6.0,4.0,6.0,6.0
group4 mean,5.0,4.0,4.0,6.0,7.0,6.0,7.0,5.0,4.0,4.0
group5 mean,8.0,6.0,6.0,5.0,5.0,5.0,5.0,7.0,6.0,5.0
group6 mean,4.0,5.0,4.0,6.0,5.0,8.0,8.0,6.0,7.0,6.0
group1 std,2.239804,2.239804,2.239804,2.239804,2.239804,2.239804,2.239804,2.239804,2.239804,2.239804
group2 std,2.25877,2.25877,2.25877,2.25877,2.25877,2.25877,2.25877,2.25877,2.25877,2.25877
group3 std,2.247333,2.247333,2.247333,2.247333,2.247333,2.247333,2.247333,2.247333,2.247333,2.247333
group4 std,2.241679,2.241679,2.241679,2.241679,2.241679,2.241679,2.241679,2.241679,2.241679,2.241679


## Custom Data

In [10]:
raw_read = pd.read_csv('Data_mike.csv')
data = raw_read[[col for col in raw_read if col.startswith('v')]]
true_group_list = raw_read['Actual_Seg']

tests['testMike'] = {
    'data' : data,
    'true_group_list': true_group_list
}

In [68]:
solutions = cluster(data, n_cluster_start=6,
                                     n_cluster_end=6, methods=
                                     ['NMF', 'tunedNMF'])#, 'NMF'])  # 'kmeans', 'hierarchical', 
display_hitrates(true_group_list, solutions, display_matrix=True)

### tunedNMF_6 -- Hit Rate = 34.4%

tunedNMF_6,1,2,3,4,5,6
Actual_Seg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,17,57,25,47,29,125
2,7,13,1,7,1,21
3,25,2,6,20,4,43
4,27,3,27,82,15,46
5,8,23,41,41,7,30
6,40,12,7,15,58,68


### NMF_6 -- Hit Rate = 32.0%

NMF_6,1,2,3,4,5,6
Actual_Seg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,126,43,13,60,17,41
2,28,2,6,6,2,6
3,26,20,33,15,3,3
4,37,90,38,15,16,4
5,69,57,4,2,4,14
6,84,8,37,12,51,8


### Cluster

In [69]:
for test in tests:
    display(Markdown(f'# {test}'))
    data = tests[test]['data']
    group_info = tests[test]['group_info']
    true_group_list = generate_true_group_list(group_info)
    tests[test]['true_group_list'] = true_group_list
    tests[test]['solutions'] = cluster(data, seed=seed, n_cluster_start=6,
                                     n_cluster_end=6, methods=
                                     ['kmeans', 'hierarchical', 'NMF', 'tunedNMF', 'LC'])#, 'NMF'])  # 'kmeans', 'hierarchical', 
    display(Markdown(f"## {test} Original Solutions"))
    display_hitrates(true_group_list, tests[test]['solutions'], display_matrix=True)
    recoded_solutions = recode(tests[test]['solutions'])
    tests[test]['recoded_solutions'] = recoded_solutions
    display(Markdown(f"## {test} Ensemble Solutions"))
    
    actual_num_of_segs = len(true_group_list.unique())
    ensemble_solutions = cluster(recoded_solutions, n_cluster_start=actual_num_of_segs,
                                 methods=['kmeans', 'hierarchical', 'NMF', 'LC'],
                                 seed=seed, n_cluster_end=actual_num_of_segs)
    display_hitrates(true_group_list, ensemble_solutions)
    tests[test]['ensemble_solutions'] = ensemble_solutions


# test1

## test1 Original Solutions

## test1 Ensemble Solutions

### kmeans_3 -- Hit Rate = 65.0%

kmeans_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,79,21
2,227,5,68
3,0,402,198


### hierarchical_3 -- Hit Rate = 58.7%

hierarchical_3,0,1,2
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,93,0,7
2,0,211,89
3,369,0,231


### NMF_3 -- Hit Rate = 74.3%

NMF_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,70,0,30
2,14,286,0
3,427,0,173


### LC_3 -- Hit Rate = 74.4%

LC_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,70,30
2,244,56,0
3,0,470,130


# test2

## test2 Original Solutions

## test2 Ensemble Solutions

### kmeans_3 -- Hit Rate = 100.0%

kmeans_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,200,0,0
2,0,0,300
3,0,500,0


### hierarchical_3 -- Hit Rate = 99.3%

hierarchical_3,0,1,2
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,200
2,7,293,0
3,500,0,0


### NMF_3 -- Hit Rate = 99.8%

NMF_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,200
2,2,298,0
3,500,0,0


### LC_3 -- Hit Rate = 100.0%

LC_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,200
2,0,300,0
3,500,0,0


# test3

## test3 Original Solutions

## test3 Ensemble Solutions

### kmeans_3 -- Hit Rate = 100.0%

kmeans_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,333,0,0
2,0,333,0
3,0,0,334


### hierarchical_3 -- Hit Rate = 80.5%

hierarchical_3,0,1,2
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,333,0,0
2,0,333,0
3,195,0,139


### NMF_3 -- Hit Rate = 100.0%

NMF_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,333,0,0
2,0,333,0
3,0,0,334


### LC_3 -- Hit Rate = 100.0%

LC_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,333
2,0,333,0
3,334,0,0


# test4

## test4 Original Solutions

## test4 Ensemble Solutions

### kmeans_3 -- Hit Rate = 61.3%

kmeans_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,28,0,72
2,89,211,0
3,226,0,374


### hierarchical_3 -- Hit Rate = 58.7%

hierarchical_3,0,1,2
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,93,0,7
2,0,211,89
3,369,0,231


### NMF_3 -- Hit Rate = 61.9%

NMF_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,72,0,28
2,0,211,89
3,380,0,220


### LC_3 -- Hit Rate = 91.1%

LC_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,100
2,211,89,0
3,0,600,0


# test5

## test5 Original Solutions

## test5 Ensemble Solutions

### kmeans_3 -- Hit Rate = 62.5%

kmeans_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,248,352
2,209,91,0
3,36,64,0


### hierarchical_3 -- Hit Rate = 67.5%

hierarchical_3,0,1,2
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,231,0,369
2,94,206,0
3,100,0,0


### NMF_3 -- Hit Rate = 64.1%

NMF_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,352,0,248
2,0,261,39
3,15,57,28


### LC_3 -- Hit Rate = 81.9%

LC_3,1,2,3
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,90,510,0
2,91,0,209
3,100,0,0


# test7

## test7 Original Solutions

### kmeans_6 -- Hit Rate = 81.3%

kmeans_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,152,0,0,148,0,0
2,0,38,0,12,0,0
3,0,0,0,1,0,99
4,0,0,0,0,200,0
5,0,0,150,0,0,0
6,0,200,0,0,0,0


### hierarchical_6 -- Hit Rate = 90.3%

hierarchical_6,0,1,2,3,4,5
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,300,0,0,0,0,0
2,0,0,0,33,0,17
3,0,0,0,0,100,0
4,0,200,0,0,0,0
5,0,0,150,0,0,0
6,0,0,0,80,0,120


### tunedNMF_6 -- Hit Rate = 93.1%

tunedNMF_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,300,0
2,0,0,0,9,0,41
3,100,0,0,0,0,0
4,0,0,0,0,0,200
5,0,19,131,0,0,0
6,0,0,0,200,0,0


### NMF_6 -- Hit Rate = 88.8%

NMF_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,300,0,0,0,0,0
2,0,0,0,0,0,50
3,0,0,0,95,0,5
4,0,0,0,0,147,53
5,0,1,0,0,0,149
6,0,0,197,0,0,3


### LC_6 -- Hit Rate = 95.0%

LC_6,1,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,300,0,0,0
2,50,0,0,0,0
3,0,0,0,0,100
4,0,0,0,200,0
5,0,0,150,0,0
6,200,0,0,0,0


## test7 Ensemble Solutions

### kmeans_6 -- Hit Rate = 80.2%

kmeans_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,148,0,0,0,152,0
2,0,50,0,0,0,0
3,0,0,0,0,0,100
4,0,0,200,0,0,0
5,0,0,0,150,0,0
6,0,200,0,0,0,0


### hierarchical_6 -- Hit Rate = 80.2%

hierarchical_6,0,1,2,3,4,5
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,152,0,0,148
2,50,0,0,0,0,0
3,0,0,0,0,100,0
4,0,200,0,0,0,0
5,0,0,0,150,0,0
6,200,0,0,0,0,0


### NMF_6 -- Hit Rate = 80.2%

NMF_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,152,0,0,0,0,148
2,0,50,0,0,0,0
3,0,0,0,0,100,0
4,0,0,200,0,0,0
5,0,0,0,150,0,0
6,0,200,0,0,0,0


### LC_6 -- Hit Rate = 91.7%

LC_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,300,0,0,0,0,0
2,0,20,30,0,0,0
3,0,0,0,0,100,0
4,0,53,0,0,0,147
5,0,0,0,150,0,0
6,0,0,200,0,0,0


# test8

## test8 Original Solutions

### kmeans_6 -- Hit Rate = 35.6%

kmeans_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,21,120,121,38,0
2,18,6,0,0,11,15
3,21,29,0,0,7,43
4,49,60,0,0,17,74
5,0,22,50,57,21,0
6,71,34,0,0,26,69


### hierarchical_6 -- Hit Rate = 40.6%

hierarchical_6,0,1,2,3,4,5
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,134,21,7,138,0,0
2,0,11,7,0,15,17
3,0,7,29,0,43,21
4,0,17,80,0,54,49
5,80,14,1,55,0,0
6,0,35,35,0,76,54


### tunedNMF_6 -- Hit Rate = 85.7%

tunedNMF_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,300,0,0
2,3,0,2,0,0,45
3,3,0,0,0,0,97
4,0,0,0,0,200,0
5,37,103,0,10,0,0
6,46,0,154,0,0,0


### NMF_6 -- Hit Rate = 51.2%

NMF_6,1,2,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,184,116,0,0,0
2,0,0,0,0,50
3,77,0,0,21,2
4,0,0,0,0,200
5,43,0,107,0,0
6,0,0,0,0,200


### LC_6 -- Hit Rate = 64.2%

LC_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,169,0,131
2,0,0,50,0,0,0
3,0,0,0,0,100,0
4,0,0,0,0,200,0
5,0,0,0,85,0,65
6,42,158,0,0,0,0


## test8 Ensemble Solutions

### kmeans_6 -- Hit Rate = 39.0%

kmeans_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,21,141,0,138,0
2,15,11,0,6,0,18
3,43,7,0,29,0,21
4,74,17,0,60,0,49
5,0,14,71,1,64,0
6,69,26,0,34,0,71


### hierarchical_6 -- Hit Rate = 42.4%

hierarchical_6,0,1,2,3,4,5
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,159,141,0,0
2,50,0,0,0,0,0
3,28,43,0,0,29,0
4,66,74,0,0,60,0
5,0,0,85,0,0,65
6,97,69,0,0,34,0


### NMF_6 -- Hit Rate = 55.4%

NMF_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,169,131,0,0,0
2,6,0,0,11,15,18
3,29,0,0,0,50,21
4,97,0,0,0,54,49
5,1,78,71,0,0,0
6,33,0,0,149,4,14


### LC_6 -- Hit Rate = 54.8%

LC_6,1,2,3,4,5,6
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,141,0,0,0,0,159
2,0,40,7,3,0,0
3,0,71,29,0,0,0
4,0,120,80,0,0,0
5,1,0,0,0,78,71
6,0,4,34,162,0,0


# Run Approaches

In [17]:
approaches = [
    {  #1
     'cluster': ['kmeans'],
     'ensemble': None
    },
    {  #2
     'cluster': ['LC'],
     'ensemble': None
    },
    {  #3
     'cluster': ['NMF'],
     'ensemble': None
    },
    {  #4
     'cluster': ['tunedNMF'],
     'ensemble': None
    },
#     {  #5
#      'cluster': ['kmeans', 'hierarchical'],
#      'ensemble': 'kmeans'
#     },
#     {  #6
#      'cluster': ['kmeans', 'hierarchical'],
#      'ensemble': 'LC'
#     },
#     {  #7
#      'cluster': ['kmeans', 'hierarchical'],
#      'ensemble': 'NMF'
#     },
#     {  #8
#      'cluster': ['kmeans', 'hierarchical'],
#      'ensemble': 'tunedNMF'
#     },
#     {  #9
#      'cluster': ['kmeans', 'hierarchical', 'LC', 'NMF'],
#      'ensemble': 'kmeans'
#     },
#     {  #10
#      'cluster': ['kmeans', 'hierarchical', 'LC', 'NMF'],
#      'ensemble': 'LC'
#     },
#     {  #11
#      'cluster': ['kmeans', 'hierarchical', 'LC', 'NMF'],
#      'ensemble': 'NMF'
#     },
#     {  #12
#      'cluster': ['kmeans', 'hierarchical', 'LC', 'NMF'],
#      'ensemble': 'tunedNMF'
#     },
]

In [None]:
%timeit
approach_result_columns = ['approach', 'test', 'seed', 'clustering', 'ensemble', 'hitrate']
approach_result_data = []

cluster_start = 2
cluster_end = 6
for seed in range(0, 5):
    print(f'\nSEED {seed}')
    for a, approach in enumerate(approaches):
        print(f'\nApproach {a+1} --', end="")
        for test in tests:
            print(f' {test}', end="")
            data = tests[test]['data']
            if 'true_group_list' not in tests[test]:
                group_info = tests[test]['group_info']
                true_group_list = generate_true_group_list(group_info)
                tests[test]['true_group_list'] = true_group_list
            actual_num_of_segs = len(true_group_list.unique())

            if approach['ensemble']:
                # Cluster
                cluster_solutions = cluster(data, 
                                   seed=seed, 
                                   n_cluster_start=cluster_start,
                                   n_cluster_end=cluster_end,
                                   methods=approach['cluster'])

                # Ensemble
                recoded_solutions = recode(cluster_solutions)
                ensemble_solution = cluster(recoded_solutions,
                                            seed=seed,
                                            n_cluster_start=actual_num_of_segs,
                                            n_cluster_end=actual_num_of_segs,
                                            methods=[approach['ensemble']])[f"{approach['ensemble']}_{actual_num_of_segs}"]

                hitrate = get_solution_hit_rate(true_group_list, ensemble_solution)

            else:  # cluster only
                # Cluster
                solution = cluster(data, 
                                   seed=seed, 
                                   n_cluster_start=actual_num_of_segs,
                                   n_cluster_end=actual_num_of_segs,
                                   methods=approach['cluster'])

                hitrate = get_solution_hit_rate(true_group_list, 
                                                solution[f"{approach['cluster'][0]}_{actual_num_of_segs}"])

            approach_result_data.append([a+1,  # approach number
                                         test,
                                         seed,
                                         "/".join(approach['cluster']),  # clustering algorithm/s
                                         approach['ensemble'] if approach['ensemble'] else 'None',  # ensemble algorithm (if any)
                                         hitrate
                                        ])  
approach_results = pd.DataFrame(approach_result_data, columns=approach_result_columns)
approach_results


SEED 0

Approach 1 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 2 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 3 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 4 -- test1 test2 test3 test4 test5 test7 test8 testMike
SEED 1

Approach 1 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 2 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 3 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 4 -- test1 test2 test3 test4 test5 test7 test8 testMike
SEED 2

Approach 1 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 2 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 3 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 4 -- test1 test2 test3 test4 test5 test7 test8 testMike
SEED 3

Approach 1 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 2 -- test1 test2 test3 test4 test5 test7 test8 testMike
Approach 3 -- test1 test2 test3 test4 test5 test7 test8 t

In [13]:
approach_results.to_clipboard()

In [16]:
summary = pd.DataFrame([approach_results.groupby(['test', 'seed'])['hitrate'].mean(), approach_results.groupby(['test', 'seed'])['hitrate'].std()]).T
summary.columns = ['mean_hitrate', 'std_hitrate']
summary.to_clipboard()

In [121]:
approach_results[approach_results['test'] == 'testMike']

Unnamed: 0,approach,test,clustering,ensemble,hitrate
7,1,testMike,kmeans,,0.477
15,2,testMike,LC,,0.317
23,3,testMike,NMF,,0.32
31,4,testMike,tunedNMF,,0.344
39,5,testMike,kmeans/hierarchical,kmeans,0.406
47,6,testMike,kmeans/hierarchical,LC,0.397
55,7,testMike,kmeans/hierarchical,NMF,0.411
63,8,testMike,kmeans/hierarchical,tunedNMF,0.424
71,9,testMike,kmeans/hierarchical/LC/NMF,kmeans,0.399
79,10,testMike,kmeans/hierarchical/LC/NMF,LC,0.392


In [115]:
approach_results.to_clipboard()

In [114]:
approach_results.groupby(['clustering', 'ensemble'])['hitrate'].mean()

test   clustering                  ensemble
test1  LC                          None        58.2
       NMF                         None        65.8
       kmeans                      None        39.1
       kmeans/hierarchical         LC          35.3
                                   NMF         35.3
                                               ... 
test8  kmeans/hierarchical/LC/NMF  LC          42.1
                                   NMF         44.2
                                   kmeans      42.8
                                   tunedNMF    44.2
       tunedNMF                    None        85.7
Name: hitrate, Length: 84, dtype: float64

## Hitrate Calculation

In [39]:
test = 'test7'
true_group_list = tests[test]['true_group_list']
predicted = tests[test]['solutions']['kmeans6']

"""
Returns maximum possible hitrate for a solution
by matching up predicted and true segments based on
highest number of resps predicted
"""
crosstab = pd.crosstab(true_group_list, predicted)
# true predictions / all predictions
total_predicted = crosstab.sum()
possible_hitrates = crosstab / total_predicted
true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
n = crosstab.sum().sum()

for _ in range(len(true_predictions)): # for every true segment
    next_best_hitrate = get_iloc_max(crosstab)
    
    num_correct = crosstab.iloc[next_best_hitrate]
    if num_correct != 0:
        # capture true predictions
        true_predictions[next_best_hitrate[0]] = num_correct

    # zero out true and predicted segments (so they don't get picked again)
    crosstab.iloc[next_best_hitrate[0], :] = 0
    crosstab.iloc[:, next_best_hitrate[1]] = 0

sum(true_predictions) / n

0.833

In [38]:
num_correct

200

In [33]:
crosstab = pd.crosstab(true_group_list, predicted)
# true predictions / all predictions
total_predicted = crosstab.sum()
possible_hitrates = crosstab / total_predicted
true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
n = crosstab.sum().sum()

In [34]:
next_best_hitrate = get_iloc_max(crosstab)

# capture true predictions
num_correct = crosstab.iloc[next_best_hitrate]
true_predictions[next_best_hitrate[0]] = num_correct

In [35]:
get_iloc_max(crosstab)

(3, 4)

In [29]:
sum(true_predictions)

833

In [26]:
get_iloc_max(crosstab)

(0, 0)

In [25]:
next_best_hitrate = get_iloc_max(crosstab)

# capture true predictions
num_correct = crosstab.iloc[next_best_hitrate]
true_predictions[next_best_hitrate[0]] = num_correct

# zero out true and predicted segments (so they don't get picked again)
crosstab.iloc[next_best_hitrate[0], :] = 0
crosstab.iloc[:, next_best_hitrate[1]] = 0

crosstab

kmeans6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0


kmeans6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,185,0,0,0,115
2,33,17,0,0,0,0
3,0,1,0,99,0,0
4,0,0,0,0,200,0
5,0,0,150,0,0,0
6,199,0,0,0,1,0


## Calculating RMSE

In [None]:
rmse(get_group_data(tests['test5'], 1), get_predicted_group_data(tests['test5'], 1, tests['test5']['ensemble_solutions']['NMF3']))

0.9901403225756431

In [None]:
true_group_list = tests['test7']['true_group_list']
predicted = tests['test7']['ensemble_solutions']['NMF6']

crosstab = pd.crosstab(true_group_list, predicted)
print(crosstab)
# true predictions / all predictions
total_predicted = crosstab.sum()
possible_hitrates = crosstab / total_predicted
true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
n = crosstab.sum().sum()

for _ in range(len(true_predictions)): # for every true segment
  next_best_hitrate = get_iloc_max(crosstab)
  print(f'next_best_hitrate at {get_iloc_max(crosstab)}')
  
  # capture true predictions
  num_correct = crosstab.iloc[next_best_hitrate]
  true_predictions[next_best_hitrate[0]] = num_correct

  # zero out true and predicted segments (so they don't get picked again)
  crosstab.iloc[next_best_hitrate[0], :] = 0
  crosstab.iloc[:, next_best_hitrate[1]] = 0
sum(true_predictions) / n

NMF6     0    1    2    3    4    5
row_0                              
1      120    0    0    0  180    0
2        0    0    0   50    0    0
3        0    0    0    0    0  100
4        0  200    0    0    0    0
5        0    0  150    0    0    0
6        0    0    0  200    0    0
next_best_hitrate at (3, 1)
next_best_hitrate at (5, 3)
next_best_hitrate at (0, 4)
next_best_hitrate at (4, 2)
next_best_hitrate at (2, 5)
next_best_hitrate at (0, 0)


0.65

In [None]:
crosstab

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0


In [None]:
get_iloc_max(crosstab)

(0, 0)

In [None]:
crosstab / total_predicted

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
total_predicted

NMF6
0    120
1    200
2    150
3    250
4    180
5    100
dtype: int64

In [None]:
crosstab = pd.crosstab(tests['test7']['true_group_list'], tests['test7']['ensemble_solutions']['NMF6'])

# Export Data

In [None]:
tests[test].keys()

dict_keys(['group_info', 'data', 'true_group_list', 'solutions', 'recoded_solutions', 'ensemble_solutions'])

In [None]:
tests[test]['solutions']

Unnamed: 0,kmeans3,hierarchical3,NMF3,kmeans4,hierarchical4,NMF4,kmeans5,hierarchical5,NMF5
0,2,1,0,2,0,0,2,0,2
1,0,1,2,0,0,3,4,4,2
2,2,1,2,2,0,0,4,4,1
3,2,1,0,2,0,0,2,0,1
4,2,1,0,2,0,0,2,0,2
...,...,...,...,...,...,...,...,...,...
995,0,1,1,3,0,2,4,0,3
996,0,0,1,3,1,2,0,1,3
997,2,1,1,2,0,2,2,0,3
998,0,0,1,3,1,2,0,1,3


In [23]:
for test in tests:
    tests[test]['true_group_list'].to_excel(f'{test}_true_segments.xlsx')
    tests[test]['data'].to_excel(f'{test}_data.xlsx')
    tests[test]['solutions'].to_excel(f'{test}_python_solutions.xlsx')
    tests[test]['ensemble_solutions'].to_excel(f'{test}_python__ensemble_solutions.xlsx')

## Open it Up
[Segment Prefs - Google Sheet](https://docs.google.com/spreadsheets/d/1Hm5IeGfxHc8gKeFM-7tt3E-TmvlGLTcafbSQpNcpW0s/edit?usp=sharing)