# <span style="font-size:3.5rem">Convergent Clusters</span>

#### imports

In [None]:
!pip install wNMF
!pip install gspread_dataframe



In [1]:
from scipy import stats # stats
from scipy.stats import binom
import numpy as np # math
import matplotlib.pyplot as plt # graph
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from wNMF import wNMF
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt

from IPython.display import display, Markdown, Latex # to display formatted output

#from google.colab import files
import io

def generate_responses_norm(num_samples, desired_mean, desired_std_dev, round=False, seed=0):
  np.random.seed(seed)
  samples = np.random.normal(loc=0.0, scale=desired_std_dev, size=num_samples)

  actual_mean = np.mean(samples)
  actual_std = np.std(samples)

  zero_mean_samples = samples - (actual_mean)
  zero_mean_mean = np.mean(zero_mean_samples)
  zero_mean_std = np.std(zero_mean_samples)

  scaled_samples = zero_mean_samples * (desired_std_dev/zero_mean_std)
  scaled_mean = np.mean(scaled_samples)
  scaled_std = np.std(scaled_samples)

  final_samples = scaled_samples + desired_mean
  if round:
    final_samples = final_samples.round()
  final_mean = np.mean(final_samples)
  final_std = np.std(final_samples)
  return final_samples

def generate_group_data_norm(group, desired_std_dev, round=False, seed=0):
  all_responses = [generate_responses_norm(group['size'], desired_mean, 
                                           desired_std_dev, round=round, seed=seed)
                   for desired_mean in group['means']]
  return np.array(all_responses).T


def generate_survey_data_norm(groups, desired_std_dev, round=False, seed=0):
  all_data = []
  for group in groups:
    group_data = generate_group_data_norm(groups[group], desired_std_dev, round=round, seed=seed)
    all_data += group_data.tolist()
  return pd.DataFrame(np.array(all_data))


def get_group_data(test, group):
  group_info = test['group_info']
  cum_sum_of_sizes = np.cumsum([0]+[group_info[group]['size'] for group in group_info])
  data = test['data']
  group_start = cum_sum_of_sizes[group-1]
  group_end = cum_sum_of_sizes[group]
  return data.iloc[group_start:group_end]


def describe_groups(data, group_info):
  cum_sum_of_sizes = np.cumsum([0]+[group_info[group]['size'] for group in group_info])
  all_stats = []
  index = []
  for group in group_info:
    start = cum_sum_of_sizes[group-1]
    end = cum_sum_of_sizes[group]
    all_stats += data.iloc[start:end].describe().loc[['mean', 'std']].values.tolist()
    index += [f'group{group} mean', f'group{group} std']
  df = pd.DataFrame(all_stats, index = index)
  new_order = []
  for metric in ['mean', 'std']:
    for group in group_info:
      new_order += [f'group{group} {metric}']
  return df.loc[new_order]

def get_iloc_max(df):
  return np.unravel_index(np.argmax(df.values), np.array(df.values).shape)
  
def get_hit_rates(crosstab):
    # for each true segment, finds the predicted segment with greatest hitrate
    # allows overlap
    # hit_rates=[]
    # for true_seg in crosstab.index:
    #     max_loaded = crosstab.loc[true_seg].idxmax()
    #     true = crosstab.loc[true_seg, max_loaded]
    #     predicted = crosstab[max_loaded].sum()
    #     hit_rate = true/predicted
    #     hit_rates.append(hit_rate.round(3))
    # hit_rates = pd.Series(hit_rates, index = crosstab.index)

    # same as above, however doesn't allow overlap.
    # highest hitrate matchup is locked in
    total_predicted = crosstab.sum()
    possible_hitrates = crosstab / total_predicted
    hitrates = [0 for _ in range(len(possible_hitrates.index))]

    for _ in range(len(hitrates)): # for every true segment
      next_best_hitrate = get_iloc_max(possible_hitrates)

      # capture hitrate
      hitrate = possible_hitrates.iloc[next_best_hitrate]
      hitrates[next_best_hitrate[0]] = hitrate

      # zero out true and predicted segments (so they don't get picked again)
      possible_hitrates.iloc[next_best_hitrate[0], :] = 0
      possible_hitrates.iloc[:, next_best_hitrate[1]] = 0
    return hitrates
def get_solution_hit_rate(true, predicted):
    """
    Returns maximum possible hitrate for a solution
    by matching up predicted and true segments based on
    highest number of resps predicted
    """
    crosstab = pd.crosstab(true_group_list, predicted)
    # true predictions / all predictions
    total_predicted = crosstab.sum()
    possible_hitrates = crosstab / total_predicted
    true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
    n = crosstab.sum().sum()

    for _ in range(len(true_predictions)): # for every true segment
      next_best_hitrate = get_iloc_max(crosstab)
      
      # capture true predictions
      num_correct = crosstab.iloc[next_best_hitrate]
      true_predictions[next_best_hitrate[0]] = num_correct

      # zero out true and predicted segments (so they don't get picked again)
      crosstab.iloc[next_best_hitrate[0], :] = 0
      crosstab.iloc[:, next_best_hitrate[1]] = 0

    return sum(true_predictions) / n



def display_hitrates(true_group_list, solutions, display_matrix = True):
  cmap = sns.light_palette("green", as_cmap=True) #sns.diverging_palette(220, 20, sep=100, as_cmap=True).reversed()
  number_of_segments_in_original = len(true_group_list.unique())

  for solution in solutions:
      number_of_segments_in_solution = len(solutions[solution].unique())
      if number_of_segments_in_original != number_of_segments_in_solution:
        print(f'{solution} -- Hit Rate not shown. (Does not have {number_of_segments_in_original} segments)')
        continue
      predicted_segments = solutions[solution]
      hit_rate = get_solution_hit_rate(true_group_list, predicted_segments)

      # hit_rates = get_hit_rates(crosstab)
      # avg_hit_rate = np.mean(hit_rates)
      crosstab = pd.crosstab(true_group_list, solutions[solution])
      
      display(Markdown(f'### {solution} -- Hit Rate = {hit_rate:.1%}'))
      if display_matrix:
        display(crosstab.style.background_gradient(cmap=cmap))

def recode(solutions):
  coded_solutions = pd.DataFrame(index=solutions.index)

  for solution in solutions:
      segments = sorted(solutions[solution].unique())
      for segment in segments:
          is_segment = (solutions[solution] == segment).astype(int)
          coded_solutions[f'{solution}_{segment}'] = is_segment

  return coded_solutions

def rmse(actual_group, predicted_group):
  errors = predicted_group.mean() - actual_group.mean();
  squared_errors = errors ** 2
  mean_squared_error = squared_errors.mean()
  root_mean_squared_error = np.sqrt(mean_squared_error)
  return root_mean_squared_error

def get_predicted_group_data(test, group, solution):
  true_group_list = test['true_group_list']
  actual_group_data = get_group_data(test, 1)
  all_data = tests['test1']['data']

  crosstab = pd.crosstab(true_group_list, solution);crosstab
  corresponding_predicted_segment = crosstab.loc[group].idxmax()

  predicted_group_data = all_data[solution == corresponding_predicted_segment]
  return predicted_group_data

def cluster(data, true_group_list, methods=['kmeans', 'hierarchical', 'NMF'], n_cluster_start=2, n_cluster_end=5):
  solutions = pd.DataFrame()
  for i in range(n_cluster_start, n_cluster_end+1):
    if 'kmeans' in methods:
      kmeans = KMeans(n_clusters=i,).fit(data)
      solutions[f'kmeans{i}'] = kmeans.labels_
      #print(solutions['kmeans'].value_counts().sort_index())

    if 'hierarchical' in methods:
      heirachical = AgglomerativeClustering(n_clusters=i).fit(data)
      solutions[f'hierarchical{i}'] = heirachical.labels_
      #solutions[f'heirarchical{i}'].value_counts().sort_index()

    if 'NMF' in methods:
      data_positive = data - data.min(axis=0).min()
      nmf = wNMF(n_components=i, max_iter=2000, beta_loss='kullback-leibler', verbose=0, random_state=0).fit(data_positive.values.astype('float'), W=np.ones(data.shape),n_run=5)
      solutions[f'NMF{i}'] = pd.DataFrame(nmf.components_).idxmax(axis=1)
      #solutions['NMF'].value_counts().sort_index()
      #display(Markdown(f'# {test}'))
      #display_hitrates(true_group_list, solutions)
  return solutions

def generate_true_group_list(group_info):
  true_group_list = []
  for group in group_info:
    true_group_list += [group] * group_info[group]['size']
  true_group_list = pd.Series(true_group_list)
  return true_group_list

In [2]:
tests = {}
round = True
seed = 0

#### Test #1:  extreme group sizes, standard deviation of error=1.5

In [None]:
tests['test1'] = {
      'group_info': {
        1: {
            'means': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1],
            'size': 100
          },
        2: {
            'means': [2, 3, 1, 2, 3, 1, 2, 3, 1, 2],
            'size': 300
          },
        3: {
            'means': [3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
            'size': 600
          },
  }
}

std_dev = 1.5

tests['test1']['data'] = generate_survey_data_norm(tests['test1']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test1']['data'], tests['test1']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667
group3 mean,2.998333,0.998333,1.998333,2.998333,0.998333,1.998333,2.998333,0.998333,1.998333,2.998333
group1 std,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998
group2 std,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247
group3 std,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729


#### Test #2:  moderately different-sized groups, standard deviation of error=2

In [None]:
std_dev = 2
new_sizes = [200, 300, 500]
tests['test2'] = {'group_info': {
                        group: {
                        'means': tests['test1']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test1']['group_info']}
                  }

tests['test2']['data'] = generate_survey_data_norm(tests['test2']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test2']['data'], tests['test2']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.045,2.045,3.045,1.045,2.045,3.045,1.045,2.045,3.045,1.045
group2 mean,2.026667,3.026667,1.026667,2.026667,3.026667,1.026667,2.026667,3.026667,1.026667,2.026667
group3 mean,3.014,1.014,2.014,3.014,1.014,2.014,3.014,1.014,2.014,3.014
group1 std,2.035606,2.035606,2.035606,2.035606,2.035606,2.035606,2.035606,2.035606,2.035606,2.035606
group2 std,2.021446,2.021446,2.021446,2.021446,2.021446,2.021446,2.021446,2.021446,2.021446,2.021446
group3 std,2.010445,2.010445,2.010445,2.010445,2.010445,2.010445,2.010445,2.010445,2.010445,2.010445


#### Test #3:  equal groups, standard deviation of error=2

In [None]:
std_dev = 2
new_sizes = [333, 333, 334]
tests['test3'] = {'group_info': {
                        group: {
                        'means': tests['test1']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test1']['group_info']}
                }

tests['test3']['data'] = generate_survey_data_norm(tests['test3']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test3']['data'], tests['test3']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.024024,2.024024,3.024024,1.024024,2.024024,3.024024,1.024024,2.024024,3.024024,1.024024
group2 mean,2.024024,3.024024,1.024024,2.024024,3.024024,1.024024,2.024024,3.024024,1.024024,2.024024
group3 mean,3.026946,1.026946,2.026946,3.026946,1.026946,2.026946,3.026946,1.026946,2.026946,3.026946
group1 std,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272
group2 std,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272,2.03272
group3 std,2.025926,2.025926,2.025926,2.025926,2.025926,2.025926,2.025926,2.025926,2.025926,2.025926


#### Test #4:  group 3 now overlaps with 1 and 2

In [None]:
tests['test4'] = tests['test1']
tests['test4']['group_info'][3] = {
            'means': [2, 2, 1, 1, 3, 3, 2, 2, 1, 1],
            'size': 600
}

std_dev = 1.5

tests['test4']['data'] = generate_survey_data_norm(tests['test4']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test4']['data'], tests['test4']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0,1.0
group2 mean,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667
group3 mean,1.998333,1.998333,0.998333,0.998333,2.998333,2.998333,1.998333,1.998333,0.998333,0.998333
group1 std,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998
group2 std,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247
group3 std,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729


#### Test #5:  test 4 but groups 1 and 3 switch sizes

In [None]:
new_sizes = [600, 300, 100]
tests['test5'] = {'group_info': {
                        group: {
                        'means': tests['test4']['group_info'][group]['means'],
                        'size': new_sizes[group-1]
                        }
                        for group in tests['test4']['group_info']}
                  }
std_dev = 1.5

tests['test5']['data'] = generate_survey_data_norm(tests['test5']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test5']['data'], tests['test5']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,0.998333,1.998333,2.998333,0.998333,1.998333,2.998333,0.998333,1.998333,2.998333,0.998333
group2 mean,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667,3.006667,1.006667,2.006667
group3 mean,2.0,2.0,1.0,1.0,3.0,3.0,2.0,2.0,1.0,1.0
group1 std,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729,1.51729
group2 std,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247
group3 std,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998


In [3]:
tests['test7'] = {
      'group_info': {
        1: {
            'means': [6, 4, 4, 1, 10, 4, 6, 1, 7, 1],
            'size': 300
          },
        2: {
            'means': [4, 5, 8, 5, 5, 8, 7, 3, 5, 2],
            'size': 50
          },
        3: {
            'means': [10, 4, 4, 2, 5, 10, 7, 3, 4, 8],
            'size': 100
          },
        4: {
            'means': [5, 2, 2, 8, 8, 5, 2, 4, 3, 1],
            'size': 200
          },
        5: {
            'means': [2, 3, 4, 9, 2, 5, 5, 10, 4, 10],
            'size': 150
          },
        6: {
            'means': [2, 5, 10, 6, 7, 10, 9, 9, 3, 4],
            'size': 200
          },

  }
}

std_dev = 1.5

tests['test7']['data'] = generate_survey_data_norm(tests['test7']['group_info'], std_dev, round=round, seed=seed)
describe_groups(tests['test7']['data'], tests['test7']['group_info'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
group1 mean,6.006667,4.006667,4.006667,1.006667,10.006667,4.006667,6.006667,1.006667,7.006667,1.006667
group2 mean,4.0,5.0,8.0,5.0,5.0,8.0,7.0,3.0,5.0,2.0
group3 mean,10.0,4.0,4.0,2.0,5.0,10.0,7.0,3.0,4.0,8.0
group4 mean,5.0,2.0,2.0,8.0,8.0,5.0,2.0,4.0,3.0,1.0
group5 mean,1.993333,2.993333,3.993333,8.993333,1.993333,4.993333,4.993333,9.993333,3.993333,9.993333
group6 mean,2.0,5.0,10.0,6.0,7.0,10.0,9.0,9.0,3.0,4.0
group1 std,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247,1.532247
group2 std,1.511858,1.511858,1.511858,1.511858,1.511858,1.511858,1.511858,1.511858,1.511858,1.511858
group3 std,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998,1.556998
group4 std,1.526977,1.526977,1.526977,1.526977,1.526977,1.526977,1.526977,1.526977,1.526977,1.526977


### Cluster

In [5]:
for test in tests:
    display(Markdown(f'# {test}'))
    data = tests[test]['data']
    group_info = tests[test]['group_info']
    true_group_list = generate_true_group_list(group_info)
    tests[test]['true_group_list'] = true_group_list
    tests[test]['solutions'] = cluster(data, true_group_list, n_cluster_start=6,
                                     n_cluster_end=6, methods=
                                     ['kmeans', 'hierarchical', 'NMF'])#, 'NMF'])  # 'kmeans', 'hierarchical', 
    display(Markdown(f"## {test} Original Solutions"))
    display_hitrates(true_group_list, tests[test]['solutions'], display_matrix=True)
    recoded_solutions = recode(tests[test]['solutions'])
    tests[test]['recoded_solutions'] = recoded_solutions
    display(Markdown(f"## {test} Ensemble Solutions"))
    ensemble_solutions = cluster(recoded_solutions, true_group_list, n_cluster_start=6, n_cluster_end=6)
    display_hitrates(true_group_list, ensemble_solutions)
    tests[test]['ensemble_solutions'] = ensemble_solutions


# test7

## test7 Original Solutions

### kmeans6 -- Hit Rate = 64.8%

kmeans6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,185,0,0,0,115
2,33,17,0,0,0,0
3,0,1,0,99,0,0
4,0,0,0,0,200,0
5,0,0,150,0,0,0
6,199,0,0,0,1,0


### hierarchical6 -- Hit Rate = 65.0%

hierarchical6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,180,0,120
2,50,0,0,0,0,0
3,0,0,0,0,100,0
4,0,200,0,0,0,0
5,0,0,150,0,0,0
6,200,0,0,0,0,0


### NMF6 -- Hit Rate = 100.0%

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,300,0
2,0,0,0,0,0,50
3,100,0,0,0,0,0
4,0,0,200,0,0,0
5,0,150,0,0,0,0
6,0,0,0,200,0,0


## test7 Ensemble Solutions

### kmeans6 -- Hit Rate = 65.0%

kmeans6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,180,0,0,0,120
2,0,0,0,50,0,0
3,0,0,0,0,100,0
4,200,0,0,0,0,0
5,0,0,150,0,0,0
6,0,0,0,200,0,0


### hierarchical6 -- Hit Rate = 65.0%

hierarchical6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,180,0,0,0,120
2,50,0,0,0,0,0
3,0,0,0,100,0,0
4,0,0,200,0,0,0
5,0,0,0,0,150,0
6,200,0,0,0,0,0


### NMF6 -- Hit Rate = 65.0%

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,120,180
2,0,0,50,0,0,0
3,100,0,0,0,0,0
4,0,0,0,200,0,0
5,0,150,0,0,0,0
6,0,0,200,0,0,0


## Calculating RMSE

In [None]:
rmse(get_group_data(tests['test5'], 1), get_predicted_group_data(tests['test5'], 1, tests['test5']['ensemble_solutions']['NMF3']))

0.9901403225756431

In [None]:
true_group_list = tests['test7']['true_group_list']
predicted = tests['test7']['ensemble_solutions']['NMF6']

crosstab = pd.crosstab(true_group_list, predicted)
print(crosstab)
# true predictions / all predictions
total_predicted = crosstab.sum()
possible_hitrates = crosstab / total_predicted
true_predictions = [0 for _ in range(len(possible_hitrates.index))] 
n = crosstab.sum().sum()

for _ in range(len(true_predictions)): # for every true segment
  next_best_hitrate = get_iloc_max(crosstab)
  print(f'next_best_hitrate at {get_iloc_max(crosstab)}')
  
  # capture true predictions
  num_correct = crosstab.iloc[next_best_hitrate]
  true_predictions[next_best_hitrate[0]] = num_correct

  # zero out true and predicted segments (so they don't get picked again)
  crosstab.iloc[next_best_hitrate[0], :] = 0
  crosstab.iloc[:, next_best_hitrate[1]] = 0
sum(true_predictions) / n

NMF6     0    1    2    3    4    5
row_0                              
1      120    0    0    0  180    0
2        0    0    0   50    0    0
3        0    0    0    0    0  100
4        0  200    0    0    0    0
5        0    0  150    0    0    0
6        0    0    0  200    0    0
next_best_hitrate at (3, 1)
next_best_hitrate at (5, 3)
next_best_hitrate at (0, 4)
next_best_hitrate at (4, 2)
next_best_hitrate at (2, 5)
next_best_hitrate at (0, 0)


0.65

In [None]:
crosstab

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0


In [None]:
get_iloc_max(crosstab)

(0, 0)

In [None]:
crosstab / total_predicted

NMF6,0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
total_predicted

NMF6
0    120
1    200
2    150
3    250
4    180
5    100
dtype: int64

In [None]:
crosstab = pd.crosstab(tests['test7']['true_group_list'], tests['test7']['ensemble_solutions']['NMF6'])

# Export Data

In [None]:
tests[test].keys()

dict_keys(['group_info', 'data', 'true_group_list', 'solutions', 'recoded_solutions', 'ensemble_solutions'])

In [None]:
tests[test]['solutions']

Unnamed: 0,kmeans3,hierarchical3,NMF3,kmeans4,hierarchical4,NMF4,kmeans5,hierarchical5,NMF5
0,2,1,0,2,0,0,2,0,2
1,0,1,2,0,0,3,4,4,2
2,2,1,2,2,0,0,4,4,1
3,2,1,0,2,0,0,2,0,1
4,2,1,0,2,0,0,2,0,2
...,...,...,...,...,...,...,...,...,...
995,0,1,1,3,0,2,4,0,3
996,0,0,1,3,1,2,0,1,3
997,2,1,1,2,0,2,2,0,3
998,0,0,1,3,1,2,0,1,3


In [6]:
for test in tests:
    tests[test]['true_group_list'].to_excel(f'{test}_true_segments.xlsx')
    tests[test]['data'].to_excel(f'{test}_data.xlsx')
    tests[test]['solutions'].to_excel(f'{test}_python_solutions.xlsx')
    tests[test]['ensemble_solutions'].to_excel(f'{test}_python__ensemble_solutions.xlsx')

ModuleNotFoundError: No module named 'openpyxl'

## Open it Up
[Segment Prefs - Google Sheet](https://docs.google.com/spreadsheets/d/1Hm5IeGfxHc8gKeFM-7tt3E-TmvlGLTcafbSQpNcpW0s/edit?usp=sharing)