# Model Results

This notebook runs KMeans clustering over a range of k-values and bias amounts, then saves the per-run results and the average Silhouette score per k-value as CSV files for further analysis.

## Load Data

In [2]:
# Mount Google Drive at /content/drive (Colab only); every file path used
# later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it by category/module.
warnings.filterwarnings('ignore')

In [5]:
import sys

# Make the project folder importable so `Biased_Clusters` can be imported below.
# The path uses the same 'MyDrive' spelling as every data path in this notebook,
# and the membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
PROJECT_DIR = '/content/drive/MyDrive/Trending-Topics-Dashboard-main'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [7]:
import numpy as np
import pandas as pd
from Biased_Clusters import get_silhouette

In [8]:
# cleaned data: read the cleaned CSV from the mounted Drive into `df`,
# which is later passed to build_result_data as its first argument
df = pd.read_csv('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/data_cleaned.csv')

In [9]:
import json
import scipy.sparse

# Directory on the mounted Drive that holds the prepared model inputs.
# Defined once so the three loads below cannot drift apart.
DATA_DIR = '/content/drive/MyDrive/Trending-Topics-Dashboard-main/data'

# load training data (later passed to build_result_data as `x_vector`)
x_vector = np.load(DATA_DIR + '/x_vector.npy')

# load terms sparse matrix
terms_sparse_matrix = scipy.sparse.load_npz(DATA_DIR + '/terms_sparse_matrix.npz')

# load terms label: the file is parsed as JSON despite its .txt extension;
# the encoding is stated explicitly so the read does not depend on the
# platform's default text encoding
with open(DATA_DIR + '/terms_label.txt', 'r', encoding='utf-8') as fp:
    terms_label = json.load(fp)

## Run KMeans Models on Various k-values and Bias Amounts

In [10]:
def build_result_data(df, x_vector, n_clusters, max_range=1000, step=10):
    """Sweep bias amounts for one k-value and collect the model results.

    For every integer ``i`` in ``range(1, max_range, step)`` the bias amount
    ``i * 0.01`` is passed to ``get_silhouette`` together with ``df``,
    ``x_vector`` and ``n_clusters``. With the default ``step`` the sweep is
    0.01, 0.11, 0.21, ... for all values below ``max_range * 0.01``.

    Parameters
    ----------
    df, x_vector
        Forwarded unchanged to ``get_silhouette``.
    n_clusters
        Number of clusters, forwarded unchanged to ``get_silhouette``.
    max_range : int, default 1000
        Exclusive upper bound of the sweep, in hundredths of a bias unit.
    step : int, default 10
        Spacing of the sweep, in hundredths of a bias unit. The default
        reproduces the previously hard-coded stride.

    Returns
    -------
    pandas.DataFrame
        Built from the values returned by ``get_silhouette``, one entry per
        bias amount that did not raise. Empty (no columns) when every call
        raised ``ValueError``.

    Notes
    -----
    ``tqdm`` and ``get_silhouette`` are looked up when the function is
    called, so both must be imported before the first call.
    """
    records = []
    skipped = []  # bias amounts for which get_silhouette raised ValueError

    for hundredths in tqdm(range(1, max_range, step)):
        bias = hundredths * .01
        try:
            records.append(get_silhouette(df, x_vector, bias, n_clusters))
        except ValueError:
            # best-effort sweep: keep going, but remember what was dropped
            skipped.append(bias)

    if skipped:
        # report once after the loop so the progress bar is not interrupted
        print('n_clusters=' + str(n_clusters) + ': skipped '
              + str(len(skipped)) + ' bias amount(s) that raised ValueError')

    # create a data frame of result
    return pd.DataFrame(records)

In [11]:
from tqdm import tqdm

# Accumulators, kept index-aligned: entry i of each list belongs to the same k.
result_list = []          # one result data frame per k-value
k_values = []             # a list of k-values
avg_sil_scores = []       # a list of average Silhouette score per k-value

# run KMeans model on 26 different k-values (k = 5, 6, ..., 30)
for n_clusters in range(5, 31):
    # get model result and save to a list
    # (max_range=2000 sweeps bias amounts 0.01, 0.11, ..., 19.91 -> 200 runs per k)
    df_result = build_result_data(df, x_vector, n_clusters, 2000)
    result_list.append(df_result)

    # compute average Silhouette score for each k value
    k_values.append(n_clusters)
    if 'Silhouette Score' in df_result:
        avg_sil_scores.append(df_result['Silhouette Score'].mean())
    else:
        # build_result_data returns a column-less frame when every bias amount
        # was skipped; record NaN instead of aborting the sweep with a KeyError
        avg_sil_scores.append(np.nan)

100%|██████████| 200/200 [02:25<00:00,  1.38it/s]
100%|██████████| 200/200 [02:33<00:00,  1.31it/s]
100%|██████████| 200/200 [02:38<00:00,  1.26it/s]
100%|██████████| 200/200 [02:50<00:00,  1.17it/s]
100%|██████████| 200/200 [04:05<00:00,  1.23s/it]
100%|██████████| 200/200 [04:24<00:00,  1.32s/it]
100%|██████████| 200/200 [04:35<00:00,  1.38s/it]
100%|██████████| 200/200 [04:57<00:00,  1.49s/it]
100%|██████████| 200/200 [03:33<00:00,  1.07s/it]
100%|██████████| 200/200 [03:44<00:00,  1.12s/it]
100%|██████████| 200/200 [03:52<00:00,  1.16s/it]
100%|██████████| 200/200 [03:57<00:00,  1.19s/it]
100%|██████████| 200/200 [05:44<00:00,  1.72s/it]
100%|██████████| 200/200 [05:49<00:00,  1.75s/it]
100%|██████████| 200/200 [05:54<00:00,  1.77s/it]
100%|██████████| 200/200 [06:28<00:00,  1.94s/it]
100%|██████████| 200/200 [06:46<00:00,  2.03s/it]
100%|██████████| 200/200 [06:56<00:00,  2.08s/it]
100%|██████████| 200/200 [06:57<00:00,  2.09s/it]
100%|██████████| 200/200 [07:15<00:00,  2.18s/it]


## Save Results

In [None]:
import os

# save the result data frames to csv files (one file per k-value)
# DataFrame.to_csv does not create missing directories, so ensure it exists
os.makedirs('results', exist_ok=True)

# Pair each frame with the k-value it was computed for instead of re-deriving
# the index from a second copy of the k range; this also works when the sweep
# above was stopped early and result_list holds fewer frames than planned.
for n_clusters, df_result in zip(k_values, result_list):
    df_result.to_csv('results/results_' + str(n_clusters) + '.csv', index=False)

In [None]:
import os

# save average Silhouette score per k
# DataFrame.to_csv does not create missing directories, so ensure it exists
os.makedirs('results', exist_ok=True)

# one row per k-value; k_values and avg_sil_scores are index-aligned lists
df_avg_sil = pd.DataFrame({'Number of Topics': k_values, 'Average Silhouette Score': avg_sil_scores})
df_avg_sil.to_csv('results/avg_sil_per_k.csv', index=False)