In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import json
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import altair as alt

In [None]:
# Stems of the pre-computed per-participant JSON summaries to load.
file_names = ['eatingSocial', 'workplaceMinutes', 'TravelMinutes', 'totalActivities',
              'LeaveBackTime', 'HomeMinutes', 'Expense_Income']
# Parsed JSON content, keyed by file stem.
data = {}
for name in file_names:
  # `with` closes every handle; the previous version only closed the last file opened.
  with open(f'/content/drive/MyDrive/Datasets/VAST_Challenge_2022/{name}.json') as f:
    data[name] = json.load(f)

In [None]:
# Notebook-wide settings:
#   SAVE_DIR         - directory the exported CSV/JSON files are written to
#   Income_Expenses  - column names of the expense and income series
#   Pipeline.Patterns.Avoid - columns removeCols() strips before scaling
#   TSNE.perplexity  - perplexity values the t-SNE loop iterates over
config = dict(
    SAVE_DIR = '/content/drive/MyDrive/Datasets/VAST_Challenge_2022',
    Income_Expenses = dict(
        Expenses = 'Average Expenses',
        Income = 'Average Income',
    ),
    Pipeline = dict(
        Patterns = dict(Avoid = ['pId']),
    ),
    TSNE = dict(
        perplexity = list(range(10, 100, 10)),
    ),
)

In [None]:
def fillAverageExpenseIncome(df,quantile,expense_col = None,income_col = None):
  """Fill missing expense and income values with a summary statistic of each column.

  Args:
    df: DataFrame containing the expense and income columns. It is modified in place.
    quantile: Row label of ``Series.describe()`` used as the fill value
      (for example ``'50%'`` or ``'mean'``).
    expense_col: Name of the expense column. Defaults to
      ``config['Income_Expenses']['Expenses']``.
    income_col: Name of the income column. Defaults to
      ``config['Income_Expenses']['Income']``.

  Returns:
    The same ``df`` object, with NaNs in both columns replaced.
  """
  # Resolve the defaults lazily so explicit column names work without `config`.
  if expense_col is None:
    expense_col = config['Income_Expenses']['Expenses']
  if income_col is None:
    income_col = config['Income_Expenses']['Income']

  for col in (expense_col, income_col):
    # describe() skips NaN, so each statistic is computed from that column's own
    # observed values. The earlier version derived the income statistic from the
    # rows with a non-missing *expense*, which skewed the income fill value.
    fill_value = df[col].describe()[quantile]
    df.loc[df[col].isna(), col] = fill_value
  return df

In [None]:
def removeCols(df):
  """Return ``df`` as a NumPy array without the columns listed in
  ``config['Pipeline']['Patterns']['Avoid']``."""
  kept = []
  for column in df.columns:
    if column not in config['Pipeline']['Patterns']['Avoid']:
      kept.append(column)
  return df[kept].to_numpy()

In [None]:
def makeTSNEdf(res):
  """Wrap a two-column embedding in a DataFrame with columns ``pID``, ``X1``, ``X2``.

  Args:
    res: Array-like of shape (n_rows, 2), e.g. the output of the t-SNE pipeline.

  Returns:
    DataFrame with one row per embedded point; ``pID`` is the row position 0..n_rows-1.
  """
  tsne_result = pd.DataFrame(res,columns=['X1','X2'])
  # Derive the ids from the actual row count. The former hard-coded range(0, 1011)
  # padded the frame with NaN rows whenever the input did not have exactly 1011 rows.
  tsne_result.insert(0,'pID',range(len(tsne_result)))
  return tsne_result

In [None]:
def plotTSNE(df,perplexity):
  """Build a 500x500 Altair scatter plot of one embedding.

  Points are placed at (X1, X2) and coloured by the ``Cluster`` column using a
  green/orange/red scale over the domain 1, 2, 3. ``perplexity`` only appears
  in the chart title.
  """
  cluster_scale = alt.Scale(range = ['green','orange','red'],
                            domain = [1,2,3])
  cluster_color = alt.Color('Cluster',scale = cluster_scale)
  scatter = alt.Chart(df).mark_circle().encode(
      x = 'X1',
      y = 'X2',
      color = cluster_color
  )
  # Title font size is left at the default (a configure_title(fontSize=20) call was disabled here).
  return scatter.properties(
      title = f'Perplexity {perplexity}',
      height = 500,
      width = 500
  )

In [None]:
def assignClusters(df):
  """Label each embedded point with a cluster id based on fixed coordinate cut-offs.

  Adds (or overwrites) an integer ``Cluster`` column on ``df`` in place and
  returns the same object:
    2 -> X1 below -25
    3 -> X1 above 15 and X2 above 20
    1 -> everything else
  """
  far_left = df['X1'].lt(-25)
  upper_right = df['X2'].gt(20) & df['X1'].gt(15)
  df["Cluster"] = 1
  df.loc[far_left,'Cluster'] = 2
  df.loc[upper_right,'Cluster'] = 3
  return df

In [None]:
# Turn each loaded JSON summary into a single-purpose DataFrame. Only the values
# of each mapping are used; row order follows the JSON key order.
df_eatingSocial = pd.DataFrame({
    'Eating': [rec['Eating'] for rec in data['eatingSocial'].values()],
    'Recreating': [rec['Recreation (Social Gathering)'] for rec in data['eatingSocial'].values()],
})
df_workplace = pd.DataFrame({'WorkPlace': list(data['workplaceMinutes'].values())})
df_travelMinutes = pd.DataFrame({'TravelMinutes': list(data['TravelMinutes'].values())})
df_activities = pd.DataFrame({
    'NumActivities': [rec['NumActivities'] for rec in data['totalActivities'].values()],
})
df_homeMinutes = pd.DataFrame({'Home': list(data['HomeMinutes'].values())})
# Each LeaveBackTime entry is indexed as [leave, come back].
df_LeaveBack = pd.DataFrame({
    'Leave Home': [pair[0] for pair in data['LeaveBackTime'].values()],
    'Come Back': [pair[1] for pair in data['LeaveBackTime'].values()],
})
df_joviality = pd.read_csv(
    Path(r"/content/drive/MyDrive/Datasets/VAST_Challenge_2022/Attributes/Participants.csv").resolve()
)[['joviality']]
df_expenseIncome = pd.DataFrame(data['Expense_Income'])
# NOTE(review): assumes exactly 1011 participants with ids 0..1010 - confirm against the data.
df_ids = pd.DataFrame({"pId": list(range(1011))})
# Missing expense/income values are replaced by the column median ('50%').
df_expenseIncome = fillAverageExpenseIncome(df_expenseIncome,'50%')

In [None]:
# Side-by-side join of the id column and all behavioural features (aligned on row index).
df_patterns = pd.concat(
    (
        df_ids,
        df_eatingSocial,
        df_workplace,
        df_travelMinutes,
        df_activities,
        df_homeMinutes,
        df_LeaveBack,
        df_joviality,
    ),
    axis = 1,
)

In [None]:
# Run the scale -> t-SNE pipeline once per configured perplexity and keep each
# labelled embedding under the key 'Perplexity:<value>'.
tsne_results = {}
removecols_transform = FunctionTransformer(removeCols)
# t-SNE is stochastic. Without a fixed seed every run yields a different layout,
# while assignClusters() labels points using fixed coordinate thresholds.
# An optional config['TSNE']['random_state'] overrides the default seed.
# NOTE(review): re-check the thresholds in assignClusters() against the seeded layout.
tsne_seed = config['TSNE'].get('random_state', 0)
for perp in config['TSNE']['perplexity']:
  pipeline = Pipeline([
      ('removeCols',removecols_transform),
      ('scale',StandardScaler()),
      ('tsne',TSNE(n_components = 2,perplexity = perp,random_state = tsne_seed)),
  ])
  res = pipeline.fit_transform(df_patterns)
  tsne_res = makeTSNEdf(res)
  tsne_res = assignClusters(tsne_res)
  tsne_results[f'Perplexity:{perp}'] = tsne_res

In [None]:
# Arrange one scatter plot per perplexity in a grid, three charts per row.
# Rows are cut from the configured list itself, so the grid adapts when the
# list is shortened or extended instead of assuming exactly nine entries.
perplexities = config['TSNE']['perplexity']
charts_per_row = 3
rows = alt.vconcat()
for row_start in range(0, len(perplexities), charts_per_row):
  cols = alt.hconcat()
  for perp in perplexities[row_start:row_start + charts_per_row]:
    plot = plotTSNE(tsne_results[f'Perplexity:{perp}'],perp)
    cols |= plot
  rows &= cols
rows

In [None]:
# for result in tsne_results:
#   tsne_results[result].to_csv(Path(config['SAVE_DIR'],f'{result}.csv').resolve(),index = None)

In [None]:
# Use the perplexity-40 embedding (pID, X1, X2, Cluster) for the tables built below.
df = tsne_results['Perplexity:40']
# parallel_coords_dict = {}
# parallel_coords_dict['Cluster1'] = df.loc[df['Cluster'] == 1].sample(4).reset_index(drop = True)
# parallel_coords_dict['Cluster2'] = df.loc[df['Cluster'] == 2].sample(4).reset_index(drop = True)
# parallel_coords_dict['Cluster3'] = df.loc[df['Cluster'] == 3].sample(4).reset_index(drop = True)

In [None]:
# Attach every participant's behavioural features to their cluster label.
# The join keys differ in case: 'pID' on the embedding, 'pId' on the features.
parallel_coords_df = pd.merge(
    df[['pID','Cluster']],
    df_patterns,
    how = 'left',
    left_on = 'pID',
    right_on = 'pId',
)
# parallel_coords_df.to_csv(Path(config['SAVE_DIR'],'parallelCoords.csv').resolve(),index = None)

In [None]:
# Preview the merged cluster/feature table.
parallel_coords_df

Unnamed: 0,pID,Cluster,pId,Eating,Recreating,WorkPlace,TravelMinutes,NumActivities,Home,Leave Home,Come Back,joviality
0,0,1,0,21.02,63.89,343.466667,224.566667,5.08,787.951002,380.000000,1128.136646,0.001627
1,1,1,1,14.80,90.06,343.466667,150.688889,5.31,835.588889,455.000000,1005.496894,0.328087
2,2,1,2,19.11,51.38,343.466667,295.777778,5.25,724.777283,360.000000,987.888199,0.393470
3,3,1,3,20.18,83.98,343.466667,168.600000,5.21,824.587973,385.000000,1023.322981,0.138063
4,4,1,4,19.33,105.78,347.055556,107.422222,5.71,861.703786,420.279503,978.478261,0.857397
...,...,...,...,...,...,...,...,...,...,...,...,...
1006,1006,1,1006,30.89,63.96,343.466667,110.033333,6.23,886.000000,455.000000,985.000000,0.639268
1007,1007,1,1007,29.60,127.28,343.466667,65.977778,7.27,871.144444,435.000000,955.000000,0.934348
1008,1008,1,1008,19.69,31.08,347.044444,145.966667,4.04,895.968820,345.000000,1013.245342,0.163721
1009,1009,1,1009,30.39,71.71,347.044444,108.388889,6.62,875.944444,490.000000,1029.161491,0.828330


In [None]:
# NOTE(review): shows the same table as the preceding display cell - presumably a leftover duplicate.
parallel_coords_df

Unnamed: 0,pID,Cluster,pId,Eating,Recreating,WorkPlace,TravelMinutes,NumActivities,Home,Leave Home,Come Back,joviality
0,0,1,0,21.02,63.89,343.466667,224.566667,5.08,787.951002,380.000000,1128.136646,0.001627
1,1,1,1,14.80,90.06,343.466667,150.688889,5.31,835.588889,455.000000,1005.496894,0.328087
2,2,1,2,19.11,51.38,343.466667,295.777778,5.25,724.777283,360.000000,987.888199,0.393470
3,3,1,3,20.18,83.98,343.466667,168.600000,5.21,824.587973,385.000000,1023.322981,0.138063
4,4,1,4,19.33,105.78,347.055556,107.422222,5.71,861.703786,420.279503,978.478261,0.857397
...,...,...,...,...,...,...,...,...,...,...,...,...
1006,1006,1,1006,30.89,63.96,343.466667,110.033333,6.23,886.000000,455.000000,985.000000,0.639268
1007,1007,1,1007,29.60,127.28,343.466667,65.977778,7.27,871.144444,435.000000,955.000000,0.934348
1008,1008,1,1008,19.69,31.08,347.044444,145.966667,4.04,895.968820,345.000000,1013.245342,0.163721
1009,1009,1,1009,30.39,71.71,347.044444,108.388889,6.62,875.944444,490.000000,1029.161491,0.828330


In [None]:
# Add the expense/income columns to the cluster/feature table. Both inputs carry
# a 'pId' column, so the result holds them as 'pId_x' and 'pId_y'.
df_expenseIncome_patterns = pd.merge(
    parallel_coords_df,
    df_expenseIncome,
    how = 'left',
    left_on = 'pID',
    right_on = 'pId',
)

In [None]:
# Drop the duplicated id columns left behind by the merge; 'pID' remains as the id.
# Reassigning with errors='ignore' keeps this cell re-runnable. The in-place drop
# raised KeyError on a second execution because the columns were already gone.
df_expenseIncome_patterns = df_expenseIncome_patterns.drop(columns = ['pId_x','pId_y'],errors = 'ignore')

In [None]:
# Flip the sign of the expense column.
# NOTE: this is not idempotent - running the cell a second time flips the sign back.
df_expenseIncome_patterns['Average Expenses'] = df_expenseIncome_patterns['Average Expenses'].mul(-1)

In [None]:
# `parallel_coords_dict` was only ever assigned in commented-out code, so this cell
# raised NameError on a fresh kernel. Rebuild it here from `df`: up to four sampled
# participants per cluster, keyed 'Cluster<label>'. Clusters with no members are
# skipped and smaller clusters contribute all their rows, so sampling cannot fail;
# the fixed seed keeps the sample stable.
parallel_coords_dict = {
    f'Cluster{label}': members.sample(min(4, len(members)),random_state = 0).reset_index(drop = True)
    for label, members in df.groupby('Cluster')
}
parallel_coords = pd.concat(list(parallel_coords_dict.values()),axis = 0)

In [None]:
# parallel_coords_df = parallel_coords.merge(df_patterns,how = 'left',left_on = 'pID',right_on = 'pId')
# Export the cluster/feature/expense table (without the row index) to SAVE_DIR.
df_expenseIncome_patterns.to_csv(
    (Path(config['SAVE_DIR']) / 'parallelCoordsExpense.csv').resolve(),
    index = None,
)

In [None]:
# (Stray `rrrrr` expression removed: the name is undefined, so the cell raised
# NameError and stopped "Run All" before the min/max export cells below.)

In [None]:
# Inspect the assembled feature table.
df_patterns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column; the
# 'min' and 'max' rows feed the range export below.
description = df_expenseIncome_patterns.describe()

In [None]:
# Show the summary table.
description

Unnamed: 0,pID,Cluster,Eating,Recreating,WorkPlace,TravelMinutes,NumActivities,Home,Leave Home,Come Back,joviality,Average Expenses,Average Income
count,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0,1011.0
mean,505.0,1.175074,24.606954,72.827933,359.405973,129.789724,5.338694,860.146788,445.274243,1031.468914,0.493794,1419.282881,4115.635721
std,291.994863,0.455996,9.544449,61.169494,42.603684,55.547935,1.094023,82.648105,44.389733,48.469539,0.291351,348.155367,2126.711133
min,0.0,1.0,3.77,0.0,242.5,14.477778,1.98,571.25,290.0,905.093168,0.000204,518.381183,1848.377004
25%,252.5,1.0,19.87,49.0,343.466667,90.388889,4.6,815.905902,415.0,998.335921,0.240074,1228.405275,2742.954872
50%,505.0,1.0,21.38,70.99,343.466667,121.333333,5.33,869.488889,445.0,1030.0,0.477539,1440.365198,3592.758721
75%,757.5,1.0,30.48,90.92,347.044444,160.455556,6.2,908.731688,480.0,1063.237578,0.746819,1619.53292,4695.463174
max,1010.0,3.0,65.0,745.0,517.5,392.922222,8.5,1100.0,535.0,1195.0,0.999234,2666.18947,17368.976426


In [None]:
# Collect the min/max of every feature into a long table (Type, Min, Max).
# The participant id must not be exported as a feature. The surviving id column
# is spelled 'pID'; the earlier filter tested only 'pId' and therefore never
# matched. Both spellings are excluded here.
min_max_dict = {}
min_max_dict['Type'] = [col for col in description.columns if col not in ('pID','pId')]
min_max_dict['Min'] = description.loc['min',min_max_dict['Type']].tolist()
min_max_dict['Max'] = description.loc['max',min_max_dict['Type']].tolist()
dfminmax = pd.DataFrame(min_max_dict)
# dfminmax.to_csv(Path(config['SAVE_DIR'],'parallelCoordsMinMax.csv').resolve(),index = None)

In [None]:
# Reshape the long min/max table into {feature: {'Min': ..., 'Max': ...}}.
minmax_json = {
    dfminmax.at[i,'Type']: {'Min': dfminmax.at[i,'Min'], 'Max': dfminmax.at[i,'Max']}
    for i in range(len(dfminmax))
}

In [None]:
# Inspect the mapping before it is written to disk.
minmax_json

In [None]:
# Persist the per-feature ranges as minmax.json in SAVE_DIR.
with (Path(config['SAVE_DIR']) / 'minmax.json').resolve().open('w') as f:
  json.dump(minmax_json,f)