In [1]:
# Notebook configuration: edit these three values before running the cells below.
# num_clusters: the fitting loop iterates over cluster labels 0 .. num_clusters-1.
num_clusters = 9  # *fill in with selected number of driver clusters*
# data_folder: concatenated directly in front of the input file names, so include the trailing '/'.
data_folder = 'Folder/'  # *fill in with data location*
# year: kept as a string because it is spliced into the input file names ('sessions'+year+...).
year = '2019'

# Sessions model: fit the GMMs

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import pickle
import os
import shutil

In [17]:
# Make the output folders: AllGMMs holds every candidate fit, GMMs holds the
# selected models. makedirs(exist_ok=True) also creates the parent 'NewData'
# folder and is a no-op when a folder already exists, so the cell can be re-run.
for output_folder in ('NewData', 'NewData/AllGMMs', 'NewData/GMMs'):
    os.makedirs(output_folder, exist_ok=True)

# Prepare Data

In [6]:
# Load the session-level data and the per-driver table that carries the cluster labels.
# os.path.join keeps both paths valid whether or not data_folder ends with a separator.
full_data = pd.read_csv(os.path.join(data_folder, 'sessions'+year+'.csv'), index_col=0)
labeled_drivers = pd.read_csv(os.path.join(data_folder, 'sessions'+year+'_driverdata_unscaled_withlabels.csv'), index_col=0)

In [11]:
def process_df(df2):
    """Add the model features and segment labels to a sessions DataFrame.

    The frame is modified in place and also returned. Rows are addressed
    positionally (boolean masks), so the result is correct even when the
    index contains duplicate labels.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Must contain the columns 'start_seconds', 'Energy (kWh)',
        'Session Time (secs)', 'Category', 'start_weekday' and 'Max Power'.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added:
        - 'start': 'start_seconds' clipped to [0, 24*60*60]
        - 'energy': 'Energy (kWh)' clipped to [0, 100]
        - 'duration': 'Session Time (secs)' clipped to [0, 48*60*60]
        - 'location': 'home', 'work', 'mud' or 'other', derived from 'Category'
        - 'weekend': 'weekend' when 'start_weekday' is 5 or 6, else 'weekday'
        - 'fast': 'fast' when 'Max Power' > 20, else 'slow'
    """

    # Clipped numeric features used to fit the mixture models.
    df2['start'] = np.clip(df2['start_seconds'].values, 0, 24*60*60)
    df2['energy'] = np.clip(df2['Energy (kWh)'].values, 0, 100)
    df2['duration'] = np.clip(df2['Session Time (secs)'].values, 0, 48*60*60)

    # Location segment: everything that is not an explicitly mapped category is 'other'.
    # Masks are converted to plain arrays so that .loc selects rows by position.
    category = df2['Category']
    df2['location'] = 'other'
    df2.loc[(category == 'Single family residential').to_numpy(), 'location'] = 'home'
    df2.loc[(category == 'Workplace').to_numpy(), 'location'] = 'work'
    df2.loc[category.isin(['Multifamily Home Service']).to_numpy(), 'location'] = 'mud'

    # Day-type segment.
    df2['weekend'] = 'weekday'
    df2.loc[df2['start_weekday'].isin([5, 6]).to_numpy(), 'weekend'] = 'weekend'

    # Charging-speed segment; missing 'Max Power' values compare False and stay 'slow'.
    df2['fast'] = 'slow'
    df2.loc[(df2['Max Power'] > 20).to_numpy(), 'fast'] = 'fast'

    return df2

## Create GMMs

In [21]:
def fit_gmm(subset, name_str, folder, component_counts=(2, 3, 4, 5, 6, 7, 8, 9, 10), random_state=None):
    """Fit Gaussian mixture models of several sizes to one segment and save them.

    For every candidate size smaller than the number of sessions, a
    GaussianMixture is fitted on the ('start', 'energy', 'duration') columns,
    pickled to ``folder+name_str+'_<k>components_gmm.p'`` and its AIC recorded.
    The AIC table is written to ``folder+name_str+'_aic.csv'``. Nothing is
    written when the segment has fewer than two sessions.

    Parameters
    ----------
    subset : pandas.DataFrame
        Sessions of one segment; must contain 'start', 'energy' and 'duration'.
    name_str : str
        File-name prefix identifying the segment.
    folder : str
        Output location, concatenated directly in front of ``name_str``
        (so it must end with a path separator).
    component_counts : iterable of int, optional
        Candidate numbers of mixture components. Defaults to 2..10.
    random_state : int, RandomState instance or None, optional
        Forwarded to GaussianMixture; pass an int for reproducible fits.

    Returns
    -------
    None
    """

    mat = subset[['start', 'energy', 'duration']].values
    num = np.shape(mat)[0]
    if num <= 1:
        return

    fit_data = {'num_components':[], 'aic':[]}
    for num_components in component_counts:
        # Skip sizes that are not strictly smaller than the number of sessions.
        if num_components >= num:
            continue
        gm = GaussianMixture(n_components=num_components, random_state=random_state).fit(mat)
        fit_data['aic'].append(gm.aic(mat))
        fit_data['num_components'].append(num_components)
        # Context manager guarantees the pickle file is flushed and closed.
        with open(folder+name_str+'_'+str(num_components)+'components_gmm.p', "wb") as model_file:
            pickle.dump(gm, model_file)
    pd.DataFrame(fit_data).to_csv(folder+name_str+'_aic.csv')

    return

In [22]:
# Fit candidate GMMs for every (cluster, location, speed, day type) segment that
# has data. Only 'other' locations are split into fast and slow sessions; home,
# work and mud segments are fitted on their slow sessions only.
for cluster in np.arange(num_clusters):
    print('-----Cluster: ', cluster,'------')
    clustnamestr = 'cluster_'+str(cluster)+'_'

    print('Loading and processing data')
    in_cluster = labeled_drivers['Agglom Cluster Number']==cluster
    driver_list = labeled_drivers[in_cluster]['Unique Driver ID'].values
    cluster_sessions = full_data[full_data['Driver ID'].isin(driver_list)]
    data = process_df(cluster_sessions.copy(deep=True).reset_index(drop=True))

    print('-----Fitting GMMs----')

    for loc in ['home', 'other', 'work', 'mud']:
        print('Location ', loc)
        loc_sessions = data[data['location']==loc]
        if len(loc_sessions) == 0:
            continue

        speed_set = ['fast', 'slow'] if loc=='other' else ['slow']
        for speed in speed_set:
            print('Speed ', speed)
            speed_sessions = loc_sessions[loc_sessions['fast']==speed]
            if len(speed_sessions) == 0:
                continue

            for weekend in ['weekday','weekend']:
                print('Weekday ', weekend)
                day_sessions = speed_sessions[speed_sessions['weekend']==weekend]
                if len(day_sessions) == 0:
                    continue
                name_str = clustnamestr+loc+'_'+speed+'_'+weekend+'_'
                fit_gmm(day_sessions, name_str, 'NewData/AllGMMs/')



-----Cluster:  0 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Location  other
Speed  fast
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
-----Cluster:  1 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Location  other
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
-----Cluster:  2 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Speed  slow
Weekday  weekday
Weekday  weekend
Location  other
Speed  fast
Weekday  weekday
Weekday  weekend
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
-----Cluster:  3 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Location  other
Speed  fast
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
-----Cluster:  4 ------
Lo

# Reopen each segment's saved fits and select the optimal number of components (K)

In [None]:
# Walk through each segment one at a time: set the four selectors below, run
# this cell to plot AIC against the number of mixture components, then record
# the chosen number of components in the next cell.
# Note: the files will not exist for segments that do not exist in the data.

clust_num = 8
weekday = 'weekend'  # 'weekday' or 'weekend'
loc = 'other'        # 'home', 'work', 'mud' or 'other'
speed = 'fast'       # 'slow', or 'fast' (fast models are only fitted for loc = 'other')

# Derive the file-name pieces from the selectors so they cannot get out of sync.
clustnamestr = 'cluster_'+str(clust_num)+'_'
location_speed = loc+'_'+speed
name_str = clustnamestr+location_speed+'_'+weekday+'_'

fit_data = pd.read_csv('NewData/AllGMMs/'+name_str+'_aic.csv', index_col=0)
plt.figure()
plt.plot(fit_data['num_components'], fit_data['aic'])
plt.xlabel('Number of GMM components')
plt.ylabel('AIC')
plt.title(name_str)
plt.show()

In [144]:
# from the plot, select the optimal number of clusters
selection = 5

# Copy the chosen model into the main data folder. Models for 'other' locations
# keep the speed in their file name; every other location is named by location only.
selected_model_path = 'NewData/AllGMMs/'+name_str+'_'+str(selection)+'components_gmm.p'
segment_label = location_speed if loc == 'other' else loc
shutil.copyfile(selected_model_path, 'NewData/GMMs/'+weekday+'_'+segment_label+'_'+str(clust_num)+'.p')