# Sessions model: fit the gmm

In [None]:
s3_bucket_folder_address = '' # contained the location of our data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import boto3
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture as GMM
import scipy.cluster.hierarchy as shc
import pickle

## Subset of data

In [2]:
full_data = pd.read_csv(s3_bucket_folder_address+'sessions2019.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [3]:
year='2019'
labeled_drivers = pd.read_csv(s3_bucket_folder_address+'2019_driver_labels_revised.csv', index_col=0)#2019_applied_other_years/driver_labels_'+year+'.csv', index_col=0)

In [7]:
driver_list = labeled_drivers['Unique Driver ID'].values

In [8]:
data = full_data[full_data['Driver ID'].isin(driver_list)]

## Create GMMs

In [9]:
def process_df(df2):
    
    df2.loc[df2.index, 'start'] = np.clip(df2.loc[df2.index, 'start_seconds'].values, 0, 24*60*60)
    df2.loc[df2.index, 'energy'] = np.clip(df2.loc[df2.index, 'Energy (kWh)'].values, 0, 100)
    df2.loc[df2.index, 'duration'] = np.clip(df2.loc[df2.index, 'Session Time (secs)'].values, 0, 48*60*60)
    df2.loc[df2.index, 'location'] = 'other'
    df2.loc[df2[df2['POI Category']=='Single family residential'].index, 'location'] = 'home'
    df2.loc[df2[df2['POI Category']=='Workplace'].index, 'location'] = 'work'
    df2.loc[df2[df2['POI Category'].isin(['Multifamily Commercial', 'Multifamily Home Service'])].index, 'location'] = 'mud'
    df2.loc[df2.index, 'weekend'] = 'weekday'
    df2.loc[df2[df2['start_weekday'].isin([5, 6])].index, 'weekend'] = 'weekend'
    df2.loc[df2.index, 'fast'] = 'slow'
    df2.loc[df2[df2['Max Power']>20].index, 'fast'] = 'fast'
    
    return df2

In [None]:
def fit_gmm(subset, name_str, bucket, bucket_folder, sed=True):
    if sed:
        mat = subset[['start','energy','duration']].values
    else:
        mat = subset[['start','energy']].values
    
    fit_data = {'num_components':[3, 4, 5, 6, 7, 8, 9, 10], 'aic':[], 'bic':[]}
    for num_components in [3, 4, 5, 6, 7, 8, 9, 10]:
        gm = GaussianMixture(n_components=num_components).fit(mat)
        fit_data['aic'].append(gm.aic(mat))
        fit_data['bic'].append(gm.bic(mat))
        save(gm, 'GMMs', name_str+str(num_components), bucket, bucket_folder)
        
    pd.DataFrame(fit_data).to_csv('s3://'+bucket+'/'+bucket_folder+name_str+str(num_components)+'_aicbic.csv')
        
    return

# Fit all

In [10]:
data = process_df(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [23]:
bucket = ''
bucket_folder = ''
for clusters in np.arange(0, 16):
    print('-----Clusters: ', clusters,'------')
    clustnamestr = 'cluster_'+str(clusters)
    
    print('Loading and processing data')
    driver_list = labeled_drivers[labeled_drivers['Agglom Cluster Number']==clusters]['Unique Driver ID'].values
    data = full_data[full_data['Driver ID'].isin(driver_list)].copy(deep=True).reset_index(drop=True)
    data = process_df(data)
    
    print('-----Fitting GMMs----')
    
    for loc in ['home', 'other', 'work', 'mud']:
        print('Location ', loc)
        subset = data[data['location']==loc]
        if len(subset) > 0:
            if loc=='other':
                speed_set = ['fast', 'slow']
            else:
                speed_set = ['slow']
            for speed in speed_set:
                print('Speed ', speed)
                subset1 = subset[subset['fast']==speed]
                if len(subset1) > 0:
                    for weekend in ['weekday','weekend']:
                        print('Weekday ', weekend)
                        subset2 = subset1[subset1['weekend']==weekend]
                        if len(subset2) > 0:
                            name_str = clustnamestr+loc+'_'+speed+'_'+weekend+'_sed_'
                            fit_gmm(subset2, name_str, bucket, bucket_folder, sed=True)


    

-----Clusters:  0 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Speed  slow
Weekday  weekday
Weekday  weekend
Location  other
Speed  fast
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
Speed  slow
Weekday  weekday
Weekday  weekend
-----Clusters:  1 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Speed  slow
Weekday  weekday
Weekday  weekend
Location  other
Speed  fast
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
-----Clusters:  2 ------
Loading and processing data
-----Fitting GMMs----
Location  home
Speed  slow
Weekday  weekday
Weekday  weekend
Location  other
Speed  fast
Weekday  weekday
Weekday  weekend
Speed  slow
Weekday  weekday
Weekday  weekend
Location  work
Speed  slow
Weekday  weekday
Weekday  weekend
Location  mud
Speed  slow
Weekday  weekday
Weekday  weekend
-----Clusters:  3 -

# Select K

For each segment and driver group, go through and use the AIC curve to select the optimal number of clusters for that case. Save the result in 'results' and remove the other GMMs from the data folder.

In [None]:
results = pd.DataFrame(np.zeros((5, 16)), index=['home', 'mud', 'work', 'other_fast', 'other_slow'], columns=['cluster_'+str(i) for i in range(16)])

In [None]:
# clust_num = 0
clust_num += 1
print(clust_num)
location_speed = 'mud_slow'#other_fast'
location = 'mud'#other_fast'
weekday = 'weekend'
fit_data = pd.read_csv(s3_bucket_folder_address+'cluster_'+str(clust_num)+location_speed+'_'+weekday+'_sed_8_aicbic.csv', index_col=0)
plt.figure()
plt.plot(fit_data['num_components'], fit_data['aic'])
plt.show()

In [None]:
results.loc[location, 'cluster_'+str(clust_num)] = 8

In [None]:
results.to_csv(s3_bucket_folder_address+'weekend_ncomp_dict.csv')