In [1]:
import collections
import datetime
import logging
import sys
import traceback
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tslearn.clustering import TimeSeriesKMeans



In [2]:
# Module-level logger; the helper functions use it to report tracebacks.
LOG = logging.getLogger(__name__)
# Root logger threshold set to INFO so logging.info(...) calls are not filtered out.
logging.getLogger().setLevel(logging.INFO)

# Error log file shared by every helper's except-branch.
# Line-buffered (buffering=1): the handle is never closed explicitly and the
# helpers call sys.exit(1) right after writing, so each message must reach the
# file as soon as its trailing newline is written.
logf = open("error.log", "w", buffering=1)

In [3]:
def data_load(filepath):
    '''
    DESCRIPTION:
    Load data from the csv into a dataframe. Only 3 columns are kept
    for the initial understanding: entry_date, rounded_time, price.
    entry_date is converted with pd.to_datetime.

    @Input:
        filepath: file containing electricity usage

    @Output: dataframe containing all entries of the input file for the 3 columns

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        logging.info('Loading data from file')

        # .copy() detaches the 3-column subset from the frame returned by
        # read_csv, so the column assignment below is an ordinary write on an
        # independent frame (no SettingWithCopyWarning).
        data = pd.read_csv(filepath)[['entry_date', 'rounded_time', 'price']].copy()
        data['entry_date'] = pd.to_datetime(data['entry_date'])

        return data

    except Exception as e:
        logf.write("Failed to load the data from {0}: {1}\n".format(str(filepath), str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

In [4]:
def normalize_df(data):
    '''
    DESCRIPTION:
    Bring all prices of the given data to the same scale by dividing every
    row by that row's maximum value. Cells that become NaN (e.g. 0 / 0 for a
    row whose maximum is 0) are replaced by 0.

    @Input:
        data: dataframe

    @Output: dataframe of the same shape holding the normalized prices

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        # Compute the per-row maximum once instead of once per column.
        row_max = data.max(axis=1)
        # axis=0 aligns row_max with the row index, i.e. divides row-wise.
        return data.div(row_max, axis=0).fillna(0)

    except Exception as e:
        logf.write("Failed to normalized the data {0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

In [5]:
def data_preprocessing(data, start_date, end_date):
    '''
    DESCRIPTION:
    Convert the data to pivot-table format so it can be handled as time series:
    one row per entry_date, one column per rounded_time, cells holding price
    (missing cells filled with 0). Only rows with
    start_date <= entry_date < end_date are used.

    @Input:
        data: dataframe containing the complete data
        start_date, end_date: date range of the data (end_date is exclusive)

    @Output: tuple (pivoted data, normalized data rounded to 3 decimals),
             in that order

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        # Selecting the range, e.g. 6 months
        in_range = (data['entry_date'] >= start_date) & (data['entry_date'] < end_date)
        ranged_data = data.loc[in_range]
        pivoted = ranged_data.pivot_table(index="entry_date", columns="rounded_time",
                                          values="price", fill_value=0)

        # Normalizing the data. The column labels are rounded_time values, so
        # the difference() call only excludes a column literally labelled
        # "rounded_time"; kept as a defensive guard.
        value_columns = pivoted.columns.difference(["rounded_time"])
        data_norm = normalize_df(pivoted.loc[:, value_columns]).round(3)
        return pivoted, data_norm

    except Exception as e:
        logf.write("Failed in pre-processing the data {0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

In [6]:
def plot_cluster_centroids(data, clust, lw=4, alpha=0.6):
    '''
    DESCRIPTION:
    Plot the cluster centroids (the mean of the rows of each cluster) as one
    line per cluster on the current matplotlib axes.

    @Input:
        data: dataframe containing the data that was clustered
        clust: cluster label for every row of data
        lw: line width of the centroid lines
        alpha: opacity of the centroid lines

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        fontsize = 15
        # One row per cluster label; transposed so each cluster becomes a line.
        centroids = data.assign(clust=clust).groupby("clust").mean()
        centroids.T.plot(ax=plt.gca(), lw=lw, alpha=alpha)

        plt.title("Cluster Centroids", fontsize=fontsize+5)
        plt.yticks(fontsize=fontsize)
        plt.legend(title="Cluster centroids:", loc="upper left")
        plt.grid()

    except Exception as e:
        logf.write("Failed in plotting cluster centroid of the the data {0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

In [7]:
def plot_clustered_profiles(df, clust, n_cols=3, alpha=0.2):
    '''
    DESCRIPTION:
    Plot one subplot per cluster (member profiles in red, cluster mean in
    black) in decreasing order of cluster size percentage.

    @Input:
        df: dataframe with one profile per row
        clust: cluster label for every row of df
        n_cols: number of subplot columns
        alpha: opacity of the individual profiles

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        # Cluster sizes in percent; value_counts sorts them in decreasing order.
        clust_perc = 100 * clust.value_counts(normalize=True)

        # add_subplot needs integer grid dimensions; np.ceil returns a float.
        n_rows = int(np.ceil(clust.nunique() / n_cols))
        fontsize = 15
        fig = plt.figure(figsize=[15, n_rows*4])

        for i, clust_n in enumerate(clust_perc.index):
            ax = fig.add_subplot(n_rows, n_cols, i+1)
            df_plot = df[clust == clust_n]

            step = 10 if df_plot.shape[0] > 500 else 1  # plot less profiles

            ax.plot(df_plot.iloc[::step].T.values, alpha=alpha, color="red")
            df_plot.mean().plot(ax=ax, alpha=1, color="k", legend=False)

            ax.set_title("clust: {}, perc: {:.1f}%".format(clust_n, clust_perc.loc[clust_n]),
                         fontsize=fontsize+5)
            ax.tick_params(axis="y", labelsize=fontsize)
            ax.grid()

        plt.tight_layout()

    except Exception as e:
        logf.write("Failed in ploting clustered data{0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

In [15]:
def train(X, n_clusters=6, max_iter=10, random_state=20):
    '''
    DESCRIPTION: Apply TimeSeriesKMeans (metric "softdtw") to the given data.

    @Input:
        X: dataframe to be clustered, one time series per row
        n_clusters: number of clusters to form
        max_iter: maximum number of iterations passed to TimeSeriesKMeans
        random_state: seed passed to TimeSeriesKMeans

    @Output: pandas Series holding the cluster label of every row, indexed like X

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        logging.info('Clustering data')

        model = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw",
                                 max_iter=max_iter, random_state=random_state).fit(X)
        # Re-attach the original row index so labels can be aligned with X.
        clusters = pd.Series(model.labels_, index=X.index)
        return clusters

    except Exception as e:
        logf.write("Failed in clustering data{0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)

### Approach

1. In every iteration, 6 months of data is considered, selected by a start date and an end date.

2. After preprocessing, this 6-month window is divided into 6 clusters (as specified in the problem statement).

3. Clustering is repeated for 26 iterations, each time shifting the window of the previous iteration by one week. To assess stability, each cluster has to be matched with its counterpart in the next iteration.
    * **To decide which cluster in the current iteration corresponds to which cluster in the previous iteration, I used the size of the clusters in adjacent iterations: the cluster with a given size rank in $iteration1$ is treated as the same cluster as the one with the same size rank in $iteration2$**. Clusters in every iteration are sorted by size, so the largest cluster of $iteration1$ is mapped to the largest cluster of $iteration2$, the second largest cluster of $iteration1$ to the second largest cluster of $iteration2$, and so on.
    * This is maintained in a dictionary (named `mapping`) as a list of lists. The key of this dictionary indicates the size rank, i.e. key = 0 holds the cluster number and centroid of the largest cluster in every iteration, and key = 1 holds the cluster number and centroid of the second largest cluster in every iteration.
    
    **Example**: 
    
    mapping = {"0" : [[$iteration1$ largest size cluster number, centroid], [$iteration2$ largest size cluster number, centroid], ..., [$iteration26$ largest size cluster number, centroid]],  
    
    "1": [[$iteration1$ second largest size cluster number, centroid ],[$iteration2$ second largest size cluster number, centroid ].....[$iteration26$ second largest size cluster number, centroid]], 
    ..... }
    
   <br>     
    * After at least one shift has been completed, the stability/movement is measured as the **Euclidean distance** between the cluster centroids of adjacent iterations that share the same size rank (i.e. the same key of the "mapping" dictionary). These values are accumulated in another dictionary whose key again represents the size rank of the cluster (0 means largest size). The accumulated distance is the total movement of a matched cluster after the window has been shifted by $n$ weeks from the starting date.


4. Now we have the accumulated distance for all 6 clusters, keyed by size rank. To map these distances to cluster numbers, for every cluster I took its size rank in the first iteration and looked up the distance stored under that rank in the "movement" dictionary.


5. The output dictionary holds the movement of every cluster from $iteration1$ to $iteration26$. The larger the value for a cluster, the more that cluster's structure changed over the 26 iterations.


In [30]:
def rank(data, iteration=26, window_months=6):
    '''
    DESCRIPTION: Create the mapping between cluster size rank across
    iterations and the accumulated centroid movement per size rank.

    Each iteration shifts the start date by one more week from the date of the
    first row, clusters the following window_months of data, orders the
    clusters by decreasing size and appends [cluster number, centroid] to the
    history of the matching size rank. From the second entry of a rank
    onwards, the euclidean distance between its two latest centroids is added
    to that rank's movement.

    @Input:
        data: complete data (needs the columns used by data_preprocessing)
        iteration: number of one-week shifts to evaluate
        window_months: length of every clustered window in months

    @Output: tuple (mapping, movement)
        mapping: dict, size rank -> list of [cluster number, centroid], one per iteration
        movement: dict, size rank -> summed distance between adjacent centroids

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    mapping = collections.defaultdict(list)  # size rank -> [[cluster number, centroid], ...]
    movement = collections.defaultdict(float)  # size rank -> accumulated centroid distance

    try:
        # Date of the first entry in the data file. iloc is positional, so it
        # does not depend on the frame having a label 0 in its index.
        first_date = data['entry_date'].iloc[0]

        for itrt in range(iteration):

            # shifting the window by one week per iteration
            start_date = first_date + datetime.timedelta(days=7*itrt)
            # End of the window. The result is compared with the datetime
            # column directly; no string round-trip is needed.
            end_date = start_date + pd.tseries.offsets.DateOffset(months=window_months)

            # `pivoted` is only needed for the optional plots:
            # plot_clustered_profiles(pivoted, clusters) and
            # plot_cluster_centroids(preprocessed_data, clusters).
            pivoted, preprocessed_data = data_preprocessing(data, start_date, end_date)

            # creating clusters for the current window
            clusters = train(preprocessed_data)

            # Cluster numbers with their percentage size, in decreasing order of size.
            clust_perc = 100 * clusters.value_counts(normalize=True)

            # Centroid of every cluster: the mean time series of its members,
            # indexed by cluster number.
            centroid_list = preprocessed_data.assign(clusters=clusters).groupby("clusters").mean()

            for index, clust_num in enumerate(clust_perc.index.values):
                # Label-based lookup: the groupby index holds cluster numbers,
                # which need not coincide with row positions.
                centroid = centroid_list.loc[clust_num]
                history = mapping[index]  # index 0 = largest cluster, 1 = second largest, ...
                history.append([clust_num, centroid])

                # A distance needs two centroids for this rank; checking the
                # history length also covers a rank that first appears after
                # the first iteration.
                if len(history) > 1:
                    prev_centroid = history[-2][1]
                    cur_centroid = history[-1][1]
                    movement[index] += np.linalg.norm(cur_centroid - prev_centroid)

        return mapping, movement

    except Exception as e:
        logf.write("Failed calculating stability of data{0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)


        

In [39]:
def movement(mapping, movement):
    '''
    DESCRIPTION:
    Map the complete movement/shift value accumulated across the iterations
    to its corresponding cluster number.

    @Input:
        mapping: dict, size rank -> list of [cluster number, centroid]; the
                 first entry of every list gives the cluster number used as
                 the output key
        movement: dict, size rank -> accumulated movement

    @Output: dict, cluster number -> accumulated movement (0.0 when the rank
             has no movement entry)

    On any failure the error is written to the error log file, the traceback
    is logged and sys.exit(1) is called.
    '''
    try:
        output = {}  # key = cluster number, value = complete movement over all iterations
        for key, history in mapping.items():
            first_cluster_num = history[0][0]
            # Look up by the rank key itself, not by enumeration position, so
            # the result does not depend on key insertion order.
            output[first_cluster_num] = movement.get(key, 0.0)

        return output

    except Exception as e:
        logf.write("Failed to create mapping for cluster number and movement{0}\n".format(str(e)))
        LOG.error(traceback.format_exc())
        sys.exit(1)


In [42]:
# Load the input csv; the path is relative to the notebook's working directory.
data = data_load('Downloads/dataset_dollarhide.csv')
# rank() returns the per-size-rank cluster history and the accumulated movement per size rank.
mapping,distance = rank(data)
# Re-key the accumulated movement by the cluster number taken from the first entry of each rank's history.
output = movement(mapping,distance)
print(output)

{0: 0.9347173669873775, 4: 30.3392098804596, 2: 132.09642852564184, 1: 98.06706371677495, 3: 169.27808595196987, 5: 0.0}


## Plots

#### Clustered data for one iteration

<img src="/clustered_data.png">

#### Centroid for one iteration

<img src="/centroid.png">

**Other approaches I tried for tracking clusters across iterations**:
    
   **Approach 1**: In every iteration, store the centroids and calculate the difference between every cluster of $iteration1$ and every cluster of $iteration2$; however, this mapped all clusters to the same cluster of the adjacent iteration.
    
   **Approach 2**: Train a KMeans model on one iteration and store it. Train a model on the next iteration, then identify and store its centroids. Use these centroids with the model of the previous adjacent iteration to predict the cluster number. This has the same problem as approach 1.
    
    
    

**Future Enhancement:**

1. Use multiprocessing to run a separate process for each 6-month window, based on the number of CPU/GPU cores.
2. Use an LSTM to better understand the time-series data.