In [None]:
import pandas as pd
import numpy as np
import time
import datetime
import math
import copy

%matplotlib inline 
#now plot will be displayed in notebook
import matplotlib.pyplot as plt
import os

# Marker transparency passed as `alpha=a` by the scatter-plot helpers in this notebook.
a=0.05
# Presumably a matplotlib colour-map name; it is not referenced anywhere else in the visible notebook.
colour_map='tab20'

In [None]:
# Length, in days, of one counting window (a "week"); read by get_week_boundaries.
cluster_per_day = 7
# Path handed to load_data (pd.read_json) and path handed to DataFrame.to_csv at the end of the run.
input_data_path = 'input'
save_data_path = 'output'

# Scoring configuration, consumed by average_weekly_score.
#   cat_   = number of messages in the window whose encoded category equals category_to_score
#   other_ = number of remaining messages in the window; total = cat_ + other_
# Intended methods: 0: cat_, 1: other_, 2: total, 3: other_ - cat_, 4: other_/(cat_+1),
#                   5: cat_/(other_+1), 6: cat_/total (the code also accepts 7: other_/total)
# Categories are the integer codes produced by factorize() on the 'Colour' column, e.g.
# 0:'fromAVF', 1:'Normal', 2:'NC', 3:'Other', 4:'STOP' - the codes depend on the data, so
# inspect encoding['Colour'] (displayed further down the notebook) to confirm them.
category_to_score = 0
method_to_score = 2

### Load & adjust Features

In [None]:
def edit_location_scheme(dataFrame):
    """Collapse the geographic label schemes of the 'Labels' column into 'Location'.

    Each cell of 'Labels' is expected to be a list of dicts carrying a 'Scheme'
    key (or NaN). When the first dict's scheme is one of the Somalia/Mogadishu
    schemes, every dict of that cell gets 'Scheme' set to 'Location'. Cells are
    deep-copied first, so the original label objects are never modified.

    The column is overwritten on ``dataFrame`` itself, which is also returned.
    """
    geographic_schemes = (
        'Mogadishu Sub-District',
        'Somalia District',
        'Somalia Region',
        'Somalia State',
        'Somalia Zone',
    )

    def _relabel(entry):
        # NaN is the only value that does not compare equal to itself: pass it through.
        if not (entry == entry):
            return entry
        relabelled = copy.deepcopy(entry)
        if relabelled[0]['Scheme'] in geographic_schemes:
            for item in relabelled:
                item['Scheme'] = 'Location'
        return relabelled

    dataFrame.loc[:, 'Labels'] = dataFrame.loc[:, 'Labels'].apply(_relabel)
    return dataFrame

def get_Message_number(dataFrame, name):
    """Collect column ``name`` from every participant's detail frame into one Series.

    For each distinct 'ParticipantUUID', get_participant_details is called and
    its ``name`` column is kept. The pieces are concatenated in a single
    pd.concat call (Series.append was removed in pandas 2.0 and grows
    quadratically when used in a loop) and sorted by index so the result lines
    up with the rows of ``dataFrame``.

    Returns an empty integer Series when ``dataFrame`` has no participants.
    """
    participants = dataFrame.loc[:, 'ParticipantUUID'].value_counts().keys()
    pieces = [get_participant_details(dataFrame, p).loc[:, name] for p in participants]
    if not pieces:
        # pd.concat refuses an empty list; mirror the original empty result.
        return pd.Series([], dtype=int)
    return pd.concat(pieces).sort_index()

def get_week_boundaries(dataSeries, days=None):
    """Return the boundaries that split a series of timestamps into windows.

    Parameters
    ----------
    dataSeries : Series of timestamp values (must support min/max and
        addition of a datetime.timedelta).
    days : window length in days. Defaults to the notebook-level
        ``cluster_per_day`` setting when omitted.

    Returns
    -------
    list
        The earliest timestamp, then one boundary every ``days`` days while
        that boundary is strictly before the latest timestamp, and finally the
        latest timestamp itself. A series with a single distinct timestamp
        yields a one-element list.

    Raises
    ------
    ValueError
        If ``days`` is not positive (the stepping loop would never terminate).
    """
    if days is None:
        days = cluster_per_day
    if days <= 0:
        raise ValueError('days must be a positive number of days')
    step = datetime.timedelta(days=days)
    d_max = dataSeries.max()
    d = dataSeries.min()
    week_bounds = []
    while d < d_max:
        week_bounds.append(d)
        d += step
    # The last window is closed by the latest timestamp, so it may be shorter than `days`.
    week_bounds.append(d_max)
    return week_bounds

def add_weeks(dataSeries):
    """Map every timestamp of ``dataSeries`` to its 1-based window ("week") number.

    Window edges come from get_week_boundaries. A timestamp belongs to the
    first window whose upper edge is greater than or equal to it; the very
    first boundary is explicitly assigned to window 1 rather than forming a
    window of its own.
    """
    bounds = get_week_boundaries(dataSeries)
    upper_edges = bounds[1:]

    def get_weeks(timeStamp):
        for number, edge in enumerate(upper_edges, start=1):
            if timeStamp == bounds[0] or timeStamp <= edge:
                return number
        # Fallback for a value that matched no window: the last window number
        # (0 when there is only a single boundary).
        return len(upper_edges)

    return dataSeries.apply(get_weeks)

def add_labels(dataFrame, label_choice):
    """Extract one field of the 'Labels' dicts into a new Series, row by row.

    Rows are addressed as ``dataFrame.loc[i, ...]`` for i in 0..n-1, so the
    frame needs an index containing those labels. Rows whose 'Direction' is
    'fromAVF' yield the string 'fromAVF'. For the remaining rows:

    * ``label_choice == 'Scheme'`` yields the scheme of the first label only
      (the notebook treats all labels of one message as sharing a scheme);
    * any other choice yields a list with that field from every label.
    """
    def _entry(row):
        if dataFrame.loc[row, 'Direction'] == 'fromAVF':
            return 'fromAVF'
        # Otherwise the message came from a participant.
        labels = dataFrame.loc[row, 'Labels']
        if label_choice == 'Scheme':
            return labels[0][label_choice]
        return [labels[position][label_choice] for position in range(len(labels))]

    return pd.Series([_entry(row) for row in range(dataFrame.shape[0])])

In [None]:
def set_up_features(dataframe):
    """Derive the analysis columns from the raw message frame.

    Steps, in order: drop 'Text'; rename the values 'in'/'out' to
    'fromParticipant'/'fromAVF' wherever they occur; normalise location
    schemes; factorise 'TextTranslation' into 'questionType' (the unique
    values are stored in the notebook-level ``encoding['Questions']``, so
    ``encoding`` must already exist); add 'Week', 'SMSPerParticipant',
    'Scheme', 'LabelType' and 'Label'; finally drop the raw 'Labels' column.

    Returns the new frame; ``dataframe`` itself is not modified.
    """
    features = dataframe.drop('Text', axis=1)
    features = features.replace('in', 'fromParticipant')
    features = features.replace('out', 'fromAVF')
    features = edit_location_scheme(features)

    question_codes, question_values = features.loc[:, 'TextTranslation'].factorize()
    encoding['Questions'] = question_values
    features['questionType'] = question_codes

    features['Week'] = add_weeks(features.loc[:, 'Timestamp'])
    features['SMSPerParticipant'] = get_Message_number(features, 'SMSPerParticipant')
    for field in ('Scheme', 'LabelType', 'Label'):
        features[field] = add_labels(features, field)

    return features.drop('Labels', axis=1)

## Graph Plotting Methods

In [None]:
def plot_labels_scheme(data, xs, ys, colours):
    """Scatter column ``ys`` against column ``xs`` on one axes, one colour per category.

    ``colours`` maps each value of the 'Colour' column to a matplotlib colour.
    Every category is printed as it is drawn. Y tick labels are hidden and
    tick labels are rotated by 90 degrees.
    """
    x_values = data[xs].array
    y_values = data[ys].array
    categories = np.array(data.loc[:, 'Colour'])
    fig, ax = plt.subplots(figsize=(24, 12))
    for category, shade in colours.items():
        rows = np.where(categories == category)
        print(category)
        ax.scatter(
            x_values[rows],
            y_values[rows],
            c=np.array([shade]),
            label=category,
            marker="o",
            alpha=a,
        )
    ax.tick_params(labelrotation=90)
    ax.legend()
    ax.set_yticklabels([])
    plt.xlabel(xs)
    plt.ylabel(ys)

    plt.show()
    
def plot_labels_scheme_only_participants(data, xs, ys, colours):
    """Same single-axes scatter as plot_labels_scheme, but skipping 'fromAVF'.

    Every key of ``colours`` is still printed; only the points whose 'Colour'
    is 'fromAVF' are left off the plot and the legend.
    """
    x_values = data[xs].array
    y_values = data[ys].array
    categories = np.array(data.loc[:, 'Colour'])
    fig, ax = plt.subplots(figsize=(24, 12))
    for category, shade in colours.items():
        rows = np.where(categories == category)
        print(category)
        if category == 'fromAVF':
            continue
        ax.scatter(
            x_values[rows],
            y_values[rows],
            c=np.array([shade]),
            label=category,
            marker="o",
            alpha=a,
        )
    ax.tick_params(labelrotation=90)
    ax.legend()
    ax.set_yticklabels([])
    plt.xlabel(xs)
    plt.ylabel(ys)

    plt.show()

def plot_labels_scheme_sep(data, xs, ys, colours):
    """Scatter ``ys`` against ``xs`` with one stacked panel per category.

    ``colours`` maps each value of the 'Colour' column to a matplotlib colour;
    one subplot row is created per key, in key order, and each category is
    printed as it is drawn. The x/y axis labels are applied through pyplot and
    therefore land on the current (last created) panel only.
    """
    X = data[xs]
    y = data[ys]
    c = np.array(data.loc[:,'Colour'])
    n_panels = len(colours.keys())
    # squeeze=False always returns a 2-D array of axes; without it a single
    # category yields a bare Axes object and indexing it would fail.
    fig, axes = plt.subplots(n_panels, 1, figsize=(24, 6*n_panels), squeeze=False)
    for panel, l in zip(axes[:, 0], colours.keys()):
        ix = np.where(c == l)
        print(l)
        panel.scatter(X.array[ix], y.array[ix], c=np.array([colours[l]]), label = l, marker = "o", alpha=a)
        panel.tick_params(labelrotation=90)
        panel.legend()
        panel.set_yticklabels([])
    plt.xlabel(xs)
    plt.ylabel(ys)

    plt.show()
    
def plot_no_colour(data, xs, ys):
    """Scatter column ``ys`` against column ``xs`` in a single default colour.

    Y tick labels are hidden and tick labels are rotated by 90 degrees.
    """
    fig, ax = plt.subplots(figsize=(24, 12))
    x_values = data[xs].array
    y_values = data[ys].array
    ax.scatter(x_values, y_values, marker="o", alpha=a)
    ax.tick_params(labelrotation=90)
    ax.set_yticklabels([])
    plt.xlabel(xs)
    plt.ylabel(ys)

    plt.show()
    

### Useful tools

In [None]:
def select(data, feature, value):
    """Return an independent copy of the rows of ``data`` where ``feature`` equals ``value``."""
    matches = data.loc[:, feature] == value
    subset = data.loc[matches, :]
    return subset.copy()

def get_participant_details(data, p):
    """Return participant ``p``'s messages with a chronological message counter.

    The rows whose 'ParticipantUUID' equals ``p`` are copied and a
    'SMSPerParticipant' column is added holding each message's 0-based position
    in 'Timestamp' order (ties keep their row order). Row order and index of
    the returned frame are those of ``data``.

    The previous implementation read ``.index`` from
    ``sort_values(..., ignore_index=True)``, which is always 0..n-1, so the
    counter followed the row order rather than the timestamps.
    """
    p_messages = select(data, 'ParticipantUUID', p)
    n_messages = len(p_messages)
    # chronological[k] is the row position of the k-th earliest message ...
    chronological = np.argsort(p_messages['Timestamp'].to_numpy(), kind='stable')
    # ... inverted here so that each row receives its own rank.
    sequence = np.empty(n_messages, dtype=np.int64)
    sequence[chronological] = np.arange(n_messages)
    p_messages['SMSPerParticipant'] = sequence
    return p_messages

def get_paticient_seq(dataFrame, column):
    """Build, per participant, the weekly lists of encoded ``column`` values.

    ``column`` is factorised and the integer codes are written to a new
    ``'enc' + column`` column on ``dataFrame`` (the frame is modified in
    place). For every participant and every week number 1..len(boundaries)-1
    the codes of that participant's rows with a matching 'Week' are collected.

    Returns
    -------
    (seq, encoding)
        ``seq`` maps participant id -> list (one entry per week) of code lists;
        ``encoding`` maps ``column`` -> the unique values, indexed by code.
    """
    codes, unique_values = dataFrame.loc[:, column].factorize()
    encoding = {column: unique_values}
    encoded_column = 'enc' + column
    dataFrame[encoded_column] = codes

    n_boundaries = len(get_week_boundaries(dataFrame.loc[:, 'Timestamp']))
    seq = {}
    for p in dataFrame.loc[:, 'ParticipantUUID'].value_counts().keys():
        p_df = get_participant_details(dataFrame, p)
        seq[p] = [
            list(select(p_df, 'Week', week).loc[:, encoded_column].values)
            for week in range(1, n_boundaries)
        ]
    return seq, encoding

def join_participant_seq(weekly_seq):
    """Flatten each participant's per-week code lists into one list of ints.

    ``weekly_seq`` maps participant id -> list of weekly lists; the result maps
    the same ids to a single list in week order, each item converted with int().
    """
    return {
        participant: [int(code) for week in weeks for code in week]
        for participant, weeks in weekly_seq.items()
    }

### Scoring

In [None]:
def average_weekly_score(dataFrame, seq, category, method):
    """Score every participant once per week from their weekly code lists.

    Parameters
    ----------
    dataFrame : frame whose 'Timestamp' column determines the number of weeks
        (via get_week_boundaries).
    seq : dict mapping participant id -> list of weekly lists of category
        codes, as produced by get_paticient_seq.
    category : the category code counted as ``cat_``.
    method : which score to compute for a non-empty week, with
        ``other_ = total - cat_``:
        0: cat_, 1: other_, 2: total, 3: other_ - cat_, 4: other_/(cat_+1),
        5: cat_/(other_+1), 6: cat_/total, 7: other_/total.

    Returns
    -------
    dict mapping participant id -> list with one score per week. A week with
    no messages always scores 0.

    Raises
    ------
    ValueError
        If ``method`` is not one of the values above. This is checked up front,
        so it is raised even when every week is empty.

    Notes
    -----
    Methods 6 and 7 previously sat behind an inverted ``if not total`` guard
    and therefore always produced ``total``; they now produce the ratios.
    """
    formulas = {
        0: lambda cat_, other_, total: cat_,
        1: lambda cat_, other_, total: other_,
        2: lambda cat_, other_, total: total,
        3: lambda cat_, other_, total: other_ - cat_,
        4: lambda cat_, other_, total: other_ / (cat_ + 1),
        5: lambda cat_, other_, total: cat_ / (other_ + 1),
        # total is never 0 here: empty weeks are handled before a formula is applied.
        6: lambda cat_, other_, total: cat_ / total,
        7: lambda cat_, other_, total: other_ / total,
    }
    if method not in formulas:
        raise ValueError('That is not a valid scoring method')
    formula = formulas[method]

    # Boundaries delimit the weeks, so there is one week fewer than boundaries.
    n_weeks = len(get_week_boundaries(dataFrame.loc[:,'Timestamp'])) - 1
    scores = {}
    for p, weekly in seq.items():
        score = []
        for index in range(n_weeks):
            messages = weekly[index]
            total = len(messages)
            if total == 0:
                score.append(0)
                continue
            cat_ = messages.count(category)
            score.append(formula(cat_, total - cat_, total))
        scores[p] = score
    return scores

### Distance matrix

In [None]:
### ordered list from distance matrix
from scipy.spatial.distance import euclidean, pdist, squareform

def similarity(u, v):
    """Return the Euclidean distance between the vectors ``u`` and ``v``.

    Despite the name this is a distance: larger values mean less similar.
    """
    distance = euclidean(u, v)
    return distance

def distance_matrix(dataFrame, scores):
    """Build the participant-by-participant Euclidean distance matrix.

    Parameters
    ----------
    dataFrame : unused; kept so existing calls keep working.
    scores : dict mapping participant id -> list of weekly scores (all lists
        the same length), as returned by average_weekly_score.

    Returns
    -------
    DataFrame indexed and labelled by participant id holding the pairwise
    Euclidean distances between the score vectors. The diagonal is set to
    infinity so that a participant is never its own nearest neighbour.

    The diagonal is filled on the NumPy array before the frame is built;
    writing through ``DataFrame.to_numpy()`` is not guaranteed to reach the
    frame. The built-in 'euclidean' metric replaces the per-pair Python
    callback (same distances, computed in compiled code).
    """
    per_participant = pd.DataFrame.from_dict(scores).T
    distances = squareform(pdist(per_participant.to_numpy(dtype=float), metric='euclidean'))
    np.fill_diagonal(distances, np.inf)
    return pd.DataFrame(distances, columns=per_participant.index, index=per_participant.index)

### Ordering

In [None]:
def reorder_participant(dataFrame, ordering):
    """Add an 'Order' column giving each row's participant position in ``ordering``.

    ``ordering`` is a sequence of participant ids; a participant's order is its
    index in that sequence (the last occurrence wins if an id is repeated).
    A 'ParticipantUUID' missing from ``ordering`` raises KeyError. The column
    is added to ``dataFrame`` itself, which is also returned.
    """
    position_of = {participant: position for position, participant in enumerate(ordering)}
    dataFrame['Order'] = dataFrame.loc[:, 'ParticipantUUID'].apply(lambda pid: position_of[pid])
    return dataFrame

#### Naïve minimum ordering

In [None]:
def get_min_order(dist_df):
    """Greedy nearest-neighbour ordering of the labels of a square distance frame.

    Starts from the first column label. At each step the current label's row
    is removed and the next label is the remaining row with the smallest value
    in the current label's column. Returns the visited labels in order.
    """
    current = dist_df.keys()[0]
    visited = [current]
    remaining = dist_df
    while remaining.shape[0] != 1:
        remaining = remaining.drop(current)
        current = remaining.loc[:, current].idxmin()
        visited.append(current)
    return visited

#### Hierarchical Clustering

In [None]:
class Participant:
    """A node of the clustering tree: an id plus weighted links to neighbouring nodes."""

    def __init__(self, participant):
        self.id = participant
        # Maps neighbouring Participant objects to the limb length of the connecting edge.
        self.neighbors={}

    def __str__(self):
        # Formatting through an f-string also works for ids that are not strings.
        return f"{self.id} neighbours: {[x.id for x in self.neighbors]}"

    def add_neighbor(self, neighbor, weight=0):
        """Link ``neighbor`` to this node with limb length ``weight`` (default 0)."""
        self.neighbors[neighbor]=weight

    def get_weight(self, neighbor):
        """Return the limb length to ``neighbor``, or None when it is not a neighbour."""
        return self.neighbors.get(neighbor)

    def get_id(self):
        """Return this node's id."""
        return self.id

    def get_neighbors(self):
        """Return a view of the neighbouring Participant objects.

        The original body evaluated the keys but never returned them, so
        callers always received None.
        """
        return self.neighbors.keys()
                   
class Tree:
    """An undirected weighted graph of Participant nodes, keyed by participant id."""

    def __init__(self):
        # Maps participant id -> Participant node, in insertion order.
        self.participants = {}

    def __iter__(self):
        return iter(self.participants.values())

    def add_participant(self, p):
        """Create a fresh node for id ``p``, replacing any existing node with that id."""
        new_participant=Participant(p)
        self.participants[p]=new_participant
        return new_participant

    def add_limb(self, p1, p2, cost):
        """Connect ``p1`` and ``p2`` in both directions with limb length ``cost``.

        Nodes that do not exist yet are created first.
        """
        if p1 not in self.participants:
            self.add_participant(p1)
        if p2 not in self.participants:
            self.add_participant(p2)
        self.participants[p1].add_neighbor(self.participants[p2], cost)
        self.participants[p2].add_neighbor(self.participants[p1], cost)

    def get_participant(self, p):
        """Return the node for id ``p``, or None when it is unknown."""
        return self.participants.get(p)

    def get_all_participants(self):
        """Return the list of all participant ids, in insertion order."""
        return list(self.participants.keys())

    def get_limb(self, p1, p2):
        """Return the limb length between ``p1`` and ``p2``.

        Returns None when either id is unknown or the two nodes are not
        connected. The original condition ``p1 and p2 in ...`` only tested
        ``p2`` for membership, and the weight was computed but never returned.
        """
        if p1 in self.participants and p2 in self.participants:
            return self.participants[p1].get_weight(self.participants[p2])
        return None

In [None]:
def totalDistance(S):
    """Return, per column of the square frame ``S``, the sum of its off-diagonal entries.

    The diagonal is zeroed on a private NumPy copy before summing, so ``S`` is
    left untouched and infinite diagonal entries do not contaminate the totals.
    (Writing through ``DataFrame.to_numpy()`` is not guaranteed to reach the
    frame, hence the explicit copy.)

    Returns a Series indexed by the column labels of ``S``.
    """
    values = S.to_numpy(copy=True)
    np.fill_diagonal(values, 0)
    return pd.DataFrame(values, index=S.index, columns=S.columns).sum()

def get_D_star(D, n):
    """Compute the neighbour-joining selection matrix for distance frame ``D``.

    Every entry becomes ``(n - 2) * D[i, j] - total(i) - total(j)`` where
    ``total`` is the per-label sum from totalDistance; the diagonal is then
    forced to infinity so a label is never paired with itself.

    Parameters
    ----------
    D : square distance frame whose row and column labels match.
    n : number of labels currently in ``D``.

    Returns a new frame; ``D`` is not modified.
    """
    totalDistances=totalDistance(D)
    D_star = D.copy()
    D_star *= (n-2)
    for i in D:
        # Subtract label i's total from its whole row and its whole column.
        D_star.loc[i,:] -= totalDistances.loc[i]
        D_star.loc[:,i] -= totalDistances.loc[i]
    # Fill the diagonal on an explicit copy: writing through to_numpy() is not
    # guaranteed to reach the frame.
    values = D_star.to_numpy(copy=True)
    np.fill_diagonal(values, np.inf)
    return pd.DataFrame(values, index=D_star.index, columns=D_star.columns)

def get_minimum_node(D):
    """Return the labels ``(first, second)`` locating the smallest entry of ``D``.

    ``first`` is the column whose minimum is smallest; ``second`` is the
    position of the minimum within the row labelled ``first`` (the frame is
    expected to be symmetric, so that row mirrors the column).
    """
    column_minima = D.min()
    first = column_minima.idxmin()
    second = D.loc[first, :].idxmin()
    return first, second

In [None]:
def get_tree_order(tree, iterative=True):
    """Return the ids of the tree's leaves in depth-first (pre-order) order.

    Parameters
    ----------
    tree : object with a ``participants`` dict mapping id -> node, where each
        node has an ``id`` and a ``neighbors`` dict keyed by neighbouring nodes.
    iterative : selects the start node only - the last registered participant
        when True, the first one when False.

    Returns
    -------
    list of ids of the nodes that have exactly one neighbour, in the order a
    depth-first walk from the start node reaches them (neighbours are visited
    in insertion order). An empty tree yields an empty list.

    The walk uses an explicit stack instead of recursion, so deep trees do not
    hit Python's recursion limit.
    """
    names = list(tree.participants.keys())
    if not names:
        return []
    start = tree.participants[names[-1] if iterative else names[0]]

    def _leaf_ids(node):
        # A leaf is a node connected to exactly one other node.
        return [node.id] if len(node.neighbors) == 1 else []

    visited = {start}
    order = _leaf_ids(start)
    # Each stack entry is the partially consumed neighbour iterator of a node on
    # the current path; this reproduces the recursive pre-order exactly.
    stack = [iter(start.neighbors)]
    while stack:
        for candidate in stack[-1]:
            if candidate not in visited:
                visited.add(candidate)
                order.extend(_leaf_ids(candidate))
                stack.append(iter(candidate.neighbors))
                break
        else:
            # Every neighbour of the node on top of the stack has been handled.
            stack.pop()
    return order

In [None]:
def neighborJoining_iter(S, n):
    """Build a neighbour-joining tree from a distance frame, iteratively.

    Parameters
    ----------
    S : square distance frame whose row and column labels match. Labels are
        joined with ``+`` and ',' to name merged nodes, so they are expected
        to be strings.
    n : number of labels in ``S`` (the caller passes ``S.shape[0]``).

    Returns
    -------
    Tree
        Contains the original labels plus one internal node per merge (named
        ``'<i>,<j>'``), connected by limbs carrying the computed limb lengths.
        With fewer than two labels the tree holds the labels and no limbs.

    Each round picks the pair (i, j) minimising the get_D_star matrix, replaces
    it by a merged node m whose distance to every other label k is
    ``(D[k, i] + D[k, j] - D[i, j]) / 2``, and queues the limbs i-m and j-m.
    When two labels remain they are joined directly. All queued limbs are added
    to the tree after the loop.
    """
    limbs_to_add=[]
    T = Tree()
    if n < 2:
        # Nothing to join: register whatever labels exist and stop.
        for p in S:
            T.add_participant(p)
        return T
    while True:
        D = S.copy()
        for p in D:
            T.add_participant(p)
        if n==2:
            # Join the two labels still present in D. The tree's most recently
            # registered ids are not necessarily these two: an older internal
            # node may already have been merged away and dropped from D.
            first, second = list(D.columns)[:2]
            print("Made it through!!")
            T.add_limb(first, second, D.loc[first, second])
            break
        D_ = get_D_star(D, n)
        i, j = get_minimum_node(D_)
        totalDistances=totalDistance(D)
        delta = (totalDistances.loc[i]-totalDistances.loc[j])/(n-2)
        limbLength_i=(D.loc[i,j]+delta)/2
        limbLength_j=(D.loc[i,j]-delta)/2
        m=str(i+','+j)
        # Distance from every current label k to the merged node m.
        D_k_m=[]
        for k in D:
            D_k_m.append((D.loc[k,i]+D.loc[k,j]-D.loc[i,j])/2)
        D.loc[m,:] = D_k_m
        # The new column has one extra cell, (m, m): keep the diagonal infinite.
        D_k_m.append(np.inf)
        D.loc[:,m] = D_k_m
        D=D.drop([i,j], axis=0)
        D=D.drop([i,j], axis=1)
        limbs_to_add.append((i,m,limbLength_i))
        limbs_to_add.append((j,m,limbLength_j))
        S = D
        n-=1

    print("Time to add some limbs :)")
    for (v, w, cost) in limbs_to_add:
        T.add_limb(v, w, cost)
    return T

# Code

In [None]:
def load_data(filename):
    """Read the JSON file at ``filename`` into a DataFrame with pandas.read_json."""
    return pd.read_json(os.path.join('', filename))

# Read the raw export from the configured input path.
original = load_data(input_data_path)
# Shuffle the rows reproducibly (fixed seed) and renumber them 0..n-1; the row-wise
# helpers address rows with .loc[i] for i in range(n), so a fresh index is required.
engagement = original.sample(frac=1, random_state=42).reset_index(drop=True)
# Must exist before set_up_features runs: it stores the 'Questions' factorisation here.
encoding={}
engagement = set_up_features(engagement)

In [None]:
# Plot colour for each value of the 'Colour' column; passed as `colours` to the plotting helpers.
colour_cat = {'fromAVF':'orange', 'STOP':'red', 'NC':'gray', 'Normal':'green', 'Meta':'purple', 'Control':'purple', 'Other':'purple'}

In [None]:
# Assign every message one display category, stored in the 'Colour' column:
#   * messages sent by AVF                         -> 'fromAVF'
#   * participant messages with a 'STOP' label     -> 'STOP'
#   * participant messages with an 'NC' label      -> 'NC'
#   * otherwise the label type when it is unambiguous (a single label, a
#     five-label 'Location' message, or all label types identical)
#   * anything else (mixed label types)            -> 'Other'
colours=[]
for row in range(0, engagement.shape[0]):
    if engagement.loc[row, 'Direction'] == 'fromAVF':
        colours.append('fromAVF')
        continue
    # From here on the message came from a participant.
    label=engagement.loc[row, 'Label']
    if 'STOP' in label:
        colours.append('STOP')
        continue
    if 'NC' in label:
        colours.append('NC')
        continue
    labelType=engagement.loc[row, 'LabelType']
    single_or_location = len(labelType) == 1 or (len(labelType) == 5 and engagement.loc[row, 'Scheme'] == 'Location')
    if single_or_location or not any(labelType[0] != entry for entry in labelType):
        colours.append(labelType[0])
    else:
        colours.append('Other') # 2 instances : 1 Meta Normal & 1 Normal Meta

# Store every category as a plain string.
c=pd.Series([str(entry) for entry in colours])
engagement['Colour']=c
# Fold the 'Meta' and 'Control' categories into 'Other'.
engagement['Colour']=engagement['Colour'].replace('Meta','Other').replace('Control','Other')
#engagement['Colour'].value_counts()
#engagement['Colour'].value_counts()

In [None]:
# Disabled plots of the un-ordered data: the calls sit inside a string literal, so they are not executed.
'''plot_labels_scheme(engagement, 'Timestamp', 'ParticipantUUID', colour_cat)
plot_labels_scheme_only_participants(engagement, 'Timestamp', 'ParticipantUUID', colour_cat)
plot_labels_scheme_sep(engagement, 'Timestamp', 'ParticipantUUID', colour_cat)'''

In [None]:
# Work on a copy: get_paticient_seq adds an 'encColour' column to the frame it is given.
df=engagement.copy()

print('Starting...')
start_time = time.time()
# get_paticient_seq returns (sequences, {column: unique values}); the second item is
# stored as-is, so the category values end up at encoding['Colour']['Colour'].
participant_seq, encoding['Colour']=get_paticient_seq(df, 'Colour')
print('Sequencing done : took', time.time() - start_time, 's to run')


In [None]:
# Display the category encoding: check here which integer code each 'Colour' value
# received before choosing category_to_score.
encoding['Colour']

In [None]:
# Full ordering pipeline; each stage prints its wall-clock duration.
# 1. Weekly score per participant, using the configured category and method.
_time = time.time()
scores = average_weekly_score(df, participant_seq, category_to_score, method_to_score)
print(' scores done : took', time.time() - _time, 's to run')
    
# 2. Pairwise distances between the participants' weekly score vectors.
_time = time.time()
dist_mx = distance_matrix(df, scores)
print(' dist mx done : took', time.time() - _time, 's to run')
    
# 3. Neighbour-joining tree built from the distance matrix.
_time = time.time()
tree = neighborJoining_iter(dist_mx, dist_mx.shape[0])
print(' tree done : took', time.time() - _time, 's to run')

# 4. Leaf order of the tree, walking from the first registered participant (iterative=False).
_time = time.time()
tree_orders = get_tree_order(tree, False)
print(' tree ordering done : took', time.time() - _time, 's to run')

print('Done\n')

# 5. Attach the order to every message (this adds 'Order' to df itself) and write the result as CSV.
temp1 = reorder_participant(df, tree_orders)
temp1.to_csv(save_data_path)
print('Saved\n')


# Disabled plots of the re-ordered data: the calls sit inside a string literal, so they are not executed.
'''plot_labels_scheme(temp1, 'Timestamp', 'Order', colour_cat)
plot_labels_scheme_only_participants(temp1, 'Timestamp', 'Order', colour_cat)
plot_labels_scheme_sep(temp1, 'Timestamp', 'Order', colour_cat)'''

In [None]:
# Same disabled plotting calls in a cell of their own; as a string literal they are not executed.
'''plot_labels_scheme(temp1, 'Timestamp', 'Order', colour_cat)
plot_labels_scheme_only_participants(temp1, 'Timestamp', 'Order', colour_cat)
plot_labels_scheme_sep(temp1, 'Timestamp', 'Order', colour_cat)'''