# Imports

In [None]:
import copy
import itertools
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
from sklearn import preprocessing

# Load Dataset

In [None]:
# Load the raw water-treatment records.
# header=None keeps the first line of the file as data: without it pandas
# consumes that line as column labels, which the assignment below then
# overwrites, silently dropping one record.
# NOTE(review): assumes the .data file has no header row (as in the UCI
# distribution) - confirm against the local copy.
# NOTE(review): the UCI distribution is believed to mark missing values with
# '?'; if so, consider na_values='?' so the columns load as numeric - confirm.
original_watertreatment = pd.read_csv(
    'data files/watertreatment-orig/water-treatment.data',
    header=None,
    low_memory=False,
)

# Show floats in fixed-point notation rather than scientific notation.
pd.set_option('display.float_format', lambda value: '%f' % value)

# 'Date' followed by the 38 measurement columns; assigning the labels raises
# if the file does not contain exactly this many columns.
original_watertreatment.columns = [
    'Date', 'Q-E', 'ZN-E', 'PH-E', 'DBO-E', 'DQO-E', 'SS-E', 'SSV-E', 'SED-E',
    'COND-E', 'PH-P', 'DBO-P', 'SS-P', 'SSV-P', 'SED-P', 'COND-P', 'PH-D', 'DBO-D',
    'DQO-D', 'SS-D', 'SSV-D', 'SED-D', 'COND-D', 'PH-S', 'DBO-S', 'DQO-S', 'SS-S',
    'SSV-S', 'SED-S', 'COND-S', 'RD-DBO-P', 'RD-SS-P', 'RD-SED-P', 'RD-DBO-S',
    'RD-DQO-S', 'RD-DBO-G', 'RD-DQO-G', 'RD-SS-G', 'RD-SED-G',
]

In [None]:
# Turn the 'Date' labels into real timestamps and index the frame by them.
# The first two characters of every label are a prefix and are dropped.
# An explicit format is used because day/month labels such as '1/3/90' are
# ambiguous and pandas would otherwise guess month-first.
# NOTE(review): assumes labels look like 'D-<day>/<month>/<2-digit year>' as in
# the UCI distribution - confirm; a mismatch raises ValueError instead of
# silently mis-parsing.
original_watertreatment['Date'] = pd.to_datetime(
    original_watertreatment['Date'].str[2:],
    format='%d/%m/%y',
)
# drop=False keeps 'Date' available as a regular column as well as the index.
original_watertreatment = original_watertreatment.set_index('Date', drop=False)

# Function Definitions

In [None]:
def normalise_dataframe(dataframe):
    """Return a min-max scaled version of ``dataframe``.

    The values are rescaled with scikit-learn's ``MinMaxScaler`` using its
    default settings. The scaler is fed a deep copy, so the caller's frame is
    not modified; the result reuses the input's column labels and index.

    Args:
        dataframe: pandas DataFrame to rescale.

    Returns:
        A new DataFrame holding the scaled values.
    """
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(copy.deepcopy(dataframe))
    return pd.DataFrame(
        scaled_values,
        columns=dataframe.columns,
        index=dataframe.index,
    )

# Algorithm Function Definitions

In [None]:
def kmeans_clustering_initial(k=1):
    """Create the iteration-0 centroids for ``k`` clusters.

    Each centroid is an ``[x, y]`` pair drawn from ``np.random.random()``
    (x first, then y), so the result depends on NumPy's global random state.

    Args:
        k: number of clusters; clusters are labelled 1..k.

    Returns:
        ``{0: {label: [x, y], ...}}`` - the centroid history containing only
        the initial step.
    """
    starting_points = {}
    for label in range(1, k + 1):
        # Draw x before y so the random stream is consumed in a fixed order.
        start_x = np.random.random()
        start_y = np.random.random()
        starting_points[label] = [start_x, start_y]
    return {0: starting_points}

In [None]:
def euclidean_distance(x, cx, y, cy):
    """Return the straight-line distance between ``(x, y)`` and ``(cx, cy)``.

    Only arithmetic operators and ``np.sqrt`` are used, so ``x`` and ``y`` may
    be scalars or array-likes such as DataFrame columns (element-wise result).
    """
    offset_x = x - cx
    offset_y = y - cy
    return np.sqrt(offset_x ** 2 + offset_y ** 2)

In [None]:
def deviation(x, cx, y, cy):
    """Return the signed offset of ``(x, y)`` from ``(cx, cy)``.

    The result is ``(x - cx) + (y - cy)``: the two axis offsets are simply
    added, so offsets of opposite sign cancel each other out.
    """
    offset_x = x - cx
    offset_y = y - cy
    return offset_x + offset_y

In [None]:
def kmeans_clustering_assignment(dataframe, x, y, centroids, k, i):
    """Assign every row to the nearest centroid of iteration ``i``.

    For each cluster ``ik`` in 1..k two columns are added to a copy of
    ``dataframe``: ``distance_from_<ik>`` (Euclidean distance to the centroid)
    and ``deviation_from_<ik>`` (signed offset, see ``deviation``). A
    ``closest`` column then holds the label of the cluster with the smallest
    distance; ties go to the lowest label.

    Args:
        dataframe: DataFrame holding the points; it is not modified.
        x: name of the column used as the x coordinate.
        y: name of the column used as the y coordinate.
        centroids: centroid history, ``{iteration: {label: [cx, cy]}}``.
        k: number of clusters (labels 1..k).
        i: iteration whose centroids are used.

    Returns:
        A copy of ``dataframe`` with the distance, deviation and ``closest``
        columns added (or overwritten if already present).
    """
    assignment = dataframe.copy()
    cluster_of_column = {}
    for ik in range(1, k + 1):
        centroid_x = centroids[i][ik][0]
        # The y coordinate must be compared against the centroid's y value
        # (index 1); using index 0 here measures against (cx, cx) instead.
        centroid_y = centroids[i][ik][1]
        distance_column = 'distance_from_{}'.format(ik)
        assignment[distance_column] = euclidean_distance(
            dataframe[x], centroid_x, dataframe[y], centroid_y)
        assignment['deviation_from_{}'.format(ik)] = deviation(
            dataframe[x], centroid_x, dataframe[y], centroid_y)
        cluster_of_column[distance_column] = ik

    # idxmin yields the winning column name per row; translate it back to the
    # cluster label with an explicit lookup rather than stripping characters
    # from the name (str.lstrip removes a character set, not a prefix).
    # NOTE(review): for a row without any valid distance the outcome depends
    # on how the installed pandas handles all-NaN rows in idxmin - confirm.
    nearest_column = assignment[list(cluster_of_column)].idxmin(axis=1)
    assignment['closest'] = nearest_column.map(cluster_of_column)
    return assignment

In [None]:
def kmeans_clustering_update_centroids(dataset, x, y, k, i, centroids):
    """Record the centroids for iteration ``i`` in the centroid history.

    The new step starts as a deep copy of step ``i - 1``; every cluster
    label in 1..k is then replaced by ``[mean of x, mean of y]`` over the rows
    whose ``closest`` value equals that label.

    Args:
        dataset: DataFrame with a ``closest`` column plus the ``x``/``y`` columns.
        x: name of the x coordinate column.
        y: name of the y coordinate column.
        k: number of clusters (labels 1..k).
        i: iteration being written; step ``i - 1`` must already exist.
        centroids: centroid history, updated in place.

    Returns:
        The same ``centroids`` dict, now containing key ``i``.
    """
    current_step = centroids[i] = copy.deepcopy(centroids[i - 1])
    for label in range(1, k + 1):
        members = dataset[dataset['closest'] == label]
        mean_x = np.mean(members[x])
        mean_y = np.mean(members[y])
        current_step[label] = [mean_x, mean_y]
    return centroids

In [None]:
def within_cluster_sum_of_square_errors(dataset, k, i):
    """Return the within-cluster sum of squared errors (WSS) of an assignment.

    For every cluster label in 1..k the squared Euclidean distances between
    the cluster's members and their own centroid (column
    ``distance_from_<label>``) are summed; the per-cluster sums are then added
    up. Squared distances are used instead of the signed ``deviation_from_*``
    values because signed x/y offsets of opposite sign cancel and would
    under-report the error. The sum is not divided by the member count.

    Args:
        dataset: DataFrame with a ``closest`` column and one
            ``distance_from_<label>`` column per cluster.
        k: number of clusters (labels 1..k).
        i: iteration number; unused, kept so existing callers keep working.

    Returns:
        The WSS as a float; clusters without members contribute 0.0.
    """
    wss = 0.0
    for ik in range(1, k + 1):
        in_cluster = dataset['closest'] == ik
        distances = dataset.loc[in_cluster, 'distance_from_{}'.format(ik)]
        # Missing distances are counted as zero error, as before.
        wss += float((distances.fillna(0.0) ** 2).sum())
    return wss

In [None]:
def no_centroid_change(centroids, k, i):
    """Return True when no centroid moved between iteration ``i - 1`` and ``i``.

    Both coordinates of every cluster label in 1..k are compared. A coordinate
    that is NaN in both iterations counts as unchanged; NaN never equals
    itself, so a plain ``==`` would report such a centroid as moving forever.

    Args:
        centroids: centroid history, ``{iteration: {label: [cx, cy]}}``.
        k: number of clusters (labels 1..k).
        i: current iteration; step ``i - 1`` must exist.

    Returns:
        True if every centroid is identical in both iterations, else False.
    """
    for ik in range(1, k + 1):
        for axis in (0, 1):
            current = centroids[i][ik][axis]
            previous = centroids[i - 1][ik][axis]
            # value != value is only true for NaN.
            both_nan = current != current and previous != previous
            if current != previous and not both_nan:
                return False
    return True

In [None]:
def kmeans_clustering(dataframe, x, y, k=1, max_iterations=300):
    """Run one k-means clustering of the ``x``/``y`` columns of ``dataframe``.

    Starting from random centroids, the centroids are updated and the rows
    re-assigned repeatedly. The loop ends as soon as
    ``no_centroid_change(centroids, k, i)`` is truthy, or after
    ``max_iterations`` rounds, in which case a ``RuntimeWarning`` is issued.

    Args:
        dataframe: DataFrame holding the points; it is not modified, because
            ``kmeans_clustering_assignment`` works on a copy.
        x: name of the x coordinate column.
        y: name of the y coordinate column.
        k: number of clusters.
        max_iterations: upper bound on update/assign rounds, so that a run
            which never settles cannot loop forever.

    Returns:
        ``(assignment, wss)``: the frame returned by the last assignment step
        and its within-cluster sum of square errors.
    """
    centroids = kmeans_clustering_initial(k)
    kmeans = kmeans_clustering_assignment(dataframe, x, y, centroids, k, 0)
    i = 0
    for i in range(1, max_iterations + 1):
        centroids = kmeans_clustering_update_centroids(kmeans, x, y, k, i, centroids)
        kmeans = kmeans_clustering_assignment(kmeans, x, y, centroids, k, i)
        if no_centroid_change(centroids, k, i):
            break
    else:
        # Only reached when the loop ran out of iterations without a break.
        warnings.warn(
            'k-means stopped after {} iterations without meeting the stop '
            'condition'.format(max_iterations),
            RuntimeWarning,
        )
    return kmeans, within_cluster_sum_of_square_errors(kmeans, k, i)

In [None]:
def do_kmeans(dataframe, x, y, max_k=10, n_init=25, seed=12345):
    """Run k-means for k = 1..max_k with several restarts and keep the best run.

    ``n_init`` restart seeds are derived from ``seed``; every k is clustered
    once per restart seed and the run with the smallest WSS over all k and all
    restarts wins (on equal WSS the later run is kept). With the default
    arguments the seeds and the k range are the same as the previously
    hard-coded ones.

    NOTE(review): the winner is picked by raw WSS across different k, which
    presumably favours larger k - confirm this is the intended criterion.

    Side effect: NumPy's global random state is re-seeded.

    Args:
        dataframe: DataFrame holding the points.
        x: name of the x coordinate column.
        y: name of the y coordinate column.
        max_k: largest number of clusters to try.
        n_init: number of random restarts per k.
        seed: seed used to derive the restart seeds.

    Returns:
        ``(best_assignment, total_plot_time)``: the assignment frame of the
        best run (``None`` if nothing was run) and the time in seconds spent
        inside ``kmeans_clustering``.
    """
    np.random.seed(seed)
    restart_seeds = [int(np.random.random() * 1000) for _ in range(n_init)]

    best_plot = None
    best_wss = None
    total_plot_time = 0
    for k in range(1, max_k + 1):
        for restart_seed in restart_seeds:
            # perf_counter is monotonic, unlike the wall clock of time.time.
            start = time.perf_counter()
            np.random.seed(restart_seed)
            plot, wss = kmeans_clustering(dataframe, x, y, k)
            total_plot_time += time.perf_counter() - start
            # '<=' keeps the later run on ties; only the current best frame is
            # retained instead of every candidate.
            if best_wss is None or wss <= best_wss:
                best_plot, best_wss = plot, wss
    return best_plot, total_plot_time

# Graphs

## Calculate Column Combinations

In [None]:
# Every unordered pair of columns, leaving out the 'Date' column.
column_combinations = list(
    itertools.combinations(original_watertreatment.columns.drop('Date'), 2)
)

## Use Correlation Coefficients to Rule Out Unneeded Combinations

In [None]:
# Select the (dataset, column pair) combinations worth clustering: only pairs
# whose Kendall rank correlation is strongly positive are kept.
plots_to_calculate = []
column_combos_count = {}
for dataset_key, dataset in dataset_combinations.items():
    for combo in column_combinations:
        # kendalltau returns (tau, p-value). Only tau is used: it lies between
        # -1 and 1, and the p-value (probability of seeing such data if the
        # columns were uncorrelated) is ignored.
        tau = scipy.stats.kendalltau(dataset[combo[0]], dataset[combo[1]])[0]
        # If tau is NaN the comparison is False, so the pair is skipped too.
        if not tau > 0.9:
            continue
        plots_to_calculate.append((dataset_key, dataset, combo[0], combo[1]))
        # Count in how many datasets each column pair passed the threshold.
        combo_key = '{}-{}'.format(combo[0], combo[1])
        column_combos_count[combo_key] = column_combos_count.get(combo_key, 0) + 1
print(len(plots_to_calculate))
for key, value in column_combos_count.items():
    print('{}-{}'.format(key, value))

## Calculate Plots

In [None]:
# Cluster every selected combination and report progress with a running
# estimate of the remaining time.
plots = {}
total_time = 0
num = len(plots_to_calculate)
num_left = num
time_left_estimate = total_time
for dataset_key, dataset, x, y in plots_to_calculate:
    best_plot, plot_time = do_kmeans(dataset, x, y)
    plots['{}_{}_{}'.format(dataset_key, x, y)] = best_plot
    total_time += plot_time
    num_left -= 1
    completed = num - num_left
    # The estimate assumes the remaining plots take the average time so far.
    average_completion_time = total_time / completed
    time_left_estimate = num_left * average_completion_time
    print('Completed {}/{} plots. Average plot completion time is {}. Estimated time remaining is {}'.format(completed, num, average_completion_time, time_left_estimate))

## Display Graphs

In [None]:
##hs_hmax_dir_tp_months_sst_months_plot, wss1=do_kmeans(normalised_wavedata_dir_tp_months_sst_months, 'Tz', 'SST')
##sns.lmplot('Tz', 'SST', data=hs_hmax_dir_tp_months_sst_months_plot, fit_reg=False, hue='closest')

In [None]:
##hs_tz_dir_tp_months_sst_months_plot, wss2=do_kmeans(normalised_wavedata_dir_tp_months_sst_months, 'Hs', 'Tz')
##sns.lmplot('Hs', 'Tz', data=hs_tz_dir_tp_months_sst_months_plot, fit_reg=False, hue='closest')

In [None]:
##hs_tp_dir_tp_months_sst_months_plot, wss3=do_kmeans(normalised_wavedata_dir_tp_months_sst_months, 'Hs', 'Tp')
##sns.lmplot('Hs', 'Tp', data=hs_tp_dir_tp_months_sst_months_plot, fit_reg=False, hue='closest')

In [None]:
##hs_dir_tp_dir_tp_months_sst_months_plot, wss4=do_kmeans(normalised_wavedata_dir_tp_months_sst_months, 'Hs', 'Dir_Tp TRUE')
##sns.lmplot('Hs', 'Dir_Tp TRUE', data=hs_dir_tp_dir_tp_months_sst_months_plot, fit_reg=False, hue='closest')

In [None]:
##hs_sst_dir_tp_months_sst_months_plot, wss5=do_kmeans(normalised_wavedata_dir_tp_months_sst_months, 'Hs', 'SST')
##sns.lmplot('Hs', 'SST', data=hs_sst_dir_tp_months_sst_months_plot, fit_reg=False, hue='closest')