Name: Swapnanil Halder

Roll No.: 18MA20046

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
# Load the tweet table; the first line of the CSV supplies the column names.
data = pd.read_csv(
    r"./Twitter_data.csv",
    low_memory=False,
    header=0,
)
# Preview the last three rows.
data.tail(3)

In [None]:
# Discard every row that has at least one missing value, then renumber the
# surviving rows 0..n-1 so later positional logic can rely on the index.
data = data.dropna(axis=0, how='any').reset_index(drop=True)
data.tail(3)

In [None]:
# Remove the columns that are not used further on. The column names carry a
# leading space exactly as they appear in the CSV header.
data.drop(
    columns=[' Lang', ' IsReshare', ' Likes', ' text', ' UserID'],
    inplace=True,
)
data.tail(3)

In [None]:
# Turn each TweetID into an integer by dropping its first three characters.
# Mapping the column element-wise keeps every value on its own row; building a
# fresh default-indexed Series and assigning it would be aligned on index
# labels and so only works when the frame's index is exactly 0..n-1.
data['TweetID'] = data['TweetID'].map(lambda tweet_id: int(tweet_id[3:]))
data.tail(3)

In [None]:
# Show, for every value of 'Day', how many non-null entries each remaining column has.
data.groupby('Day').count()

In [None]:
# Separate the 'Day' column (used as the label) from all other columns
# (used as features), keeping both as DataFrames.
data_X = data.loc[:, data.columns != 'Day']
data_Y = data[['Day']]

# NumPy views used by the clustering code: X is the feature matrix and Y is
# the 'Day' column flattened to one dimension.
X = data_X.to_numpy()
Y = data_Y.to_numpy().ravel()

In [None]:
from sklearn.preprocessing import MinMaxScaler as scale
# Rescale the feature columns with MinMaxScaler.
# X is rebound to the array returned by fit_transform instead of being
# overwritten through X[:, :]: slice assignment casts the scaled values back
# to X's existing dtype (truncating them if X is an integer array, or leaving
# an object array) and can also write through to the DataFrame X came from.
X = scale().fit_transform(X)

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

def plot_dendrogram(Type, threshold):
    """Draw the hierarchical-clustering dendrogram of the global ``X``.

    Parameters
    ----------
    Type : str
        Linkage method handed to ``scipy.cluster.hierarchy.linkage``
        (the notebook uses ``'single'`` and ``'complete'``).
    threshold : float
        Distance used both as the dendrogram's ``color_threshold`` and as the
        position of the dashed grey marker line.
    """
    plt.figure(figsize=(15, 8))
    merge_tree = linkage(X, Type)
    dendrogram(
        merge_tree,
        orientation='right',
        color_threshold=threshold,
        above_threshold_color="#eeeeee",
        leaf_rotation=45,
        leaf_font_size=2,
    )
    # The tree is drawn sideways, so the cut is marked with a vertical line.
    plt.axvline(x=threshold, linestyle='dashed', c='grey')
    plt.show()


In [None]:
# Single-linkage dendrogram with the cut marked at distance 0.32, the same
# value get_the_clusters_by_threshold uses for Type == 'single'.
plot_dendrogram(Type='single', threshold=0.32)

In [None]:
# Complete-linkage dendrogram with the cut marked at distance 1.32, the same
# value get_the_clusters_by_threshold uses for Type == 'complete'.
plot_dendrogram(Type='complete', threshold=1.32)

In [None]:
from tqdm.notebook import tqdm
from collections import Counter

In [None]:
def euclidean_distance(i, j):
    """Return the Euclidean distance between rows ``i`` and ``j`` of the global ``X``.

    Only pairs with ``i > j`` are actually computed. For the diagonal and the
    upper triangle (``i <= j``) the function returns ``0.0`` immediately, so a
    matrix filled with it is strictly lower-triangular.
    """
    if i <= j:
        # Upper triangle and diagonal are deliberately left at zero.
        return 0.0

    squared_sum = 0.0
    for a, b in zip(X[i], X[j]):
        squared_sum += np.power(a - b, 2)
    return np.sqrt(squared_sum)

def most_similar(df):
    """Find the smallest entry strictly below the diagonal of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of numeric values. Only positions ``(i, j)`` with ``i > j`` are
        examined; the diagonal and everything above it are ignored.

    Returns
    -------
    tuple
        ``(value, i, j)`` where ``i`` and ``j`` are the positional row and
        column of the smallest examined entry. When several entries tie, the
        first one in row-by-row order wins. If no examined entry is smaller
        than ``1e9`` (including when there is nothing below the diagonal, or
        only NaN), the sentinel ``(1e9, -1, -1)`` is returned.
    """
    values = df.to_numpy(dtype=float)
    n_rows, n_cols = values.shape

    # One vectorised pass instead of a Python-level df.iloc lookup per cell.
    below_diagonal = np.tri(n_rows, n_cols, k=-1, dtype=bool)
    # NaN compares False here, so it can never be selected.
    eligible = below_diagonal & (values < 1e9)
    if not eligible.any():
        return 1e9, -1, -1

    candidates = np.where(eligible, values, np.inf)
    # argmin scans the flattened array row by row and keeps the first minimum.
    i, j = divmod(int(np.argmin(candidates)), n_cols)
    return values[i, j], i, j


In [None]:
def get_the_clusters_by_threshold(X, Type, threshold=None):
    """Agglomerate the rows of ``X`` until the closest clusters are too far apart.

    Every row starts as its own cluster. The two closest clusters (as found by
    ``most_similar``) are merged repeatedly; merging stops as soon as the
    smallest remaining inter-cluster distance exceeds ``threshold``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix. Distances are computed from this argument.
    Type : {'single', 'complete'}
        Linkage rule. ``'single'`` gives a merged cluster the minimum of its
        two parents' distances to every other cluster, ``'complete'`` the
        maximum.
    threshold : float, optional
        Largest distance at which two clusters are still merged. Defaults to
        0.32 for ``'single'`` and 1.32 for ``'complete'``.

    Returns
    -------
    list of list of int
        One list per final cluster holding the zero-based row indices of ``X``
        that belong to it.

    Raises
    ------
    ValueError
        If ``Type`` is neither ``'single'`` nor ``'complete'``.
    """
    default_thresholds = {'single': 0.32, 'complete': 1.32}
    if Type not in default_thresholds:
        raise ValueError('use proper type')
    if threshold is None:
        threshold = default_thresholds[Type]
    combine = np.minimum if Type == 'single' else np.maximum

    points = np.asarray(X, dtype=float)
    len_X = len(points)
    # Clusters are tracked as comma-separated one-based row numbers.
    labels = [str(idx + 1) for idx in range(len_X)]

    # Strictly lower-triangular distance matrix: entry (r, c) with r > c holds
    # the distance between clusters r and c; everything else stays zero.
    lower = np.zeros((len_X, len_X))
    for row in range(1, len_X):
        lower[row, :row] = np.sqrt(((points[:row] - points[row]) ** 2).sum(axis=1))

    # At most len_X - 1 merges are possible.
    for _ in tqdm(range(len_X - 1)):
        d, i, j = most_similar(pd.DataFrame(lower))
        if d > threshold:
            break

        # Because the upper triangle is zero, row + column of a cluster gives
        # its distance to every other cluster regardless of which index is larger.
        to_i = lower[i, :] + lower[:, i]
        to_j = lower[j, :] + lower[:, j]
        keep = [k for k in range(len(labels)) if k not in (i, j)]

        # Surviving clusters keep their relative order; the merged cluster is
        # appended as the last row (its column stays zero).
        size = len(keep) + 1
        shrunk = np.zeros((size, size))
        shrunk[:-1, :-1] = lower[np.ix_(keep, keep)]
        shrunk[-1, :-1] = combine(to_i, to_j)[keep]

        labels = [labels[k] for k in keep] + [labels[j] + ',' + labels[i]]
        lower = shrunk

    return [[int(idx) - 1 for idx in label.split(',')] for label in labels]


In [None]:
# Single-linkage clusters as lists of row indices into X / Y.
clusters = get_the_clusters_by_threshold(X, 'single')
# For each cluster, count how often every 'Day' label occurs among its members.
clusters_type = [Counter(Y[row] for row in members) for members in clusters]

In [None]:
# Score the clustering against the 'Day' labels.
# For every day, Dict keeps the largest majority count seen in any cluster and
# cluster_day remembers which cluster achieved it; every other cluster's
# majority count is accumulated under 'None'.
# NOTE(review): only each cluster's majority count enters Dict, so
# sum(Dict.values()) can be smaller than len(X); a majority day outside the
# keys listed below raises KeyError.
Dict = dict.fromkeys(['None', 'Friday', 'Sunday', 'Saturday', 'Thursday'], 0)
cluster_no = ['None'] * len(X)
cluster_day = {}
for number, (members, day_counts) in enumerate(zip(clusters, clusters_type), start=1):
    cluster_name = f'cluster_{number}'
    # Remember which cluster every row ended up in.
    for row in members:
        cluster_no[row] = cluster_name

    top_day, top_count = day_counts.most_common(1)[0]
    if top_count > Dict[top_day]:
        # This cluster beats the previous best for the day; demote the old best.
        Dict['None'] += Dict[top_day]
        Dict[top_day] = top_count
        cluster_day[top_day] = cluster_name
    else:
        Dict['None'] += top_count

# Flip the mapping so it goes from cluster name to day.
cluster_day = {name: day for day, name in cluster_day.items()}
print(Dict)
print('Accuracy =', np.round(100 - 100*Dict['None']/sum(Dict.values()), 3), '%')

In [None]:
# Project X onto its first two principal components for a 2-D scatter plot.
data_visualize = pd.DataFrame(PCA(n_components=2).fit_transform(X), columns=['X', 'Y'])
# Colour each point by the day its cluster was matched to; points whose
# cluster was not matched to any day are shown as 'None'.
data_visualize['hue'] = [cluster_day.get(name, 'None') for name in cluster_no]
sns.relplot(x='X', y='Y', hue='hue', data=data_visualize, palette="muted")
plt.show()

In [None]:
# Complete-linkage clusters as lists of row indices into X / Y.
clusters = get_the_clusters_by_threshold(X, 'complete')
# For each cluster, count how often every 'Day' label occurs among its members.
clusters_type = [Counter(Y[row] for row in members) for members in clusters]

In [None]:
# Score the complete-linkage clustering against the 'Day' labels.
# For every day, Dict keeps the largest majority count seen in any cluster and
# cluster_day remembers which cluster achieved it; every other cluster's
# majority count is accumulated under 'None'.
# NOTE(review): only each cluster's majority count enters Dict, so
# sum(Dict.values()) can be smaller than len(X); a majority day outside the
# keys listed below raises KeyError.
Dict = dict.fromkeys(['None', 'Friday', 'Sunday', 'Saturday', 'Thursday'], 0)
cluster_no = ['None'] * len(X)
cluster_day = {}
for number, (members, day_counts) in enumerate(zip(clusters, clusters_type), start=1):
    cluster_name = f'cluster_{number}'
    # Remember which cluster every row ended up in.
    for row in members:
        cluster_no[row] = cluster_name

    top_day, top_count = day_counts.most_common(1)[0]
    if top_count > Dict[top_day]:
        # This cluster beats the previous best for the day; demote the old best.
        Dict['None'] += Dict[top_day]
        Dict[top_day] = top_count
        cluster_day[top_day] = cluster_name
    else:
        Dict['None'] += top_count

# Flip the mapping so it goes from cluster name to day.
cluster_day = {name: day for day, name in cluster_day.items()}
print(Dict)
print('Accuracy =', np.round(100 - 100*Dict['None']/sum(Dict.values()), 3), '%')

In [None]:
# Project X onto its first two principal components for a 2-D scatter plot.
data_visualize = pd.DataFrame(PCA(n_components=2).fit_transform(X), columns=['X', 'Y'])
# Colour each point by the day its cluster was matched to; points whose
# cluster was not matched to any day are shown as 'None'.
data_visualize['hue'] = [cluster_day.get(name, 'None') for name in cluster_no]
sns.relplot(x='X', y='Y', hue='hue', data=data_visualize, palette="muted")
plt.show()