In [None]:
#Importing useful libraries

from __future__ import print_function
import time
import numpy as np
import pandas as pd
#from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Reading the dataset
#
# To analyse another site, switch `file` and `metadata_columns` together.

#file='NERSCHourlyBucketGatherStDevClean.csv'
#file='ALCFHourlyBucketGatherStDevClean-complete.csv'
file='OLCF-good-new-w.csv'
dataframeloaded = pd.read_csv(file)

# Columns removed before dimensionality reduction. 'date_wday' is deliberately
# absent from both lists: it is kept until the labels have been extracted.
# The OLCF list contains ' info_search_time' (with a leading space) as well as
# 'info_search_time'; both names are kept exactly as in the original list.
OLCF_METADATA_COLUMNS = [
    ' info_search_time', 'date_month', 'date_mday', 'date_year',
    'date_minute', 'date_second', 'date_zone', 'search_name', 'search_now',
    'info_min_time', 'info_max_time', 'info_search_time', '_time',
    'host', 'index', 'linecount', 'source', 'sourcetype', 'splunk_server',
]
ALCF_NERSC_METADATA_COLUMNS = [
    '_raw', '_time', 'date_mday', 'date_minute', 'date_month', 'date_second',
    'date_year', 'eventtype', 'host', 'index', 'info_max_time',
    'info_min_time', 'info_search_time', 'linecount', 'search_name',
    'search_now', 'source', 'sourcetype', 'splunk_server',
    'splunk_server_group', 'tag', 'tag::eventtype', 'timeendpos',
    'timestartpos',
]
metadata_columns = OLCF_METADATA_COLUMNS
#metadata_columns = ALCF_NERSC_METADATA_COLUMNS

# Weekday name -> numeric label, Monday=1 ... Sunday=7. The OLCF data uses
# capitalised names and the ALCF/NERSC data lowercase ones, so both spellings
# are accepted. The replacement values stay strings (converted afterwards by
# pd.to_numeric), exactly as in the original mapping.
WEEKDAY_TO_NUMBER = {}
for _number, _name in enumerate(
        ["Monday", "Tuesday", "Wednesday", "Thursday",
         "Friday", "Saturday", "Sunday"], start=1):
    WEEKDAY_TO_NUMBER[_name] = str(_number)
    WEEKDAY_TO_NUMBER[_name.lower()] = str(_number)


def _encode_weekday(weekday_names):
    """Return the weekday names in `weekday_names` as numbers 1 (Monday) to 7 (Sunday).

    Names missing from WEEKDAY_TO_NUMBER are left unchanged by `replace`, so
    `pd.to_numeric` raises on them instead of silently producing a wrong label.
    """
    return pd.to_numeric(weekday_names.replace(WEEKDAY_TO_NUMBER))


# Filtering out Training vs. Test dataset

train_months = ['January','February']
training_data = dataframeloaded[dataframeloaded.date_month.isin(train_months)]

test_months = ['March', 'April','May']
test_data = dataframeloaded[dataframeloaded.date_month.isin(test_months)]

# Both splits go through the same steps: drop the metadata columns, drop rows
# with missing values, then separate the weekday label from the features.
# Previously only the training split had its NaN rows removed, so the test
# split could still carry NaN into PCA/t-SNE, which scikit-learn rejects.
dataframe_train = training_data.drop(columns=metadata_columns).dropna()
y_train = _encode_weekday(dataframe_train['date_wday'])
dataframe_clean_train = dataframe_train.drop(columns=['date_wday'])

dataframe_test = test_data.drop(columns=metadata_columns).dropna()
y_test = _encode_weekday(dataframe_test['date_wday'])
dataframe_clean_test = dataframe_test.drop(columns=['date_wday'])

print(dataframe_clean_train.shape, y_train.shape, dataframe_clean_test.shape,  y_test.shape)

### Comparison of PCA vs t-SNE 2D ###

def pca_tsne_2d(dataframe_clean, y):
    """Plot 2-D PCA and t-SNE embeddings of the same data side by side.

    PCA and t-SNE are each fitted on `dataframe_clean` with two output
    components. The PCA explained-variance ratios and the elapsed t-SNE time
    are printed, then two scatter plots coloured by `y` are drawn: PCA on the
    left, t-SNE on the right.

    Args:
        dataframe_clean: feature table passed unchanged to `fit_transform` of
            both reducers (assumed numeric and free of NaN; not checked here).
        y: one label per row of `dataframe_clean`, used only as the colour
            (hue) of the points.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # PCA implementation, 2D - Dimensionality reduced to 2

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(dataframe_clean)
    print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

    # t-SNE implementation, 2D - Dimensionality reduced to 2

    time_start = time.time()

    tsne_options = dict(n_components=2, verbose=1, perplexity=40)
    try:
        # scikit-learn >= 1.5 names the iteration budget `max_iter`.
        tsne = TSNE(max_iter=300, **tsne_options)
    except TypeError:
        # Older scikit-learn releases only know the previous name, `n_iter`.
        tsne = TSNE(n_iter=300, **tsne_options)
    tsne_results = tsne.fit_transform(dataframe_clean)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

    # Labels are stored positionally (np.asarray drops any pandas index) so
    # they line up row by row with the arrays returned by the reducers.
    results = pd.DataFrame({
        "date_wday": np.asarray(y),
        "pca-2d-one": pca_result[:, 0],
        "pca-2d-two": pca_result[:, 1],
        "tsne-2d-one": tsne_results[:, 0],
        "tsne-2d-two": tsne_results[:, 1],
    })

    # One colour per label actually present. A fixed 7-colour list makes
    # seaborn raise (older releases) or warn (newer ones) when the data
    # contains fewer than 7 distinct labels.
    palette = sns.color_palette("bright", results["date_wday"].nunique())

    # plotting PCA vs t-SNE

    plt.figure(figsize=(16,7))
    panels = (
        (1, "pca-2d-one", "pca-2d-two"),
        (2, "tsne-2d-one", "tsne-2d-two"),
    )
    for position, x_column, y_column in panels:
        axis = plt.subplot(1, 2, position)
        sns.scatterplot(
            x=x_column, y=y_column,
            hue="date_wday",
            palette=palette,
            data=results,
            legend="full",
            alpha=0.3,
            ax=axis
        )
    # Show the figure explicitly, as pca_tsne_3d does, so the function also
    # works outside an inline-plotting notebook.
    plt.show()
### Comparison of PCA vs t-SNE 3D ###

def pca_tsne_3d(dataframe_clean, y):
    """Plot 3-D PCA and t-SNE embeddings of the same data side by side.

    PCA and t-SNE are each fitted on `dataframe_clean` with three output
    components. The PCA explained-variance ratios are printed, then two 3-D
    scatter plots coloured by `y` are drawn: PCA on the left, t-SNE on the
    right.

    Args:
        dataframe_clean: feature table passed unchanged to `fit_transform` of
            both reducers (assumed numeric and free of NaN; not checked here).
        y: one numeric label per row of `dataframe_clean`; used as the colour
            value `c` of the points and to build the legends.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # PCA implementation, 3D - Dimensionality reduced to 3

    pca3 = PCA(n_components=3)
    pca3_result = pca3.fit_transform(dataframe_clean)
    print('Explained variation per principal component: {}'.format(pca3.explained_variance_ratio_))

    # t-SNE implementation, 3D - Dimensionality reduced to 3

    tsne_options = dict(n_components=3, verbose=1, perplexity=40)
    try:
        # scikit-learn >= 1.5 names the iteration budget `max_iter`.
        tsne3 = TSNE(max_iter=300, **tsne_options)
    except TypeError:
        # Older scikit-learn releases only know the previous name, `n_iter`.
        tsne3 = TSNE(n_iter=300, **tsne_options)
    tsne3_results = tsne3.fit_transform(dataframe_clean)

    # Labels are stored positionally (np.asarray drops any pandas index) so
    # they line up row by row with the arrays returned by the reducers.
    results3 = pd.DataFrame({
        "date_wday": np.asarray(y),
        "pca-3d-one": pca3_result[:, 0],
        "pca-3d-two": pca3_result[:, 1],
        "pca-3d-three": pca3_result[:, 2],
        "tsne-3d-one": tsne3_results[:, 0],
        "tsne-3d-two": tsne3_results[:, 1],
        "tsne-3d-three": tsne3_results[:, 2],
    })

    # plotting PCA 3D
    fig = plt.figure(figsize=(18,7))

    ax = fig.add_subplot(1,2,1, projection='3d')
    scatter = ax.scatter(
        xs=results3['pca-3d-one'],
        ys=results3['pca-3d-two'],
        zs=results3['pca-3d-three'],
        c=results3['date_wday'],
        cmap='tab10'
    )

    ax.set_xlabel('pca-3d-one')
    ax.set_ylabel('pca-3d-two')
    ax.set_zlabel('pca-3d-three')
    legend1 = ax.legend(*scatter.legend_elements(),
                    loc="lower left", title="date_wday")
    ax.add_artist(legend1)


    # plotting t-SNE 3D

    ax = fig.add_subplot(1,2,2, projection='3d')
    scatter = ax.scatter(
        xs=results3['tsne-3d-one'],
        ys=results3['tsne-3d-two'],
        zs=results3['tsne-3d-three'],
        c=results3['date_wday'],
        cmap='tab10'
    )
    legend2 = ax.legend(*scatter.legend_elements(),
                    loc="lower left", title="date_wday")
    ax.add_artist(legend2)

    ax.set_xlabel('tsne-3d-one')
    ax.set_ylabel('tsne-3d-two')
    ax.set_zlabel('tsne-3d-three')

    # The t-SNE panel uses a fixed view of [-10, 10] on every axis with a tick
    # every 3 units and 14-pt tick labels.
    # NOTE(review): the limits are hard-coded; confirm they still cover the
    # embedding if the t-SNE settings change.
    for set_limits in (ax.set_xlim, ax.set_ylim, ax.set_zlim):
        set_limits([-10,10])
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_ticks(np.arange(-10, 10, 3))
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_tick_params(labelsize=14)
    plt.show()
    
# Reducing the dimensionality using PCA and t-SNE: first to 2 components,
# then to 3, each time on the training set followed by the test set.
for compare in (pca_tsne_2d, pca_tsne_3d):
    for features, labels in ((dataframe_clean_train, y_train),
                             (dataframe_clean_test, y_test)):
        compare(features, labels)
