# Visualize COCO features

1. visualize coco features
2. identify pca-one; what is its cosine similarity with the residual (should be very high)
3. move along the direction, plot 1-dim loss landscape. [-2,-1,-0.5,0,0.5,1,2]
    - need a function fn(scalar) that returns the loss.


In [None]:
import argparse
import os
import random
import shutil
import time
import warnings
from enum import Enum
import pickle
import numpy as np
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import Dataset, DataLoader
import torch.backends.cudnn as cudnn

import glob 
def my_norm(x):
    """Return ``x`` with every vector along the last axis scaled to unit L2 norm."""
    # keepdims=True keeps the reduced axis so the division broadcasts per vector.
    lengths = np.linalg.norm(x, axis=-1, keepdims=True)
    return x / lengths

In [None]:
# Load every pickled feature dump found under ./features*/ into a list,
# one dict per experiment file.
data_dict_list = []

# glob.glob() gives no ordering guarantee (it depends on the file system), so
# sort the paths: downstream code labels experiments by their position in this
# list, and that labelling should be reproducible between runs.
# Note the sort is lexicographic, e.g. 'feature_dump_10' comes before 'feature_dump_2'.
for pickle_path in sorted(glob.glob('./features*/feature_dump_*.pkl')):
    # NOTE(review): pickle.load can execute arbitrary code; only load dumps
    # produced by trusted runs.
    with open(pickle_path, 'rb') as pkl_file:
        data_dict = pickle.load(pkl_file)
    # Image and text features must be paired one-to-one.
    assert len(data_dict['clip_image_features_list']) == len(data_dict['clip_text_features_list'])
    # 'target_image_features_list' is intentionally not length-checked here
    # (that check was disabled in the original cell).
    data_dict_list.append(data_dict)

print('Number of experiment files loaded', len(data_dict_list))

In [None]:
# visualize.

from sklearn.decomposition import PCA
# from sklearn.decomposition import TruncatedSVD as PCA # shows up as multiple lines.
# from sklearn.manifold import TSNE as PCA # 
# import umap
# from umap import UMAP as PCA
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# sns.set(font_scale=2)  # crazy big
# Use 300 DPI both for figures shown on screen and for figures written to disk.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
# Switch on seaborn's default theme for the plots that follow.
sns.set_theme()


In [None]:
# Functionality: given a list of experiments, scatter-plot one modality in PCA space.
sns.set_context("talk", font_scale=1.5)  # contexts: paper, notebook, talk, poster

def plot_scattered_cones(data_dict_list, modality_str, draw=True, dataset_size=5000):
    """Project one modality's features from several experiments onto shared PCA axes.

    The first ``dataset_size`` feature vectors of every experiment are pooled,
    L2-normalized with ``my_norm`` and fed to a single 6-component PCA. The
    first two components are returned (and optionally scatter-plotted), with
    every point labelled by the experiment it came from.

    Args:
        data_dict_list: Sequence of experiment dicts; each must contain
            ``modality_str`` as a key mapping to a sliceable collection of
            feature vectors.
        modality_str: One of ``'clip_image_features_list'``,
            ``'clip_text_features_list'`` or ``'target_image_features_list'``.
        draw: When true, show a scatter plot of the first two components
            colored by experiment.
        dataset_size: Maximum number of feature vectors taken from each
            experiment. ``None`` takes all of them. Defaults to 5000.

    Returns:
        A ``pandas.DataFrame`` with columns ``'pca_one'``, ``'pca_two'`` and
        ``'Random Seed'`` (values ``'Random-1'``, ``'Random-2'``, ... following
        the order of ``data_dict_list``).

    Raises:
        AssertionError: If ``modality_str`` is not one of the accepted keys.
        ValueError: If ``data_dict_list`` is empty.
    """
    assert modality_str in ['clip_image_features_list', 'clip_text_features_list', 'target_image_features_list']
    print('modality_str: ', modality_str)
    if not data_dict_list:
        raise ValueError('data_dict_list is empty; there are no features to project')

    total_feature_list = []
    label_list = []
    for experiment_idx, data_dict in enumerate(data_dict_list, start=1):
        features = data_dict[modality_str][:dataset_size]
        total_feature_list.append(features)
        # Label by the number of rows actually taken, so an experiment holding
        # fewer than dataset_size vectors keeps labels and features aligned.
        label_list.extend(['Random-{}'.format(experiment_idx)] * len(features))
    total_feature_np = np.concatenate(total_feature_list, axis=0)
    total_feature_np = my_norm(total_feature_np)  # L2-normalize
    assert len(total_feature_np) == len(label_list), 'features and labels are misaligned'

    # Six components are fitted so the printed statistics cover more than the
    # two axes that end up in the returned frame.
    pca = PCA(n_components=6)
    pca_result = pca.fit_transform(total_feature_np)
    print('pca.explained_variance_ratio_', pca.explained_variance_ratio_)
    print('pca.singular_values_', pca.singular_values_)

    df = pd.DataFrame({
        'pca_one': pca_result[:, 0],
        'pca_two': pca_result[:, 1],
        'Random Seed': label_list,
    })

    if draw:
        plt.figure(figsize=(20.0, 6.18 * 2))
        sns.scatterplot(
            x="pca_one", y="pca_two",
            hue="Random Seed",
            data=df,
            legend=True,
        )
        plt.xlabel("")
        plt.ylabel("")
        # Anchor the legend just outside the right edge of the axes.
        plt.legend(title='Random Seed', loc='upper left', bbox_to_anchor=(1.00, 1.0, ), prop={'size': 18})
        plt.show()

    return df


In [None]:
# Run the PCA scatter plot once per feature type, using at most the first 25
# loaded experiments, and keep each returned DataFrame for later cells.
df_clip_img = plot_scattered_cones(data_dict_list[:25], 'clip_image_features_list',   draw=True)
df_clip_txt = plot_scattered_cones(data_dict_list[:25], 'clip_text_features_list',    draw=True)
df_resnet   = plot_scattered_cones(data_dict_list[:25], 'target_image_features_list', draw=True)


In [None]:
def draw_df(df):
    """Scatter-plot ``pca_one`` against ``pca_two`` from ``df``, colored by seed.

    The legend uses a compact label: the ``'Random Seed'`` value with its
    ``'Random-'`` prefix removed. That label is written to ``df`` in place as
    a ``'Seed'`` column (created or overwritten).

    Args:
        df: DataFrame with ``'pca_one'``, ``'pca_two'`` and ``'Random Seed'``
            columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Literal (non-regex) replacement of the label prefix.
    short_labels = df['Random Seed'].str.replace('Random-', '', regex=False)
    df['Seed'] = short_labels

    plt.figure(figsize=(20.0, 6.18 * 2))
    sns.scatterplot(data=df, x="pca_one", y="pca_two", hue="Seed", legend=True)
    plt.xlabel("")
    plt.ylabel("")
    # Two-column legend placed just outside the right edge of the axes.
    plt.legend(
        title='Random Seed',
        loc='upper left',
        bbox_to_anchor=(1.00, 1.0, ),
        ncol=2,
    )
    plt.show()


draw_df(df_clip_img)

# Plot PCA Singular Values, Explained Variance Ratios. 
Kind of answering Mert's question

In [26]:
# Functionality: given a list of experiments, print PCA statistics for one modality.
sns.set_context("talk", font_scale=1.5)  # contexts: paper, notebook, talk, poster

def plot_pca_stats(data_dict_list, modality_str, draw=True, dataset_size=5000):
    """Print PCA statistics for one modality pooled over several experiments.

    The first ``dataset_size`` feature vectors of every experiment are pooled,
    L2-normalized with ``my_norm`` and used to fit a 50-component PCA. The
    first ten explained-variance ratios and all fitted singular values are
    printed. Despite its name, this function draws nothing.

    Args:
        data_dict_list: Sequence of experiment dicts; each must contain
            ``modality_str`` as a key mapping to a sliceable collection of
            feature vectors.
        modality_str: One of ``'clip_image_features_list'``,
            ``'clip_text_features_list'`` or ``'target_image_features_list'``.
        draw: Unused; kept so existing calls that pass it keep working.
        dataset_size: Maximum number of feature vectors taken from each
            experiment. ``None`` takes all of them. Defaults to 5000.

    Returns:
        None.

    Raises:
        AssertionError: If ``modality_str`` is not one of the accepted keys.
        ValueError: If ``data_dict_list`` is empty.
    """
    assert modality_str in ['clip_image_features_list', 'clip_text_features_list', 'target_image_features_list']
    print('modality_str: ', modality_str)
    if not data_dict_list:
        raise ValueError('data_dict_list is empty; there are no features to analyse')

    total_feature_list = [
        data_dict[modality_str][:dataset_size] for data_dict in data_dict_list
    ]
    total_feature_np = np.concatenate(total_feature_list, axis=0)
    total_feature_np = my_norm(total_feature_np)  # L2-normalize

    # Only the fitted statistics are needed, so fit without keeping the projection.
    pca = PCA(n_components=50)
    pca.fit(total_feature_np)

    print('pca.explained_variance_ratio_')
    # One line of comma-separated ratios, first ten components only.
    print(''.join('{:.3f}, '.format(ratio) for ratio in pca.explained_variance_ratio_[:10]))

    print('pca.singular_values_', pca.singular_values_)


# plot_pca_stats only prints and returns None, so its result is deliberately
# not bound to a name: assigning it to df_clip_img / df_clip_txt / df_resnet
# would overwrite DataFrames held under those names with None.
plot_pca_stats(data_dict_list[:25], 'clip_image_features_list',   draw=True)
plot_pca_stats(data_dict_list[:25], 'clip_text_features_list',    draw=True)
plot_pca_stats(data_dict_list[:25], 'target_image_features_list', draw=True)


modality_str:  clip_image_features_list
pca.explained_variance_ratio_
0.043, 0.041, 0.039, 0.038, 0.036, 0.035, 0.035, 0.034, 0.033, 0.032, 
pca.singular_values_ [72.44832  70.31703  68.78217  68.24517  66.22955  65.66144  65.02128
 64.06602  63.149437 62.50923  61.43108  60.71535  60.435135 59.02705
 58.74808  57.4058   56.325825 56.2117   55.202732 54.309063 53.766792
 52.040756 51.68926  49.76612  34.14688  33.398888 32.901985 31.960554
 31.528515 31.300081 30.672626 30.518982 30.29744  29.762638 29.396282
 28.373528 28.064127 27.74946  27.346584 27.130186 26.959745 26.397924
 25.524904 25.109116 24.717733 24.531994 24.060846 23.81253  22.803596
 20.144312]
modality_str:  clip_text_features_list
pca.explained_variance_ratio_
0.043, 0.041, 0.039, 0.037, 0.037, 0.035, 0.034, 0.033, 0.033, 0.031, 
pca.singular_values_ [71.93895  70.64999  68.51955  67.25281  66.71326  65.2795   64.50423
 63.39669  62.925117 61.176167 59.73097  58.7134   58.423645 57.11752
 56.474472 55.85696  54.98844 