In [None]:
%reload_ext autoreload
%autoreload 2
import ast
import pickle
import itertools
from collections import Counter
from tqdm import tqdm
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import cv2
import swifter

from analysis.generate_cluster_information_file import load, extract_all_information_query, to_df_query
from baseline.image_processing import pixel_intensity_histogram

In [None]:
# Load the cluster-information table via the project helper imported from
# analysis.generate_cluster_information_file, then preview the first 5 rows.
# NOTE(review): load() is defined outside this notebook; the rest of the
# notebook treats its result as a pandas DataFrame — confirm.
df = load()
df.head(5)

In [None]:
# pandas stores list-valued cells as their string representation, so every
# list column has to be parsed back into real Python objects after loading.
for _list_col in (
    'cluster_sizes',
    'cluster_centers',
    'cluster_peak_intensities',
    'cluster_num_intensities',
):
    # swifter parallelises the element-wise apply; literal_eval only accepts
    # Python literals, so it is safe on the stored strings.
    df[_list_col] = df[_list_col].swifter.apply(ast.literal_eval)

# Sanity check: the first cluster_sizes entry must now be a genuine list.
assert type(df['cluster_sizes'].iloc[0]) == list

df.head(10)

In [None]:
# Point plot (marker 'o') of background_threshold against score.
df.plot(x='score', y='background_threshold', style='o')

In [None]:
# Point plot (marker 'o') of the number of clusters against score.
df.plot(x='score', y='cluster_num', style='o')

In [None]:
# Per-row averages of the list-valued cluster columns.
#
# The output arrays are sized from the frame itself rather than a hard-coded
# row count: a fixed size raises IndexError when the frame is longer and
# leaves uninitialised np.empty() values when it is shorter.
n_rows = len(df.index)
cluster_sizes_avg = np.empty(n_rows)
cluster_peak_intensities_avg = np.empty(n_rows)
cluster_num_intensities_avg = np.empty(n_rows)

# Columns are read positionally (3, 4, 5).
# NOTE(review): judging by the target array names these positions hold
# cluster_sizes, cluster_peak_intensities and cluster_num_intensities —
# confirm against the column order produced by load().
for i in tqdm(range(n_rows)):
    cluster_sizes_avg[i] = np.average(df.iloc[i, 3])
    cluster_peak_intensities_avg[i] = np.average(df.iloc[i, 4])
    cluster_num_intensities_avg[i] = np.average(df.iloc[i, 5])

In [None]:
# df_avg is an alias of df (no copy is made), so the columns added below are
# also visible through df — the plotting cells that follow rely on this.
df_avg = df
for _col_name, _col_values in (
    ('cluster_sizes_avg', cluster_sizes_avg),
    ('cluster_peak_intensities_avg', cluster_peak_intensities_avg),
    ('cluster_num_intensities_avg', cluster_num_intensities_avg),
):
    df_avg[_col_name] = _col_values
df_avg.head()

In [None]:
# Point plot of the per-row average of cluster_num_intensities against score.
# The column is available on df because df_avg is the same object as df.
df.plot(x='score', y='cluster_num_intensities_avg', style='o')

In [None]:
# Point plot of cluster_num against score (repeats an earlier cell's plot,
# shown here next to the averaged features for comparison).
df.plot(x='score', y='cluster_num', style='o')

In [None]:
# Point plot of the per-row average of cluster_peak_intensities against score.
df.plot(x='score', y='cluster_peak_intensities_avg', style='o')

In [None]:
# Point plot of the per-row average of cluster_sizes against score.
df.plot(x='score', y='cluster_sizes_avg', style='o')

In [None]:
# Flatten the table to one entry per cluster: each cluster contributes its
# row's score (column 0) and its own (x, y) centre (taken from column 6).
# Column 2 supplies how many centres to read for the row.
scores, cluster_x, cluster_y = [], [], []

for i in tqdm(range(len(df.index))):
    score, cluster_num, cluster_centers = df.iloc[i, 0], df.iloc[i, 2], df.iloc[i, 6]
    for j in range(cluster_num):
        scores.append(score)
        x, y = cluster_centers[j]
        cluster_x.append(x)
        cluster_y.append(y)

In [None]:
# Thin the flattened per-cluster lists for plotting: keep every
# SAMPLE_STRIDE-th entry (indices 0, SAMPLE_STRIDE, 2*SAMPLE_STRIDE, ...).
# Slicing picks exactly the elements the former index loop did; the three
# lists are filled in lock-step, so the samples stay aligned.
SAMPLE_STRIDE = 100000

scores_sample = scores[::SAMPLE_STRIDE]
cluster_x_samples = cluster_x[::SAMPLE_STRIDE]
cluster_y_samples = cluster_y[::SAMPLE_STRIDE]

# Show how many samples were kept.
np.array(scores_sample).shape

In [None]:
# One entry per row of df: the row's score plus the mean x and mean y of its
# cluster centres. Column 0 is read as the score, column 2 as the number of
# clusters, column 6 as the list of (x, y) centres.
n_rows = len(df.index)
scores_avg = np.empty(n_rows)
cluster_x_avg = np.empty(n_rows)
cluster_y_avg = np.empty(n_rows)

for i in tqdm(range(n_rows)):
    scores_avg[i] = df.iloc[i, 0]
    cluster_num = df.iloc[i, 2]
    cluster_centers = df.iloc[i, 6]

    # Gather the coordinates of the first cluster_num centres.
    xs, ys = [], []
    for j in range(cluster_num):
        x, y = cluster_centers[j]
        xs.append(x)
        ys.append(y)

    # A row without clusters yields NaN here (mean of an empty array).
    cluster_x_avg[i] = np.average(np.array(xs, dtype=float))
    cluster_y_avg[i] = np.average(np.array(ys, dtype=float))

In [None]:
# 3D scatter of the per-row mean cluster centre (x, y) against the score.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(cluster_x_avg, cluster_y_avg, scores_avg)
ax.set(xlabel='cluster_x', ylabel='cluster_y', zlabel='scores')
plt.show()

In [None]:
# 3D scatter of the sub-sampled individual cluster centres against the score.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(cluster_x_samples, cluster_y_samples, scores_sample)
ax.set(xlabel='cluster_x', ylabel='cluster_y', zlabel='scores')
plt.show()

In [None]:
# Training features, one row per row of df. Column order:
# [cluster_num, cluster_num_intensities_avg, cluster_peak_intensities_avg, cluster_x_avg, cluster_y_avg]
#
# Built in a single vectorised step instead of filling the matrix row by row
# with scalar df.iloc lookups; the result is the same float64 (n_rows, 5) array.
n_rows = len(df.index)

train_X = np.column_stack([
    df.iloc[:, 2].to_numpy(dtype=float),         # cluster_num (positional column 2)
    cluster_num_intensities_avg[:n_rows],        # only the first n_rows entries are used
    cluster_peak_intensities_avg[:n_rows],
    cluster_x_avg,
    cluster_y_avg,
])

# Targets: the per-row scores (same array object as scores_avg, not a copy).
train_y = scores_avg

In [None]:
# Wrap the feature matrix and the targets in labelled frames and write them
# to CSV without the index column.
feature_names = [
    'cluster_num',
    'cluster_num_intensities_avg',
    'cluster_peak_intensities_avg',
    'cluster_x_avg',
    'cluster_y_avg',
]
df_X = pd.DataFrame({name: train_X[:, k] for k, name in enumerate(feature_names)})
df_y = pd.DataFrame({'score': scores_avg})

df_X.to_csv('train_X.csv', index=False)
df_y.to_csv('train_y.csv', index=False)

**Preparing data for query**

In [None]:
# Run the project's query-side extraction on the relative path data/query.
# NOTE(review): extract_all_information_query is defined in
# analysis.generate_cluster_information_file; its return structure is not
# visible here — it is only passed on to to_df_query in the next cell.
test_X = extract_all_information_query(os.path.join('data','query'))

In [None]:
# Convert the extracted query information to a frame, save it, and preview it.
# NOTE(review): unlike the training CSVs, this file is written with the
# default index column (no index=False) — confirm that is intended.
test_df = to_df_query(test_X)
test_df.to_csv('test_df.csv')
test_df.head()