In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

KeyboardInterrupt: ignored

In [None]:
import glob
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm_notebook
import pickle
from time import sleep


In [None]:
def remove_dups(path: str) -> None:
  data_files = glob.glob(f'{path}/*')
  len(data_files)
  data_files_names = [file.split('/')[-1] for file in data_files]
  dup_files = {}
  for file in data_files_names:
    file_name = file[:file.find('.png')]
    if file_name in dup_files:
      dup_files.get(file_name).append(file)
    else:
      dup_files[file_name] = [file]

  for item in dup_files.items():
    if len(item[1])>1:
      print(f"File Name: {item[0]}\nFiles: {item[1]}")
      file2remove = item[1][0]
      print(f"Removing: {file2remove}\n\n")
      os.remove(f"{path}/{file2remove}")

In [None]:
len(glob.glob('gdrive/My Drive/ML Project/Kmeans Mini Batch (Standard Scaler)/*'))

2048

In [None]:
data_files = glob.glob('gdrive/My Drive/ML Project/Raw Data/Normalized Data/*.txt')

# Scaling

In [None]:
def fit_scaler_and_save(scaler, output_scaler_name):
  i = 0
  for file in tqdm_notebook(data_files):
      df = pd.read_csv(file,sep='\t')
      scaled_parameters = df.iloc[:,7:]
      scaler.partial_fit(scaled_parameters)
      i+=1
      if i%15 == 0:
        print(f'Saved {output_scaler_name} after {i} files.')
        with open(f'gdrive/My Drive/ML Project/{output_scaler_name}.pickle', 'wb') as pickled_scaler:
          pickle.dump(scaler, pickled_scaler)
  with open(f'gdrive/My Drive/ML Project/{output_scaler_name}.pickle', 'wb') as pickled_scaler:
    pickle.dump(scaler, pickled_scaler)
  
def transform_and_save(scaler, output_dir):
  for file in tqdm_notebook(data_files):
    df = pd.read_csv(file,'\t')
    last_dot = file.rfind('.')
    new_name = file[:last_dot]+" - Scaled"+file[last_dot:]
    new_name = new_name.replace('Normalized Data', output_dir)
    scaled_parameters = df.iloc[:,7:]
    final_df = pd.concat([df.iloc[:,:7],pd.DataFrame(scaler.transform(scaled_parameters), columns=list(scaled_parameters.columns))],axis=1)
    final_df.to_csv(new_name)

### Standard Scaler


In [None]:
scaler = StandardScaler()
fit_scaler_and_save(scaler, 'stadard_scaler')

In [None]:
# Load trained Scaler into memory
with open('gdrive/My Drive/ML Project/stadard_scaler.pickle', 'rb') as pickled_scaler:
    scaler = pickle.load(pickled_scaler)

In [None]:
transform_and_save(scaler, 'Normalized and scaled data')

### MinMaxScaler

In [None]:
scaler = MinMaxScaler()
fit_scaler_and_save(scaler, 'MinMax_scaler')

In [None]:
transform_and_save(scaler, 'Normalized and scaled data (MinMax Scaler)')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=2049.0), HTML(value='')))




# Mini Batch K Means

## Run MiniBatch K Means

In [None]:
from sklearn.cluster import MiniBatchKMeans
import numpy as np

FILES_IN_BATCH=3
BATCH_SIZE = 100
MAX_ITER = 10
MAX_CLUSTERS=10


In [None]:
def concat_files(file_list: list) -> pd.DataFrame:
  '''
  Given a list of indices in SCALED_FILES 
  return one concatenated DataFrame of all of the files
  '''
  li = []

  for file in file_list:
      df = pd.read_csv(file, index_col=None, header=0)
      li.append(df)

  return pd.concat(li, axis=0, ignore_index=True)

def get_new_df(num_of_files: int = FILES_IN_BATCH):
  actual_file_num = min(num_of_files, len(SCALED_FILES))
  file_list = np.random.choice(SCALED_FILES, actual_file_num)
  df = concat_files(file_list)
  for file in file_list:
    try:
      SCALED_FILES.remove(file)
    except Exception:
      print(file)
  return df

In [None]:
iteration_counter = 0
SCALED_FILES = glob.glob('gdrive/My Drive/ML Project/Normalized and scaled data (MinMax Scaler)/*')
kmeans_array = []
for k in range(2,MAX_CLUSTERS+1):
  kmeans_array.append(MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=BATCH_SIZE, max_iter=MAX_ITER))

In [None]:
### IF NEEDED TO LOAD BACK DATA:
# kmeans_array = []
# for k in range(2,MAX_CLUSTERS+1):
#   with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'rb') as pickled_kmeans:
#         kmeans_array.append(pickle.load(pickled_kmeans))

# with open('gdrive/My Drive/ML Project/files_left.pickle', 'rb') as files_left:
#   SCALED_FILES = pickle.load(files_left)

In [None]:
with tqdm_notebook(total=len(SCALED_FILES)) as pbar:
  while SCALED_FILES:
    df = get_new_df()
    features_only = df.iloc[:,8:]
    features_only.fillna(0, inplace=True)  # As this is Standard Scaller 0 is the mean for other scaller maybe dropna would be better.
    for kmeans in kmeans_array:
      kmeans.partial_fit(features_only)
    if iteration_counter%15 == 0:
      for k in range(2,MAX_CLUSTERS+1):
        with open(f'gdrive/My Drive/ML Project/Pickled data/minibatchkmeans_minmax_{k}.pickle', 'wb') as pickled_kmeans:
          pickle.dump(kmeans_array[k-2], pickled_kmeans)
      with open('gdrive/My Drive/ML Project/files_left.pickle', 'wb') as files_left:
        pickle.dump(SCALED_FILES, files_left)
    pbar.update(FILES_IN_BATCH)

    iteration_counter+=1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=2049.0), HTML(value='')))

In [None]:
# Save kmeans
for k in range(2,MAX_CLUSTERS+1):
  with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'wb') as pickled_kmeans:
    pickle.dump(kmeans, pickled_kmeans)

## Apply K-Means

In [None]:
def scale_file(file:str):
  print("started")
  df = pd.read_csv(file)
  df_info = df.iloc[:,:8]
  df_parameters = df.iloc[:,8:].fillna(df.mean())
  for k in range(2,MAX_CLUSTERS+1):
    calssify_by_kmeans= kmeans_array[k-2].predict(df_parameters)
    df_info = pd.concat([df_info,pd.Series(calssify_by_kmeans, name=f'K={k}')],axis=1)
  name = file.split('/')[-1].replace('.txt', '.csv')
  df_info.to_csv(save_dir+'/'+name)
  print(f"saved {save_dir+'/'+name}")

In [None]:
# Load kmeans into memory
kmeans_array = []
for k in range(2,MAX_CLUSTERS+1):
  with open(f'gdrive/My Drive/ML Project/Pickled data/minibatchkmeans_minmax_{k}.pickle', 'rb') as pickled_kmeans:
        kmeans_array.append(pickle.load(pickled_kmeans))

SCALED_FILES = glob.glob('gdrive/My Drive/ML Project/Normalized and scaled data (MinMax Scaler)/*')
save_dir = 'gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)'
for file in tqdm_notebook(SCALED_FILES):
  df = pd.read_csv(file)
  df_info = df.iloc[:,:8]
  df_parameters = df.iloc[:,8:].fillna(df.mean())
  for k in range(2,MAX_CLUSTERS+1):
    calssify_by_kmeans= kmeans_array[k-2].predict(df_parameters)
    df_info = pd.concat([df_info,pd.Series(calssify_by_kmeans, name=f'K={k}')],axis=1)
  name = file.split('/')[-1].replace('.txt', '.csv')
  df_info.to_csv(save_dir+'/'+name)
  print(f"saved {save_dir+'/'+name}")
print('Saved All Files')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=2049.0), HTML(value='')))

saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region8.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region9.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region10.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region11.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region12.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax Scaler)/MB16CC004_6d236296-87b7-4316-9bbd-e24334a571e8.SCN - Series 1_Region13.png_HS.png - Scaled.csv
saved gdrive/My Drive/ML Project/Kme

From nuclei clustering extract imgae clustering

In [None]:
# Load kmeans into memory
kmeans_array = []
for k in range(2,MAX_CLUSTERS+1):
  with open(f'gdrive/My Drive/ML Project/Pickled data/minibatchkmeans_standard_{k}.pickle', 'rb') as pickled_kmeans:
        pickle.load(pickled_kmeans)
        kmeans_array.append(pickle.load(pickled_kmeans))

SCALED_FILES = glob.glob('gdrive/My Drive/ML Project/Normalized and scaled data (Standard Scaler)/*')
save_dir = 'gdrive/My Drive/ML Project/Kmeans Mini Batch (Standard Scaler)'
file = SCALED_FILES[0]
df = pd.read_csv(file)
df_info = df.iloc[:,:8]
df_parameters = df.iloc[:,8:].fillna(df.mean())
for k in range(2,MAX_CLUSTERS+1):
  calssify_by_kmeans= kmeans_array[k-2].predict(df_parameters)
  df_info = pd.concat([df_info,pd.Series(calssify_by_kmeans, name=f'K={k}')],axis=1)
name = file.split('/')[-1].replace('.txt', '.csv')
df_info.to_csv(save_dir+'/'+name)
print(f"saved {save_dir+'/'+name}")
print('Saved All Files')

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=10, max_no_improvement=10,
                n_clusters=10, n_init=3, random_state=0,
                reassignment_ratio=0.01, tol=0.0, verbose=0)


EOFError: ignored

In [None]:
kmeans_array

[MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                 init_size=None, max_iter=10, max_no_improvement=10,
                 n_clusters=2, n_init=3, random_state=0, reassignment_ratio=0.01,
                 tol=0.0, verbose=0),
 MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                 init_size=None, max_iter=10, max_no_improvement=10,
                 n_clusters=3, n_init=3, random_state=0, reassignment_ratio=0.01,
                 tol=0.0, verbose=0),
 MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                 init_size=None, max_iter=10, max_no_improvement=10,
                 n_clusters=4, n_init=3, random_state=0, reassignment_ratio=0.01,
                 tol=0.0, verbose=0),
 MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                 init_size=None, max_iter=10, max_no_improvement=10,
                 n_clusters=5, n_init=3, random_state=0, reassignment_ratio=