In [6]:
# Mount Google Drive at /content/gdrive so the project files become reachable.
# This import fails with ModuleNotFoundError when the `google.colab` package is
# not installed (see the recorded output of this cell).
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import glob
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm_notebook
import pickle
from time import sleep


In [None]:
# Global list of the tab-separated *.txt input files; fit_scaler_and_save and
# transform_and_save read this variable directly.
# NOTE(review): the path is relative although Drive is mounted at
# '/content/gdrive' — presumably relies on the working directory being
# '/content'; confirm.
data_files = glob.glob('gdrive/My Drive/ML Project/Normalized Data/*.txt')

# Scaling

In [None]:
def fit_scaler_and_save(scaler, output_scaler_name, checkpoint_every=15):
  """
  Incrementally fit ``scaler`` on every file in ``data_files`` and pickle it.

  ``data_files`` needs to be a global variable holding the paths of the
  tab-separated input files. Each file is read on its own and passed to
  ``scaler.partial_fit`` so the whole data set never has to be in memory.

  Args:
      scaler: a Scaler object exposing ``partial_fit`` (fitted in place).
      output_scaler_name: base name (without extension) of the pickle file
          written to ``gdrive/My Drive/ML Project/``.
      checkpoint_every: write an intermediate pickle after every this many
          files; ``0`` or ``None`` disables intermediate checkpoints.
          The final pickle is always written.
  """
  scaler_path = f'gdrive/My Drive/ML Project/{output_scaler_name}.pickle'

  def _dump_scaler():
    # Single place that persists the current state of the scaler.
    with open(scaler_path, 'wb') as pickled_scaler:
      pickle.dump(scaler, pickled_scaler)

  for files_done, file in enumerate(tqdm_notebook(data_files), start=1):
    df = pd.read_csv(file, sep='\t')
    # Only the columns from position 7 onwards are scaled; the first seven
    # columns are carried through untouched by transform_and_save.
    scaler.partial_fit(df.iloc[:, 7:])
    if checkpoint_every and files_done % checkpoint_every == 0:
      _dump_scaler()
      # Report only after the checkpoint has actually been written.
      print(f'Saved {output_scaler_name} after {files_done} files.')
  _dump_scaler()
  
def transform_and_save(scaler, output_dir):
  """
  Transform all data_files (which needs to be a global variable)
  by the fitted scaler object and write one scaled file per input file.

  The output path is the input path with " - Scaled" inserted before the
  extension and every occurrence of 'Normalized Data' replaced by
  ``output_dir``. The output is written with ``DataFrame.to_csv`` defaults,
  i.e. comma-separated and with the row index as the first column, even
  though the file keeps its original extension.

  Args:
      scaler: A fitted Scaler object exposing ``transform``
      output_dir: folder name that replaces 'Normalized Data' in the path;
          it is created if it does not exist yet

  """
  for file in tqdm_notebook(data_files):
    # `sep` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    df = pd.read_csv(file, sep='\t')

    stem, extension = os.path.splitext(file)
    new_name = f'{stem} - Scaled{extension}'.replace('Normalized Data', output_dir)
    target_folder = os.path.dirname(new_name)
    if target_folder:
      os.makedirs(target_folder, exist_ok=True)

    # The first seven columns are passed through; the rest are scaled.
    passthrough_columns = df.iloc[:, :7]
    scaled_parameters = df.iloc[:, 7:]
    scaled_df = pd.DataFrame(
        scaler.transform(scaled_parameters),
        columns=list(scaled_parameters.columns),
        index=df.index,
    )
    final_df = pd.concat([passthrough_columns, scaled_df], axis=1)
    # The index column is written on purpose: the clustering cells read these
    # files back and take the features from column position 8 onwards.
    final_df.to_csv(new_name)

### Standard Scaler


In [None]:
# Fit a StandardScaler over all data_files and pickle it.
# The (misspelled) name 'stadard_scaler' is also the file name the load cell
# opens, so the two must be changed together.
scaler = StandardScaler()
fit_scaler_and_save(scaler, 'stadard_scaler')

In [None]:
# Load trained Scaler into memory
# (lets the transform step run without refitting; only unpickle files that
# this notebook wrote itself, since pickle.load can execute arbitrary code).
with open('gdrive/My Drive/ML Project/stadard_scaler.pickle', 'rb') as pickled_scaler:
    scaler = pickle.load(pickled_scaler)

In [None]:
# Write the standard-scaled copy of every input file; the second argument
# replaces 'Normalized Data' in each output path.
transform_and_save(scaler, 'Normalized and scaled data')

### MinMaxScaler

In [None]:
# Rebinds `scaler` to a fresh MinMaxScaler (replacing the StandardScaler
# above), fits it over all data_files and pickles it as 'MinMax_scaler'.
scaler = MinMaxScaler()
fit_scaler_and_save(scaler, 'MinMax_scaler')

In [None]:
# Uses the in-memory `scaler` from the previous cell (the MinMax pickle is not
# reloaded here). The output folder is the one the K-Means cells glob.
transform_and_save(scaler, 'Normalized and scaled data (MinMax Scaler)')

# Mini Batch K Means

## Run MiniBatch K Means

In [7]:
from sklearn.cluster import MiniBatchKMeans
import numpy as np

# Number of scaled files get_new_df concatenates per training step (its default).
FILES_IN_BATCH=3
# Passed to MiniBatchKMeans as batch_size.
BATCH_SIZE = 100
# Passed to MiniBatchKMeans as max_iter.
MAX_ITER = 10
# Largest cluster count tried; one model is kept for each k in 2..MAX_CLUSTERS.
MAX_CLUSTERS=10


In [8]:
def concat_files(file_list: list) -> pd.DataFrame:
  '''
  Read every CSV file named in file_list and stack the rows into a single
  DataFrame with a fresh 0..n-1 index.

  Args:
      file_list: paths of the CSV files to read (first row is the header)

  Returns:
      The row-wise concatenation of all files, in the order given
  '''
  frames = [pd.read_csv(path, index_col=None, header=0) for path in file_list]
  return pd.concat(frames, axis=0, ignore_index=True)

def get_new_df(num_of_files: int = FILES_IN_BATCH):
  """
  Out of the global variable SCALED_FILES choose num_of_files distinct files
  at random, concatenate them into one dataframe and remove them from
  SCALED_FILES so they are not used again.

  Args:
      num_of_files: The number of files to concatenate; capped at the number
          of files still left in SCALED_FILES

  Returns:
    One large dataframe

  Raises:
    ValueError: if SCALED_FILES is empty
  """
  if not SCALED_FILES:
    raise ValueError('SCALED_FILES is empty, there is nothing left to load')

  actual_file_num = min(num_of_files, len(SCALED_FILES))
  # replace=False is essential: np.random.choice samples WITH replacement by
  # default, which could pick the same file twice in one batch (duplicating
  # its rows) while leaving another file for a later batch.
  chosen = np.random.choice(SCALED_FILES, actual_file_num, replace=False)
  # Convert numpy strings back to plain str before using them as paths.
  file_list = [str(path) for path in chosen]
  df = concat_files(file_list)
  # Remove only after the batch was read successfully. Every chosen file is
  # distinct, so each removal must succeed; no exception is swallowed.
  for file in file_list:
    SCALED_FILES.remove(file)
  return df

In [9]:
# Fresh training state: step counter, the list of files still to be consumed,
# and one untrained MiniBatchKMeans per cluster count k = 2..MAX_CLUSTERS
# (so the model for k lives at kmeans_array[k - 2]).
iteration_counter = 0
SCALED_FILES = glob.glob('gdrive/My Drive/ML Project/Normalized and scaled data (MinMax Scaler)/*')
kmeans_array = [
    MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=BATCH_SIZE, max_iter=MAX_ITER)
    for k in range(2, MAX_CLUSTERS + 1)
]

In [10]:
### IF NEEDED TO LOAD BACK DATA:
# kmeans_array = []
# for k in range(2,MAX_CLUSTERS+1):
#   with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'rb') as pickled_kmeans:
#         kmeans_array.append(pickle.load(pickled_kmeans))

# with open('gdrive/My Drive/ML Project/files_left.pickle', 'rb') as files_left:
#   SCALED_FILES = pickle.load(files_left)

In [11]:
# Train every model in kmeans_array incrementally: each step loads a random
# batch of files (get_new_df also removes them from SCALED_FILES) and feeds it
# to partial_fit. Every 15th step the models and the list of files still left
# are pickled so an interrupted run can be resumed.
with tqdm_notebook(total=len(SCALED_FILES)) as pbar:
  while SCALED_FILES:
    files_before = len(SCALED_FILES)
    df = get_new_df()
    # Column 0 is the index column written by to_csv and columns 1-7 are the
    # pass-through columns, so the scaled features start at position 8.
    # fillna returns a new frame; the previous inplace fill on an iloc slice
    # triggered SettingWithCopyWarning.
    # NOTE(review): these files come from the MinMax-scaled folder, so 0 is the
    # feature minimum rather than the mean; the apply step fills with the
    # column mean instead — confirm which imputation is intended.
    features_only = df.iloc[:, 8:].fillna(0)
    for kmeans in kmeans_array:
      kmeans.partial_fit(features_only)
    if iteration_counter % 15 == 0:
      for k in range(2, MAX_CLUSTERS + 1):
        with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'wb') as pickled_kmeans:
          # Dump the model that belongs to this k. Dumping the leftover loop
          # variable `kmeans` wrote the last model into every file.
          pickle.dump(kmeans_array[k - 2], pickled_kmeans)
      with open('gdrive/My Drive/ML Project/files_left.pickle', 'wb') as files_left:
        pickle.dump(SCALED_FILES, files_left)
    # Advance by the number of files actually consumed (the last batch may be
    # smaller than FILES_IN_BATCH).
    pbar.update(files_before - len(SCALED_FILES))

    iteration_counter += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm_notebook(total=len(SCALED_FILES)) as pbar:


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [12]:
# Save kmeans
for k in range(2,MAX_CLUSTERS+1):
  with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'wb') as pickled_kmeans:
    pickle.dump(kmeans, pickled_kmeans)

FileNotFoundError: [Errno 2] No such file or directory: 'gdrive/My Drive/ML Project/minibatchkmeans_minmax_2.pickle'

## Apply K-Means

In [None]:
def scale_file(file:str):
  """
  Assign the rows of one scaled file to clusters with every model in the
  global kmeans_array and save the result as a CSV in the global save_dir.

  The output keeps the first eight columns of the input and adds one column
  'K=<k>' per model (k = 2..MAX_CLUSTERS) holding the predicted cluster.
  Despite its name, this function does not scale anything.

  Args:
      file: path of a scaled file readable by pd.read_csv; the output file
          takes its base name with '.txt' replaced by '.csv'
  """
  print("started")
  df = pd.read_csv(file)
  df_info = df.iloc[:,:8]
  df_parameters = df.iloc[:,8:]
  # Take the mean over the feature columns only. df.mean() over the whole
  # frame also covers the eight info columns and raises on pandas >= 2.0 if
  # any of them is non-numeric; the fill values for the features are the same.
  df_parameters = df_parameters.fillna(df_parameters.mean())
  # Build all prediction columns first and concatenate once instead of
  # re-copying the frame for every k.
  cluster_columns = [
      pd.Series(kmeans_array[k-2].predict(df_parameters), name=f'K={k}')
      for k in range(2,MAX_CLUSTERS+1)
  ]
  df_info = pd.concat([df_info, *cluster_columns],axis=1)
  name = os.path.basename(file).replace('.txt', '.csv')
  df_info.to_csv(save_dir+'/'+name)
  print(f"saved {save_dir+'/'+name}")

In [None]:
# NOTE(review): ThreadPoolExecutor / as_completed are not used in the visible
# cells; the files are processed sequentially below.
from concurrent.futures import ThreadPoolExecutor, as_completed
# Load kmeans into memory
# kmeans_array[k - 2] is the model trained with n_clusters == k.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
kmeans_array = []
for k in range(2,MAX_CLUSTERS+1):
  with open(f'gdrive/My Drive/ML Project/minibatchkmeans_minmax_{k}.pickle', 'rb') as pickled_kmeans:
        kmeans_array.append(pickle.load(pickled_kmeans))

SCALED_FILES = glob.glob('gdrive/My Drive/ML Project/Normalized and scaled data (MinMax Scaler)/*')
save_dir = 'gdrive/My Drive/ML Project/Kmean Mini Batch (MinMax)'
# to_csv does not create missing folders.
os.makedirs(save_dir, exist_ok=True)
for file in tqdm_notebook(SCALED_FILES):
  df = pd.read_csv(file)
  df_info = df.iloc[:,:8]
  df_parameters = df.iloc[:,8:]
  # Mean over the feature columns only: df.mean() on the whole frame also
  # covers the eight info columns and raises on pandas >= 2.0 if any of them
  # is non-numeric. The fill values for the features are unchanged.
  df_parameters = df_parameters.fillna(df_parameters.mean())
  # One 'K=<k>' column of predicted cluster labels per model, added at once.
  cluster_columns = [
      pd.Series(kmeans_array[k-2].predict(df_parameters), name=f'K={k}')
      for k in range(2,MAX_CLUSTERS+1)
  ]
  df_info = pd.concat([df_info, *cluster_columns],axis=1)
  name = os.path.basename(file).replace('.txt', '.csv')
  df_info.to_csv(save_dir+'/'+name)
  print(f"saved {save_dir+'/'+name}")
print('Saved All Files')

From nuclei clustering extract image clustering

In [None]:
# Scratch cell: reads every per-nucleus K-Means result file and prints its
# first eight columns. Nothing is aggregated or written here; the "hello" and
# "1" prints are progress markers only.
print("hello")
NUCLEI_KMEANS_RESULTS = glob.glob('gdrive/Shared with me/ML Project/ron_test/*')
# NOTE(review): this rebinds the global save_dir that scale_file reads, and the
# path is absolute ('/My Drive/...') unlike the other 'gdrive/My Drive/...'
# paths — presumably the 'gdrive' prefix is missing; confirm before writing
# anything here. save_dir is not used in the rest of this cell.
save_dir = '/My Drive/ML Project/ron_test/results1'
for file in tqdm_notebook(NUCLEI_KMEANS_RESULTS):
  print("1")
  df = pd.read_csv(file)
  # First eight columns: presumably the index column plus the seven
  # pass-through columns, as in the apply step — TODO confirm for these files.
  df_info = df.iloc[:,:8]
  print(df_info)
  print("hello")

