In [1]:
# Imports

from PIL import Image
import tifffile
import os
import math
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Flags

# Use MPS if available?
USE_MPS = False # CNN stuff doesn't work (particularly 3D CNN) with MPS. Only CUDA works.

# Use Lower Quality features (bigger dataset but quite possibly shit data) for training?
USE_LQ_FEATURES = False

# Use ensemble method or use padding?
USE_ENSEMBLE = False

LOAD_WEIGHTS = False # Load pretrained weights if they exist?
TRAIN_MODEL = True # Train model? Can train on pre-loaded weights also
SAVE_WEIGHTS = True # Save weights after training?

ENSEMBLE_WEIGHTS = "ensemble_model_weights.pth" # Weights for ensemble learning model
PADDING_WEIGHTS = "padding_model_weights.pth" # Weights for padding based model

RANDOM_SEED = 42069 # Seed to use for all random operations

hq_csv_file_path = 'full_Table_HIGH_QUAL.csv'
lq_csv_file_path = 'full_Table_STRIPPED_CLEANED.csv'
image_folder_path = "./Images_Of_Networks/tiff/"


In [3]:
data = pd.read_csv('full_table_all_Cells_Uncleaned.csv')

# cleaning the data 
import io

csv_file_path = 'full_table_all_Cells_Uncleaned.csv'
buffer = io.StringIO()

# Open and process the CSV file, to strip entries so that numbers aren't read as strings by read_csv and column names have no leading/trailing whitespaces
with open(csv_file_path, 'r') as file:
    for line in file:
        cleaned_line = ','.join(cell.strip() for cell in line.split(','))
        buffer.write(cleaned_line + '\n')

# Move the buffer cursor to the start
buffer.seek(0)

# Read the cleaned data into pandas
data = pd.read_csv(buffer)
edge_columns = [col for col in data.columns if col.startswith('edge_') and col.split('_')[-1].isdigit()]
# Replace empty entries with 0 in these colu"mns
data[edge_columns] = data[edge_columns].fillna(0)

data['cell_group'] = data['cc_pixel_intensity_ratio']
data['image_name'] = data['folder_name_x'] + '_' + data['cc_x'].astype(str)
data['file_name'] = data['image_name'] + '.tif'


# Create the new column based on the condition
data['cell_group'] = data['cell_group'].apply(lambda x: 1 if x > 0.1 else 0)


  data['cell_group'] = data['cc_pixel_intensity_ratio']
  data['image_name'] = data['folder_name_x'] + '_' + data['cc_x'].astype(str)
  data['file_name'] = data['image_name'] + '.tif'


In [4]:
# if the column has Angle or Edge, drop it
# rename angle flag to ngle_flag

data = data.rename(columns = {'Angle_Flag': 'ngle_Flag'})
data = data.rename(columns = {'edges' : 'temp_ed'})

data = data.drop(data.columns[data.columns.str.contains('Angle')], axis = 1)
data = data.drop(data.columns[data.columns.str.contains('Edge')], axis = 1)
data = data.drop(data.columns[data.columns.str.contains('edge')], axis = 1)

# so first I wrote a script that takes foldernamex , “_” , the file index to combine it into file_name and appended .tif to it do this

data['file_name'] = data['folder_name_x'] + '_' + data['cc_x'].astype(str) + '.tif'
# rename 'clustering coefficient' to 'clustering_coefficent'
data = data.rename(columns = {'clustering coefficient': 'clustering_coefficient', 'ngle_Flag': 'Angle_Flag', 'temp_ed': 'edges'})

# do not remove the 'degree_distribution' column
# remove the Nan values from the dataframe
data = data.dropna(subset = ['degree_distribution'])
# fill Angle_Flag with 0
data['Angle_Flag'] = data['Angle_Flag'].fillna(0)

clean_df = pd.read_csv(hq_csv_file_path)

data = data.fillna(0, )

for x in data.columns: 
    if x == 'degree_distribution': 
        continue
    if x not in clean_df.columns: 
        data = data.drop(x, axis = 1)



In [5]:
# drop none in the degree distribution column
# fill nan with 0 
data 
import ast
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except:
        print(type(x))
        return x
    
data['degree_distribution'] = data['degree_distribution'].apply(safe_literal_eval)

In [6]:
max_degree = max(max(row.keys(), default=0) for row in data['degree_distribution'])

# Initialize columns with zeros
for i in range(1, max_degree + 1):
    data[f'Degree_{i}'] = 0

# Populate the columns with the counts from the degree distribution
for index, row in data.iterrows():
    for degree, count in row['degree_distribution'].items():
        data.at[index, f'degree_distribution_{degree}'] = count

# Drop the original 'degree_distribution' column
data.drop('degree_distribution', axis=1, inplace=True)

In [7]:
for x in data.columns: 
    if x not in clean_df.columns: 
        data = data.drop(x, axis = 1)
        print(x)

Degree_1
Degree_2
Degree_3
Degree_4
Degree_5
Degree_6
Degree_7
Degree_8
Degree_9
Degree_10
Degree_11


In [8]:
for y in clean_df.columns: 
    if y not in data.columns: 
        print(y)

In [9]:
data.to_csv('All_Groups.csv', index = False)