In [1]:
import pandas as pd
import numpy as np

# Read the source workbook into a DataFrame
source_path = '/nfs/cc-filer/home/sabulikailik/new_images/A1818/Processed_Images_Data18_filtered.xlsx'
df = pd.read_excel(source_path)

# Report the row count, drop fully duplicated rows, then report the row count again
print("Number of rows before removing duplicates:")
print(f"df1: {len(df)}")
df = df.drop_duplicates()
print("\nNumber of rows after removing duplicates:")
print(f"df1: {len(df)}")



# Per-feature cluster centers, listed as (center of cluster 1, center of cluster 2).
_center_pairs = [
    ('area', (411, 805)),
    ('pleomorphism', (0.9, 0.95)),
    ('elongation', (0.54, 0.76)),
    ('mean_intensity_DAPI', (819, 1411)),
    ('total_intensity_DAPI', (429732, 1048899)),
    ('avg_edge_length', (40.69, 121.11)),
    ('median_K14', (123, 664.5)),
    ('median_K18', (293, 475)),
]

# Map feature name -> {cluster label: one-element array holding that cluster's center}
cluster_centers = {
    feature_name: {1: np.array([first_center]), 2: np.array([second_center])}
    for feature_name, (first_center, second_center) in _center_pairs
}


Number of rows before removing duplicates:
df1: 455568

Number of rows after removing duplicates:
df1: 455568


In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import json

def parse_cellularity(cellularity_str):
    """Extract the neighbor count and mean edge length from a cellularity record.

    The record is the text of a dict holding a 'neighbor_names' list and an
    'edge_lengths' list. Single quotes are swapped for double quotes before
    parsing so that a Python-style dict repr can be read as JSON.

    Parameters
    ----------
    cellularity_str : str
        Serialized cellularity record.

    Returns
    -------
    tuple
        ``(number_of_neighbors, avg_edge_length)``. ``avg_edge_length`` is 0
        when the record has no edges. ``(None, None)`` is returned when the
        input is not a string (e.g. NaN for an empty cell), cannot be parsed
        as JSON, or does not have the expected keys/structure.
    """
    # A non-string value has no .replace(); treat it like any other
    # unparseable record instead of raising AttributeError.
    if not isinstance(cellularity_str, str):
        return None, None

    try:
        # Convert the string to a dictionary
        cellularity_dict = json.loads(cellularity_str.replace("'", '"'))

        neighbor_names = cellularity_dict['neighbor_names']
        edge_lengths = cellularity_dict['edge_lengths']

        num_neighbors = len(neighbor_names)

        if len(edge_lengths) > 0:
            avg_edge_length = sum(edge_lengths) / len(edge_lengths)
        else:
            avg_edge_length = 0  # Avoid division by zero if there are no edges
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed text, missing keys, or a payload of the wrong shape
        return None, None

    return num_neighbors, avg_edge_length
    
# Parse every 'cellularity' entry into a (neighbor count, mean edge length) pair
# and store the two parts as new columns.
parsed_cellularity = df['cellularity'].apply(parse_cellularity)
df[['number_of_neighbors', 'avg_edge_length']] = parsed_cellularity.tolist()


In [3]:
import pandas as pd
import numpy as np

# Function to calculate Euclidean distance
def euclidean_distance(x, y):
    """Return the norm of ``x - y`` as computed by ``np.linalg.norm``."""
    difference = x - y
    return np.linalg.norm(difference)

# Function to assign clusters based on Euclidean distance
def assign_clusters(value, cluster_means):
    """One-hot encode which cluster center(s) lie nearest to ``value``.

    Parameters
    ----------
    value : array-like
        Point to classify; compared with each center via ``euclidean_distance``.
    cluster_means : dict
        Maps a sortable cluster label to that cluster's center.

    Returns
    -------
    list of int
        One entry per cluster in sorted label order: 1 where that cluster's
        distance equals the smallest distance, otherwise 0. Clusters that are
        exactly equidistant are all flagged 1. A NaN distance never compares
        equal to the minimum, so a NaN ``value`` yields all zeros. An empty
        ``cluster_means`` yields an empty list.
    """
    distances = {label: euclidean_distance(value, center) for label, center in cluster_means.items()}
    if not distances:
        return []
    # Find the minimum once instead of recomputing it for every cluster.
    nearest = min(distances.values())
    return [1 if distances[label] == nearest else 0 for label in sorted(distances)]

# For every feature, one-hot encode the nearest cluster center into
# '<feature>_<label>' columns (e.g. 'area_1', 'area_2').
# The computation is vectorised with numpy: building a pd.Series per row through
# DataFrame.apply is very slow on large frames. The semantics match
# assign_clusters: a column is 1 where that center's distance equals the row's
# minimum distance (ties flag every tied center) and rows whose value is NaN get
# 0 in every column.
for feature, centers in cluster_centers.items():
    labels = sorted(centers)
    # (n_clusters, n_dims) matrix of centers, ordered by sorted label
    center_matrix = np.stack([np.asarray(centers[label], dtype=float).ravel() for label in labels])
    feature_values = df[feature].to_numpy(dtype=float)
    # distances[i, j] = Euclidean distance from row i's value to center j
    distances = np.linalg.norm(feature_values[:, None, None] - center_matrix[None, :, :], axis=2)
    nearest = distances.min(axis=1, keepdims=True)
    one_hot = (distances == nearest).astype(np.int64)
    for position, label in enumerate(labels):
        df[f'{feature}_{label}'] = one_hot[:, position]



In [4]:
# Write the DataFrame, without its index, to an Excel workbook in the working directory
clustered_output_path = 'Processed_Images_Data18K_filtered_clustered.xlsx'
df.to_excel(clustered_output_path, index=False)