# In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import euclidean_distances
import warnings
# Install a blanket "ignore" filter: every warning category raised after this
# point is suppressed, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

def impute_missing_values(df, column, random_state=None):
    """Fill missing entries of ``df[column]`` by sampling its observed values.

    Replacement values are drawn with replacement from the non-missing values
    of the column, weighted by their relative frequency, so the imputed column
    keeps roughly the same value distribution.

    Args:
        df: DataFrame to impute. It is modified in place.
        column: Name of the column to impute.
        random_state: Optional seed (or ``numpy.random.Generator``) used for
            the draw. When ``None`` the global ``np.random`` state is used,
            which is the historical behaviour of this function.

    Returns:
        The same ``df`` object, for call chaining.
    """
    missing_mask = df[column].isna()
    if not missing_mask.any():
        return df

    value_dist = df[column].value_counts(normalize=True)
    if value_dist.empty:
        # Every entry is missing: there is nothing to sample from, so the
        # column is deliberately left untouched.
        return df

    # A dedicated generator makes a single call reproducible without the
    # caller having to seed (and thereby disturb) the global NumPy state.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    df.loc[missing_mask, column] = rng.choice(
        value_dist.index.to_numpy(),
        size=int(missing_mask.sum()),
        p=value_dist.to_numpy(),
    )
    return df

def _rename_if_free(frame, mapping):
    """Rename columns per *mapping*, skipping absent sources and taken targets."""
    applicable = {
        old: new
        for old, new in mapping.items()
        if old in frame.columns and new not in frame.columns
    }
    return frame.rename(columns=applicable)


def _mean_filled_numeric(series):
    """Coerce *series* to numeric and replace unparseable/missing values by the mean."""
    numeric = pd.to_numeric(series, errors='coerce')
    return numeric.fillna(numeric.mean())


def preprocess(merged_selected_df, harvard_selected_df):
    """Turn the patient and cell-line tables into comparable feature matrices.

    Both inputs are copied, their column names are harmonised to
    ``Age`` / ``Race`` / ``T Stage``, missing categorical values are imputed by
    frequency-weighted sampling, and ages are coerced to numbers with missing
    values replaced by the per-table mean. Only features present in *both*
    tables are used. The scaler and one-hot encoder are fitted on the patient
    table and then applied to the cell-line table, so both matrices share the
    same columns; categories absent from the patient table are ignored by the
    encoder (``handle_unknown='ignore'``).

    Args:
        merged_selected_df: Patient table; may contain ``Age``, ``Race`` and
            ``T Stage `` (with a trailing space) or ``T Stage``.
        harvard_selected_df: Cell-line table; may contain ``Donor Age``,
            ``Donor Ethnicity`` and ``T Stage``.

    Returns:
        A ``(merged_processed, harvard_processed)`` tuple of 2-D arrays with
        one row per input row. When the tables share no usable feature both
        arrays have zero columns.
    """
    merged = _rename_if_free(merged_selected_df.copy(), {'T Stage ': 'T Stage'})
    # 'Donor Age' has to be mapped onto 'Age'; otherwise the cell-line table
    # never exposes an 'Age' column and age drops out of the shared features.
    harvard = _rename_if_free(
        harvard_selected_df.copy(),
        {'Donor Ethnicity': 'Race', 'Donor Age': 'Age'},
    )

    categorical_candidates = ('Race', 'T Stage')
    for col in categorical_candidates:
        if col in merged.columns:
            merged = impute_missing_values(merged, col)
    for col in categorical_candidates:
        if col in harvard.columns:
            harvard = impute_missing_values(harvard, col)

    for frame in (merged, harvard):
        if 'Age' in frame.columns:
            frame['Age'] = _mean_filled_numeric(frame['Age'])

    shared_columns = set(merged.columns) & set(harvard.columns)

    numerical_features = []
    # An age column without a single valid value stays all-NaN after mean
    # filling; using it would put NaN into the feature matrices, so skip it.
    if (
        'Age' in shared_columns
        and merged['Age'].notna().any()
        and harvard['Age'].notna().any()
    ):
        numerical_features.append('Age')

    categorical_features = [col for col in categorical_candidates if col in shared_columns]
    for col in categorical_features:
        # Uniform string labels so the encoder sees one consistent dtype.
        merged[col] = merged[col].astype(str)
        harvard[col] = harvard[col].astype(str)

    cols_for_preprocessing = numerical_features + categorical_features
    if not cols_for_preprocessing:
        # Nothing to compare on: return explicit zero-width matrices rather
        # than fitting an empty transformer.
        return np.zeros((len(merged), 0)), np.zeros((len(harvard), 0))

    transformers = []
    if numerical_features:
        transformers.append(('num', StandardScaler(), numerical_features))
    if categorical_features:
        transformers.append((
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_features,
        ))

    preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
    merged_processed = preprocessor.fit_transform(merged[cols_for_preprocessing])
    harvard_processed = preprocessor.transform(harvard[cols_for_preprocessing])
    return merged_processed, harvard_processed

def similarity(merged_processed, harvard_processed):
    """Convert pairwise Euclidean distances into similarity scores.

    Args:
        merged_processed: Feature matrix with one row per patient.
        harvard_processed: Feature matrix with one row per cell line.

    Returns:
        A matrix whose entry ``[i, j]`` is ``1 / (1 + d + 1e-9)`` with ``d``
        the Euclidean distance between patient ``i`` and cell line ``j``.
        An empty 1-D array is returned when either input has no rows or no
        columns, or when the two inputs differ in column count.
    """
    inputs = (merged_processed, harvard_processed)

    # Lazily checks row counts first, then column counts, for both inputs.
    has_empty_dimension = any(
        matrix.shape[axis] == 0 for axis in (0, 1) for matrix in inputs
    )
    if has_empty_dimension:
        return np.array([])

    patient_width = merged_processed.shape[1]
    cell_line_width = harvard_processed.shape[1]
    if patient_width != cell_line_width:
        return np.array([])

    pairwise = euclidean_distances(merged_processed, harvard_processed)
    denominator = 1 + pairwise + 1e-9
    return 1 / denominator

def best_matches(similarity_matrix, merged_original_df, harvard_original_df, top_n=3):
    """List the ``top_n`` most similar cell lines for every patient.

    Row ``i`` of ``similarity_matrix`` is paired with the ``i``-th entry of
    ``merged_original_df['Patient ID']`` and column ``j`` with the ``j``-th row
    of ``harvard_original_df`` (positional, not label based).

    Args:
        similarity_matrix: 2-D array of patient-by-cell-line similarity scores.
        merged_original_df: Patient table; must contain ``Patient ID``.
        harvard_original_df: Cell-line table; must contain ``HMS LINCS ID``.
            ``Donor Age``, ``Donor Ethnicity`` and ``T Stage`` are reported
            when present and ``'N/A'`` otherwise.
        top_n: Maximum number of matches per patient.

    Returns:
        A list of ``{'patient_id': ..., 'matches': [...]}`` dictionaries with
        matches ordered from most to least similar; equal scores keep the
        cell-line order of ``harvard_original_df``. An empty list is returned
        when the matrix is empty, a required ID column is missing, or
        ``top_n`` is not positive.
    """
    if similarity_matrix.size == 0:
        return []
    if 'Patient ID' not in merged_original_df.columns:
        return []
    if 'HMS LINCS ID' not in harvard_original_df.columns:
        return []
    if top_n <= 0:
        # A negative count would otherwise turn the "last k" slice into
        # "everything but the first |k|" and return misleading matches.
        return []

    n_cell_lines = len(harvard_original_df)
    results = []

    # zip stops at the shorter input, so patients without a matrix row are skipped.
    for patient_id, scores in zip(merged_original_df['Patient ID'], similarity_matrix):
        k = min(top_n, len(scores))

        # Stable sort on the negated scores: descending similarity with ties
        # resolved deterministically in favour of the earlier cell line.
        top_indices = np.argsort(-scores, kind='stable')[:k]

        matches = []
        for idx in top_indices:
            if idx >= n_cell_lines:
                # Matrix has more columns than the table has rows.
                continue

            cell_line_info = harvard_original_df.iloc[idx]
            matches.append({
                'cell_line': cell_line_info.get('HMS LINCS ID', 'N/A'),
                'similarity_score': scores[idx],
                'age': cell_line_info.get('Donor Age', 'N/A'),
                'race': cell_line_info.get('Donor Ethnicity', 'N/A'),
                't_stage': cell_line_info.get('T Stage', 'N/A'),
            })

        results.append({'patient_id': patient_id, 'matches': matches})

    return results

def cellline_alignment(
    merged_path='final_merged.csv',
    harvard_path='HarvardCellLines.csv',
    output_path='cell_line_matches.csv',
    top_n=3,
):
    """Match every patient to its most similar cell lines and save the result.

    Reads the patient and cell-line CSV files, builds comparable feature
    matrices, scores every patient/cell-line pair and writes one CSV row per
    (patient, matched cell line) to ``output_path``.

    Args:
        merged_path: CSV with the patient columns ``Patient ID``, ``Age``,
            ``Race`` and ``T Stage `` (header with a trailing space).
        harvard_path: CSV with the cell-line columns ``HMS LINCS ID``,
            ``Donor Age``, ``Donor Ethnicity`` and ``T Stage``.
        output_path: Destination of the matches CSV.
        top_n: Maximum number of cell lines reported per patient.

    Returns:
        None. No file is written when no features or matches are available.

    Raises:
        KeyError: If one of the expected columns is missing from an input file.
    """
    merged_df_full = pd.read_csv(merged_path, low_memory=False)
    harvard_df_full = pd.read_csv(harvard_path, low_memory=False)

    merged_cols = ['Patient ID', 'Age', 'Race', 'T Stage ']
    harvard_cols = ['HMS LINCS ID', 'Donor Age', 'Donor Ethnicity', 'T Stage']

    merged_selected = merged_df_full[merged_cols]
    harvard_selected = harvard_df_full[harvard_cols]

    # preprocess works on its own copies, so the selections keep the original
    # values and can be reused below for reporting.
    merged_processed, harvard_processed = preprocess(merged_selected, harvard_selected)
    if merged_processed.size == 0 or harvard_processed.size == 0:
        return

    similarity_matrix = similarity(merged_processed, harvard_processed)
    results = best_matches(
        similarity_matrix, merged_selected, harvard_selected, top_n=top_n
    )
    if not results:
        return

    # Flatten the nested per-patient structure into one row per match.
    output_rows = [
        {
            'Patient_ID': result['patient_id'],
            'Cell_Line': match['cell_line'],
            'Similarity_Score': match['similarity_score'],
            'Cell_Line_Age': match['age'],
            'Cell_Line_Race': match['race'],
            'Cell_Line_T_Stage': match['t_stage'],
        }
        for result in results
        for match in result['matches']
    ]

    if output_rows:
        pd.DataFrame(output_rows).to_csv(output_path, index=False)

# Run the pipeline only when this file is executed directly (script or
# notebook cell), so importing it does not read or write any CSV files.
if __name__ == "__main__":
    cellline_alignment()