In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np


# --- Configuration ---------------------------------------------------------
# Feature columns present in both tables; similarity is measured on these only.
COLS = ['Race', 'Age', 'T Stage']
Patient_ID = 'Patient ID'
Cell_Line_ID = 'Cell Line ID'
output_file = 'raw_mapping.csv'

# --- Load data -------------------------------------------------------------
merged_df = pd.read_csv('final_merged.csv')
cell_lines_df = pd.read_csv('HarvardCellLines.csv')

# Strip stray whitespace from header names before any column lookup.
merged_df.columns = merged_df.columns.str.strip()
cell_lines_df.columns = cell_lines_df.columns.str.strip()

required_patient_cols = COLS + [Patient_ID]
required_cell_line_cols = COLS + [Cell_Line_ID]

# Fail early, and with a readable message, when an input table lacks a column
# this cell depends on (previously these lists were built but never checked).
missing_patient_cols = [col for col in required_patient_cols if col not in merged_df.columns]
missing_cell_line_cols = [col for col in required_cell_line_cols if col not in cell_lines_df.columns]
if missing_patient_cols:
    raise KeyError(f"final_merged.csv is missing required column(s): {missing_patient_cols}")
if missing_cell_line_cols:
    raise KeyError(f"HarvardCellLines.csv is missing required column(s): {missing_cell_line_cols}")

# --- Build feature frames --------------------------------------------------
# Copies, so the Age coercion below does not modify the loaded tables.
patients_features = merged_df[COLS].copy()
cell_lines_features = cell_lines_df[COLS].copy()

categorical_features = ['Race', 'T Stage']
numerical_features = ['Age']

# Non-numeric ages become NaN here and are filled by the median imputer below.
patients_features['Age'] = pd.to_numeric(patients_features['Age'], errors='coerce')
cell_lines_features['Age'] = pd.to_numeric(cell_lines_features['Age'], errors='coerce')

# --- Preprocessing ---------------------------------------------------------
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# handle_unknown='ignore': categories seen only among patients encode as all-zero
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# The preprocessor is fitted on the cell lines only; patients are transformed
# with those fitted statistics so both live in the same feature space.
cell_lines_processed = preprocessor.fit_transform(cell_lines_features)
patients_processed = preprocessor.transform(patients_features)

# --- Nearest cell line per patient -----------------------------------------
nn_model = NearestNeighbors(n_neighbors=1, metric='euclidean', algorithm='auto')
nn_model.fit(cell_lines_processed)
distances, indices = nn_model.kneighbors(patients_processed)

nearest_indices = indices.flatten()
nearest_distances = distances.flatten()

# kneighbors reports row positions within the fitted matrix, so the lookup must
# be positional (.iloc); label-based .loc is only correct for a default RangeIndex.
matched_cell_line_ids = cell_lines_df[Cell_Line_ID].iloc[nearest_indices].to_numpy()

# --- Save the raw mapping --------------------------------------------------
results_df = pd.DataFrame({
    Patient_ID: merged_df[Patient_ID].to_numpy(),
    'Matched ' + Cell_Line_ID: matched_cell_line_ids,
    'Similarity Distance': nearest_distances
})
results_df.to_csv(output_file, index=False)


In [2]:
def fliter_mapping(input_csv_path, output_csv_path=None):
    """Reduce a patient-to-cell-line mapping to the closest match per patient.

    The (misspelled) function name is kept as-is because callers use it.

    Parameters
    ----------
    input_csv_path : str, path-like or file-like
        Forwarded to ``pandas.read_csv``. The table must contain the columns
        'Patient ID' and 'Similarity Distance'. 'Matched Cell Line ID' is
        optional; when present it is moved next to the patient ID.
    output_csv_path : str or path-like, optional
        When truthy, the filtered table is also written there (no index).

    Returns
    -------
    pandas.DataFrame
        One row per patient ID, ordered by patient ID. Each row is the
        complete input row with the smallest numeric 'Similarity Distance'
        for that patient. Rows whose distance is not numeric, or whose
        patient ID is missing, are discarded.

    Raises
    ------
    KeyError
        If 'Patient ID' or 'Similarity Distance' is absent from the input.
    """
    patient_col = 'Patient ID'
    match_col = 'Matched Cell Line ID'
    distance_col = 'Similarity Distance'

    df = pd.read_csv(input_csv_path)

    missing = [col for col in (patient_col, distance_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Input mapping is missing required column(s): {missing}")

    # Unparseable distances become NaN and are removed together with rows that
    # have no patient ID (those could never be grouped anyway).
    df[distance_col] = pd.to_numeric(df[distance_col], errors='coerce')
    cleaned = df.dropna(subset=[patient_col, distance_col])

    # A smaller distance means a closer match, so sort ascending and keep the
    # first row per patient. drop_duplicates keeps whole rows, whereas
    # groupby().first() takes the first non-null value column by column and
    # can pair a distance with a cell line from a different row.
    best = (
        cleaned
        .sort_values(by=[patient_col, distance_col], ascending=[True, True])
        .drop_duplicates(subset=patient_col, keep='first')
        .reset_index(drop=True)
    )

    # Key columns first, everything else in its original order.
    if match_col in best.columns:
        leading = [patient_col, match_col, distance_col]
    else:
        leading = [patient_col]
    trailing = [col for col in best.columns if col not in leading]
    best = best[leading + trailing]

    if output_csv_path:
        best.to_csv(output_csv_path, index=False)

    return best


In [3]:
# Post-process the raw nearest-neighbour mapping: read 'raw_mapping.csv',
# reduce it with fliter_mapping, and write the result to 'final_mapping.csv'.
input_file, output_file = 'raw_mapping.csv', 'final_mapping.csv'

filtered_data = fliter_mapping(
    input_csv_path=input_file,
    output_csv_path=output_file,
)