In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Root folder that is joined with each class name below to locate the .npy embeddings.
MRI_DIR = "/kaggle/input/mri-embeddings/kaggle/working/MRI_Embeddings"

# Sub-directory names under MRI_DIR; each name is also stored as the sample label.
classes = [
    "ADNI1_T1w_Cohort_AD_Visit_12_MRI",
    "ADNI1_T1w_DXMCI=1_at_m18_MRI",
    "ADNI1_T1w_Normal_at_m12_MRI",
]

In [None]:
# Accumulator for one dict per loaded embedding file (filled by the loading loop).
records = list()

In [None]:
# Start from an empty list so that re-running this cell does not append
# a second copy of every record.
records = []

for cls in classes:
    cls_path = os.path.join(MRI_DIR, cls)
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the seeded train/test splits built from it) reproducible.
    for f in sorted(os.listdir(cls_path)):
        if f.endswith(".npy"):
            # The file name without its extension is used as the subject identifier.
            subj_id = os.path.splitext(f)[0]
            emb = np.load(os.path.join(cls_path, f))
            # The class directory name doubles as the label.
            records.append({"subject_id": subj_id, "label": cls, "embedding": emb})

In [None]:
# One row per loaded file, with columns taken from the record dict keys.
mri_df = pd.DataFrame(data=records)

In [None]:
# Display the assembled MRI DataFrame as this cell's output.
mri_df

In [None]:
# Plain-text preview of the first five rows.
print(mri_df.head(n=5))

In [None]:
# Flatten every stored embedding to 1-D and stack them into a 2-D feature matrix.
X_mri = np.stack(list(map(np.ndarray.flatten, mri_df['embedding'])))
# Labels and subject identifiers, row-aligned with X_mri.
y, subjects = mri_df['label'], mri_df['subject_id']


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Project the flattened MRI embeddings onto their first two principal components
# for a quick visual check of class separation.
# random_state pins the result when PCA selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_mri)

# Integer code per sample plus the matching class names, in order of first appearance.
label_codes, label_names = pd.factorize(y)

fig, ax = plt.subplots()
# One scatter call per class so that the legend maps colours to class names.
for code, name in enumerate(label_names):
    mask = label_codes == code
    ax.scatter(X_2d[mask, 0], X_2d[mask, 1], label=name)
ax.set_title("MRI Embedding")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.legend(title="Class")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Stratified 80/20 hold-out split of the MRI-only features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_mri,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# MRI-only baseline classifier: 600 trees, seeded.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=600,
)

In [None]:
# Train the MRI-only forest on the training split.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out MRI samples.
y_pred = rf.predict(X=X_test)

In [None]:
# Display the predicted labels for the held-out samples.
y_pred

In [None]:
# Accuracy of the MRI-only model on the held-out split.
print(" MRI", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Per-class precision / recall / F1 for the MRI-only model.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Load the genotype embeddings table from CSV.
genetic_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/genotype-embeddings/genotype_embeddings_64_EDdyg.csv"
)
# Plain-text preview of the first five rows.
print(genetic_df.head(n=5))

In [None]:
# Display the genotype-embedding DataFrame as this cell's output.
genetic_df

In [None]:
# Print the column index of each table, one per line, to compare join keys.
print(mri_df.columns, genetic_df.columns, sep="\n")

In [None]:
# Normalise the genetic column names: trim surrounding whitespace, then lower-case.
# Later cells refer to columns by lower-case names ('subject_id', 'g_emb_' prefix).
genetic_df.columns = (
    genetic_df.columns
    .str.strip()
    .str.lower()
)

In [None]:
# Inner-join (the merge default) the MRI metadata, without the embedding column,
# to the genetic table on subject_id.
# NOTE(review): a later cell strips an '_embedding' suffix from
# mri_df['subject_id'] before merging; if the ids still carry that suffix here,
# this join may match no rows. Confirm on a fresh kernel.
merged_df = mri_df.drop(columns=['embedding']).merge(genetic_df, on='subject_id')

In [None]:
# Display the merged MRI-metadata / genetic DataFrame as this cell's output.
merged_df

In [None]:
# Lookup from subject id to flattened MRI embedding; when an id repeats,
# the last row wins.
mri_emb_dict = dict(
    zip(mri_df['subject_id'], (emb.flatten() for emb in mri_df['embedding']))
)
# Accumulators for the per-subject MRI and genetic feature rows.
X_mri_fused, X_genetic = [], []

In [None]:
# Rebuild both accumulators from scratch so that re-running this cell does not
# append a second copy of every row.
X_mri_fused = []
X_genetic = []

# Index the genetic table by subject once instead of boolean-filtering the
# whole frame for every subject. set_index removes 'subject_id' from the
# columns, so the remaining columns match drop(columns=['subject_id']).
genetic_by_subject = genetic_df.set_index('subject_id')

for sid in merged_df['subject_id']:
    X_mri_fused.append(mri_emb_dict[sid])
    # .loc[[sid]] keeps the result two-dimensional (all rows for this subject);
    # squeeze() then collapses a single matching row to a 1-D feature vector.
    X_genetic.append(genetic_by_subject.loc[[sid]].values.squeeze())

In [None]:
# Convert the accumulated per-subject rows into NumPy arrays.
X_mri_fused, X_genetic = np.array(X_mri_fused), np.array(X_genetic)

In [None]:
# Display the array of MRI embeddings collected for the merged subjects.
X_mri_fused

In [None]:
# Column-wise concatenation: MRI features first, genetic features after.
X_fused = np.concatenate((X_mri_fused, X_genetic), axis=1)
# Labels taken from the merged table, row-aligned with X_fused.
y_fused = merged_df.loc[:, 'label']

In [None]:
import numpy as np
import pandas as pd

# Strip the literal '_embedding' suffix from the MRI subject ids before joining
# (presumably so they match the ids in genetic_df; confirm against the data).
# The replacement is a no-op on ids without the suffix, so re-running is safe.
mri_df['subject_id'] = mri_df['subject_id'].str.replace('_embedding', '', regex=False)
merged_df = pd.merge(mri_df, genetic_df, on='subject_id', how='inner')

# Fail early with a clear message instead of an opaque np.stack error.
if merged_df.empty:
    raise ValueError(
        "No subject_id values are shared between mri_df and genetic_df; "
        "cannot build fused features."
    )

# Flatten each embedding before stacking, as the MRI-only feature matrix does,
# so X_mri is always 2-D and can be concatenated column-wise.
X_mri = np.stack([np.asarray(emb).flatten() for emb in merged_df['embedding']])

# Genetic features are the columns whose name starts with 'g_emb_'.
genetic_columns = merged_df.columns.str.startswith('g_emb_')
if not genetic_columns.any():
    raise ValueError(
        "merged_df has no columns starting with 'g_emb_'; "
        "the fused matrix would contain MRI features only."
    )
X_genetic = merged_df.loc[:, genetic_columns].values


X_fused = np.concatenate([X_mri, X_genetic], axis=1)
y_fused = merged_df['label']


In [None]:
# Display the fused (MRI + genetic) feature matrix.
X_fused

In [None]:
# Display the label column taken from the merged table.
y_fused 

In [None]:
# Sanity check: the three feature-matrix shapes on one line, then the label
# count and the fused row count on separate lines (the last two should agree).
print(*(matrix.shape for matrix in (X_mri, X_genetic, X_fused)))
print(len(y_fused), len(X_fused), sep="\n")

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Fit the encoder on the fused labels, then map them to integer class ids.
label_encoder = LabelEncoder().fit(y_fused)
y_encoded = label_encoder.transform(y_fused)

In [None]:
# Stratified 80/20 hold-out split of the fused features with a fixed seed.
# This rebinds X_train / X_test / y_train / y_test from the MRI-only experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X_fused,
    y_encoded,
    stratify=y_encoded,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both the training and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Fused-feature classifier: 400 unbounded-depth trees, balanced class weights,
# seeded, with n_jobs=-1.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=None,
    n_estimators=400,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the fused-feature forest on the scaled training split.
rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict encoded class ids for the scaled held-out samples.
y_pred = rf_model.predict(X=X_test_scaled)

In [None]:
# Held-out accuracy, then the per-class report with the original label strings
# recovered from the encoder.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification\n",
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=label_encoder.classes_,
    ),
)
