In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, mean_squared_error, roc_auc_score
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
import numpy as np
# Read the CSV file into a DataFrame
label_encoder = LabelEncoder()
Data = pd.read_csv(r"C:\Users\daddy\Downloads\full_df.csv\full_df.csv")
# Function to remove characters ' or [ or ] from a string
def remove_chars(s):
    if isinstance(s, str):
        return s.replace("'", "").replace("[", "").replace("]", "")
    else:
        return s
# Apply the remove_chars function to each element in the DataFrame
Data = Data.applymap(remove_chars)
# Rename the columns in the desired order
Data.rename(columns={
    'N': 'Normal',
    'D': 'Diabetes',
    'G': 'Glaucoma',
    'C': 'Cataract',
    'A': 'Age related Macular Degeneration',
    'H': 'Hypertension',
    'M': 'Pathological Myopia',
    'O': 'Other diseases/abnormalities'
}, inplace=True)
Data = Data.drop(['Right-Fundus'], axis=1)
Data = Data.drop(['Left-Fundus'], axis=1)
Data = Data.drop(['filepath'], axis=1)
Data = Data.drop(['target'], axis=1)
Data = Data.drop(['filename'], axis=1)
Data = Data.drop(['labels'], axis=1)
Data = Data.drop(['ID'], axis=1)
Data = pd.DataFrame(Data)
# Function to create the interactions between diagnoses
def create_diagnosis_interactions(row):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    common_diagnoses = left_diagnoses.intersection(right_diagnoses)
    return '|'.join(common_diagnoses)

# Apply the function to create the new feature
Data['Diagnosis_Interactions'] = Data.apply(create_diagnosis_interactions, axis=1)
# Function to count common and unique keywords
def count_common_unique_keywords(row):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    common_keywords = left_diagnoses.intersection(right_diagnoses)
    unique_left_keywords = left_diagnoses.difference(right_diagnoses)
    unique_right_keywords = right_diagnoses.difference(left_diagnoses)
    return len(common_keywords), len(unique_left_keywords), len(unique_right_keywords)

# Apply the function to create the new features
Data['Common_Keywords'], Data['Unique_Left_Keywords'], Data['Unique_Right_Keywords'] = zip(*Data.apply(count_common_unique_keywords, axis=1))
def create_age_group(age):
    if age < 30:
        return 'Young'
    elif age < 60:
        return 'Middle-aged'
    else:
        return 'Senior'

# Apply the function to create the new feature
Data['Age_Group'] = Data['Patient Age'].apply(create_age_group)

# Encode the new feature as numeric using LabelEncoder
Data['Age_Group'] = label_encoder.fit_transform(Data['Age_Group'])
# Function to count total number of diagnoses
def count_total_diagnoses(row):
    left_diagnoses = row['Left-Diagnostic Keywords'].count('，') + 1
    right_diagnoses = row['Right-Diagnostic Keywords'].count('，') + 1
    return left_diagnoses + right_diagnoses

# Apply the function to create the new feature
Data['Total_Diagnoses'] = Data.apply(count_total_diagnoses, axis=1)
# Function to calculate diagnosis frequency
def calculate_diagnosis_frequency(row, diagnosis):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    return list(left_diagnoses).count(diagnosis) + list(right_diagnoses).count(diagnosis)

# List of unique diagnoses in the dataset
left_diagnoses_list = Data['Left-Diagnostic Keywords'].str.split('，').tolist()
right_diagnoses_list = Data['Right-Diagnostic Keywords'].str.split('，').tolist()

unique_diagnoses = list(set([item for sublist in left_diagnoses_list for item in sublist] +
                           [item for sublist in right_diagnoses_list for item in sublist]))

# Create new features for diagnosis frequency
for diagnosis in unique_diagnoses:
    Data[f'Frequency_{diagnosis}'] = Data.apply(lambda row: calculate_diagnosis_frequency(row, diagnosis), axis=1)
# Function to calculate eye keywords overlap
def calculate_eye_overlap(row):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    return len(left_diagnoses.intersection(right_diagnoses))

# Apply the function to create the new feature for eye keyword overlapping
Data['Eye_Overlap'] = Data.apply(calculate_eye_overlap, axis=1)
# Function to calculate patient keywords ratio feature
def calculate_patient_keywords_ratio(row):
    left_diagnoses = row['Left-Diagnostic Keywords'].split('，')
    right_diagnoses = row['Right-Diagnostic Keywords'].split('，')
    return len(left_diagnoses) / len(right_diagnoses) if len(right_diagnoses) != 0 else len(left_diagnoses)

# Apply the function to create the new feature
Data['Keywords_Ratio'] = Data.apply(calculate_patient_keywords_ratio, axis=1)
# Create Blood Pressure Keywords as a feature
def has_hypertension_keyword(row):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    hypertension_keywords = ['Hypertension', 'Pathological Myopia']
    return int(any(keyword in left_diagnoses or keyword in right_diagnoses for keyword in hypertension_keywords))

Data['Blood_Pressure_Keywords'] = Data.apply(has_hypertension_keyword, axis=1)
# Age-Related Keywords feature
def has_age_related_keyword(row):
    left_diagnoses = set(row['Left-Diagnostic Keywords'].split('，'))
    right_diagnoses = set(row['Right-Diagnostic Keywords'].split('，'))
    return int('Age related Macular Degeneration' in left_diagnoses or 'Age related Macular Degeneration' in right_diagnoses)
#Utilize the count of total diseases per patient as a feature
def count_total_diseases(row):
    left_diagnoses = row['Left-Diagnostic Keywords'].split('，')
    right_diagnoses = row['Right-Diagnostic Keywords'].split('，')
    all_diagnoses = left_diagnoses + right_diagnoses
    return len(set(all_diagnoses))
# Apply the function to create the new feature 'Total_Diseases'
Data['Total_Diseases'] = Data.apply(count_total_diseases, axis=1)
Data['Age_Related_Keywords'] = Data.apply(has_age_related_keyword, axis=1)
# Diabetes and Glaucoma Interaction feature
Data['Diabetes_Glaucoma_Interaction'] = Data['Diabetes'] & Data['Glaucoma']
Data

Unnamed: 0,Patient Age,Patient Sex,Left-Diagnostic Keywords,Right-Diagnostic Keywords,Normal,Diabetes,Glaucoma,Cataract,Age related Macular Degeneration,Hypertension,...,Frequency_silicone oil eye,Frequency_old chorioretinopathy,Frequency_choroidal nevus,Frequency_tessellated fundus,Eye_Overlap,Keywords_Ratio,Blood_Pressure_Keywords,Total_Diseases,Age_Related_Keywords,Diabetes_Glaucoma_Interaction
0,69,Female,cataract,normal fundus,0,0,0,1,0,0,...,0,0,0,0,0,1.0,0,2,0,0
1,57,Male,normal fundus,normal fundus,1,0,0,0,0,0,...,0,0,0,0,1,1.0,0,1,0,0
2,42,Male,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,...,0,0,0,0,1,2.0,0,2,0,0
3,53,Male,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,...,0,0,0,0,0,1.0,0,2,0,0
4,50,Female,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,...,0,0,0,0,1,1.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,63,Male,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,...,0,0,0,0,0,1.0,0,2,0,0
6388,42,Male,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,...,0,0,0,0,1,1.0,0,1,0,0
6389,54,Male,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,...,0,0,0,0,0,1.0,0,2,0,0
6390,57,Male,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,...,0,0,0,0,1,1.0,0,1,0,0


In [2]:
Data.columns

Index(['Patient Age', 'Patient Sex', 'Left-Diagnostic Keywords',
       'Right-Diagnostic Keywords', 'Normal', 'Diabetes', 'Glaucoma',
       'Cataract', 'Age related Macular Degeneration', 'Hypertension',
       ...
       'Frequency_silicone oil eye', 'Frequency_old chorioretinopathy',
       'Frequency_choroidal nevus', 'Frequency_tessellated fundus',
       'Eye_Overlap', 'Keywords_Ratio', 'Blood_Pressure_Keywords',
       'Total_Diseases', 'Age_Related_Keywords',
       'Diabetes_Glaucoma_Interaction'],
      dtype='object', length=120)

In [3]:
# Convert desired columns to numeric values
Data['Diagnosis_Interactions'] = label_encoder.fit_transform(Data['Diagnosis_Interactions'])
Data['Patient Sex'] = label_encoder.fit_transform(Data['Patient Sex'])
Data['Left-Diagnostic Keywords'] = label_encoder.fit_transform(Data['Left-Diagnostic Keywords'])
Data['Right-Diagnostic Keywords'] = label_encoder.fit_transform(Data['Right-Diagnostic Keywords'])
# Calculate the correlation between 'Patient Age' and 'Patient Sex'
Data['Age_Sex_Correlation'] = Data['Patient Age'].corr(Data['Patient Sex'])
# Convert all columns to integers
Data = Data.astype(int)
# Update the features (X) and target (y)
X = Data[['Patient Age', 'Left-Diagnostic Keywords', 'Right-Diagnostic Keywords', 'Normal', 'Diabetes', 'Glaucoma', 
          'Cataract', 'Age related Macular Degeneration', 'Hypertension', 'Pathological Myopia', 'Other diseases/abnormalities', 
          'Diagnosis_Interactions', 'Common_Keywords', 'Unique_Left_Keywords', 'Unique_Right_Keywords', 'Age_Group', 'Age_Sex_Correlation','Total_Diseases', 'Blood_Pressure_Keywords',
         'Diabetes_Glaucoma_Interaction']
        + [f'Frequency_{diagnosis}' for diagnosis in unique_diagnoses]]
y = Data['Patient Sex']  # Target variable is 'Patient Sex' (0 for male, 1 for female)
# Calculate probabilities of each class (0 for male, 1 for female)
clf = RandomForestClassifier(n_estimators=100, random_state=300)
probs = clf.fit(X, y).predict_proba(X)

# Add probabilities to X as new columns as new features
X_with_probs = np.hstack((X, probs))
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_with_probs)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=300)
# Apply SMOTEENN to balance the class distribution in the training set to achieve better results
smote_enn = SMOTEENN(random_state=100)
X_train_balanced, y_train_balanced = smote_enn.fit_resample(X_train, y_train)
# Train the classifier on the balanced training set
clf = RandomForestClassifier(n_estimators=100, random_state=300, class_weight='balanced')
clf.fit(X_train_balanced, y_train_balanced)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Calculate accuracy and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
# Cross-Validation
cv_scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='roc_auc')
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
#Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
# Calculate Area Under the Receiver Operating Characteristic Curve (AUC)
auc = roc_auc_score(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Area Under the ROC Curve (AUC):", auc)
print("Cross-Validation Scores (AUC):", cv_scores)

Accuracy: 0.8076622361219703
Confusion Matrix:
 [[483 122]
 [124 550]]
Mean Absolute Error (MAE): 0.1923377638780297
Mean Squared Error (MSE): 0.1923377638780297
Area Under the ROC Curve (AUC): 0.80718542315521
Cross-Validation Scores (AUC): [0.97561995 0.75741724 0.94110082 0.89895004 0.80327298]


In [4]:
Data

Unnamed: 0,Patient Age,Patient Sex,Left-Diagnostic Keywords,Right-Diagnostic Keywords,Normal,Diabetes,Glaucoma,Cataract,Age related Macular Degeneration,Hypertension,...,Frequency_old chorioretinopathy,Frequency_choroidal nevus,Frequency_tessellated fundus,Eye_Overlap,Keywords_Ratio,Blood_Pressure_Keywords,Total_Diseases,Age_Related_Keywords,Diabetes_Glaucoma_Interaction,Age_Sex_Correlation
0,69,0,4,151,0,0,0,1,0,0,...,0,0,0,0,1,0,2,0,0,0
1,57,1,146,151,1,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
2,42,1,76,110,0,1,0,0,0,0,...,0,0,0,1,2,0,2,0,0,0
3,53,1,94,102,0,1,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0
4,50,0,119,110,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,63,1,166,170,0,1,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0
6388,42,1,119,110,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
6389,54,1,107,151,0,1,0,0,0,0,...,0,0,0,0,1,0,2,0,0,0
6390,57,1,107,102,0,1,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
