In [1]:
import pandas as pd
import numpy as np

In [2]:
# filter_df
# Read the first rows of the tab-separated KIDPAN extract, keep the columns
# used downstream, drop incomplete records and write the subset to CSV.
file_path = 'KIDPAN_DATA.DAT'
separator = '\t'  

# NOTE(review): the header row is inferred from the first line of the file.
# The auto-generated column name 'Unnamed: 433' suggests that line may really
# be data with a blank field - confirm whether header=None is intended.
df1 = pd.read_csv(file_path, sep=separator, on_bad_lines='skip', nrows=1000)

# Bound the positional row list by the number of rows actually parsed; a fixed
# range(1000) raises IndexError in .iloc when fewer than 1000 rows were read.
rows = list(range(min(1000, len(df1))))
cols = [433, 295, 25, 26, 176, 201, 199, 4, 168, 169, 170, 171, 308] 

selected_data1 = df1.iloc[rows, cols]

# Key column for dropna = the first selected column (position 433 in the raw
# file). Taking the name from the frame avoids depending on the pandas
# auto-generated label, which only exists when that header cell is blank.
column_name = selected_data1.columns[0]

filtered_df = selected_data1.dropna(subset=[column_name])

filtered_df.columns = ['TRR_ID_CODE', 'age', 'gender', 'ABO', 'age_don', 'gender_don', 'ABO_don', 'PRA', 'AMIS', 'BMIS', 'DRMIS', 'HLAMIS', 'GSTATUS_KI']

# Drop rows whose graft status is the '.' placeholder. The explicit copy makes
# filtered_df an independent frame, so later cells that assign into it are not
# writing to a slice of another DataFrame.
filtered_df = filtered_df[filtered_df['GSTATUS_KI'] != '.'].copy()

filtered_df.to_csv('filtered_df.csv', index=False)

In [3]:
import random
import numpy as np
from sklearn.cluster import KMeans
from src.lfgp import LFGP
from collections import defaultdict
import warnings
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas/NumPy diagnostics raised
# by later cells; consider narrowing it to the specific warnings to suppress.
warnings.filterwarnings('ignore')

In [70]:
# mismatch_result
# Work on an independent copy: assigning the converted columns through a plain
# alias would silently rewrite filtered_df, the output of an earlier cell.
data = filtered_df.copy()
# Convert the indicator columns from object (string) to numeric; values that
# cannot be parsed become NaN (errors='coerce'), so the result may be float.
mismatch_cols = ['PRA', 'AMIS', 'BMIS', 'DRMIS', 'HLAMIS']
data[mismatch_cols] = data[mismatch_cols].apply(pd.to_numeric, errors='coerce')

# Per-column 33% and 66% quantiles, computed on the numeric values
quantiles = data[mismatch_cols].quantile([0.33, 0.66])

# Function to classify results based on quantiles for PRA differently
def classify_pra(value, high):
    """Binarise a PRA value against the upper threshold.

    Returns 0 when ``value`` is strictly greater than ``high`` and 1 in every
    other case. A NaN ``value`` compares False and therefore also yields 1.
    """
    return 0 if value > high else 1

# Function to classify results based on quantiles for other indicators
def classify_result(value, low):
    """Binarise a non-PRA indicator value against the lower threshold.

    Returns 1 when ``value`` is strictly greater than ``low`` and 0 in every
    other case. A NaN ``value`` compares False and therefore also yields 0.

    NOTE(review): the earlier inline comments described the opposite mapping
    ("low value classified as 1"). The code's behaviour is kept as-is here;
    confirm which direction is intended.
    """
    return 1 if value > low else 0

# Numeric index assigned to each indicator name
indicator_mapping = {'PRA': 1, 'AMIS': 2, 'BMIS': 3, 'DRMIS': 4, 'HLAMIS': 5}

# Build one long-format frame per indicator: (TRR_ID_CODE, indicator, 0/1 result)
results = []
for col in mismatch_cols:
    low = quantiles.loc[0.33, col]
    high = quantiles.loc[0.66, col]
    # PRA is thresholded on the upper quantile, every other column on the lower one
    if col == 'PRA':
        classifier, threshold = classify_pra, high
    else:
        classifier, threshold = classify_result, low
    indicator_frame = data[['TRR_ID_CODE']].assign(
        Mismatch_Indicator=col,
        Transplantation_Result=data[col].apply(classifier, args=(threshold,)),
    )
    results.append(indicator_frame)

# Stack the per-indicator frames and attach the numeric indicator index
final_data = pd.concat(results, ignore_index=True)
final_data = final_data.assign(
    Mismatch_Index=final_data['Mismatch_Indicator'].map(indicator_mapping)
)
mismatch_results = final_data[['TRR_ID_CODE', 'Mismatch_Index', 'Transplantation_Result']]
kidney_transplant = filtered_df[['TRR_ID_CODE', 'GSTATUS_KI']]

In [72]:
def modify_first_column(df):
    """Return a copy of ``df`` whose first column has 'A'-prefixed IDs made numeric.

    Every string in column 0 that starts with 'A' is replaced by the integer
    parsed from the characters after the prefix; all other values are kept
    unchanged. The input frame is left untouched, so the function can safely
    be given column selections of other frames and re-run any number of times.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least one column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column labels as ``df``.

    Raises
    ------
    ValueError
        If an 'A'-prefixed string is not followed by a valid integer literal.
    """
    def _strip_prefix(x):
        if isinstance(x, str) and x.startswith('A'):
            return int(x[1:])
        return x

    out = df.copy()
    # isetitem swaps in a new column array instead of writing into the existing
    # one, so the converted values are not forced into the old column's dtype.
    out.isetitem(0, out.iloc[:, 0].apply(_strip_prefix))
    return out

# Modify the first columns
mismatch_results_modified = modify_first_column(mismatch_results)
kidney_transplant_modified = modify_first_column(kidney_transplant)

# Build ONE id -> dense index mapping from the IDs present in either frame.
# Two independently built mappings only agree while both frames hold exactly
# the same ID set; a shared mapping guarantees that a given index refers to
# the same ID in both output files.
all_ids = set(mismatch_results_modified.iloc[:, 0].unique()) | set(kidney_transplant_modified.iloc[:, 0].unique())
task_index = {val: idx for idx, val in enumerate(sorted(all_ids))}
# Original names kept as aliases of the shared mapping
mismatch_mapping = task_index
kidney_mapping = task_index

# Apply the mapping to the first columns
mismatch_results_final = mismatch_results_modified.copy()
kidney_transplant_final = kidney_transplant_modified.copy()

mismatch_results_final.iloc[:, 0] = mismatch_results_modified.iloc[:, 0].map(mismatch_mapping)
kidney_transplant_final.iloc[:, 0] = kidney_transplant_modified.iloc[:, 0].map(kidney_mapping)

# Save the final modified files (comma-separated, no header, no index)
mismatch_results_final.to_csv("mismatch_results_final.txt", header=False, index=False)
kidney_transplant_final.to_csv("kidney_transplant_final.txt", header=False, index=False)

In [73]:
# Reload the two comma-separated files written by the previous cell
rating = np.loadtxt("mismatch_results_final.txt", delimiter=',')
label = np.loadtxt('kidney_transplant_final.txt', delimiter=',')

# Column 0 of rating is reported as tasks, column 1 as workers
n_tasks = len(np.unique(rating[:, 0]))
n_workers = len(np.unique(rating[:, 1]))
n_crowd_labels = len(rating[:, 0])

print(f"number of tasks: {n_tasks}")
print(f"number of workers: {n_workers}")
print(f"number of crowd labels: {n_crowd_labels}")

number of tasks: 544
number of workers: 5
number of crowd labels: 2720


In [80]:
# Build the LFGP model and run its pre-screening step on the rating array.
# NOTE(review): LFGP is imported from src.lfgp and is not visible here; the
# meaning of lf_dim / n_worker_group / lambda1 / lambda2 must be checked there.
# All three calls below use underscore-prefixed (private-looking) methods.
model = LFGP(lf_dim=5, n_worker_group=2, lambda1=0.1, lambda2=0.1)
model._prescreen(rating)


# NOTE(review): the global NumPy seed is set only after the constructor and
# _prescreen have run; if either draws random numbers the run is not fully
# reproducible - confirm before moving the seed, since that may change results.
np.random.seed(0) # for the purpose of reproducibility, fix the seed for initialization
model._mc_fit(rating, epsilon=1e-4, scheme="mv", maxiter=80, verbose=0)
pred_label = model._mc_infer(rating)

# Mean of the element-wise equality between column 1 of label and column 1 of
# pred_label, i.e. the share of rows that match.
# NOTE(review): the comparison is positional; it assumes pred_label rows come
# back in the same order as the rows of label - verify against _mc_infer.
print(np.mean(np.equal(label[:, 1], pred_label[:, 1])))

0.5919117647058824
