In [2]:
import pandas as pd
from sklearn.metrics import roc_auc_score, confusion_matrix

def _label_missing_proteins(columns: pd.Index) -> list:
    """Return the column labels with unnamed proteins replaced by a placeholder."""
    placeholder = 'Protein_name_is_missing'
    filled = columns.fillna(placeholder)
    # Labels of the form "nan.<j>" (j below the column count) are treated as
    # missing names too. A set lookup replaces the former O(n^2) double loop.
    mangled = {'nan.%s' % j for j in range(filled.shape[0])}
    return [placeholder if label in mangled else label for label in filled]


def _zscore_group_values(column: pd.Series, in_group: pd.Series, remove_outliers: bool) -> list:
    """Return one group's unique, non-missing measurements as a list.

    With ``remove_outliers`` the values lying more than 2 standard deviations
    from the group mean are discarded as well.
    """
    # Missing measurements are dropped because roc_auc_score rejects NaN input.
    # NOTE(review): drop_duplicates collapses repeated measurement values
    # within the group before scoring; kept because existing results depend on it.
    values = column[in_group].dropna().drop_duplicates()
    if remove_outliers:
        z_scores = (values - values.mean()) / values.std()
        values = values[z_scores.abs() <= 2]
    return values.tolist()


def AUC_Proteins(excel_file, remove_outliers: bool) -> pd.DataFrame:
    """Score each protein column of an Excel sheet as a case/control marker.

    The sheet is read with ``pandas.read_excel``. It must contain a ``Group``
    column in which 1 marks cases and 0 marks controls. Every column from
    position 1 onwards (except ``Group`` itself) is scored independently.

    For each protein the function reports:

    * the ROC AUC, folded to be >= 0.5 (``1 - auc`` when the raw value is
      below 0.5, i.e. when cases tend to have the lower values);
    * sensitivity and specificity at the mean of the pooled measurements.
      The cut-off is applied in the same direction as the reported AUC:
      "value >= mean" predicts a case normally, "value < mean" predicts a
      case when the AUC had to be folded.

    Parameters
    ----------
    excel_file
        Path or file-like object, passed unchanged to ``pandas.read_excel``.
    remove_outliers : bool
        When true, values more than 2 standard deviations from their own
        group's mean are removed before scoring.

    Returns
    -------
    pd.DataFrame
        One row per protein with the columns ``Protein``, ``AUC value``,
        ``Sensitivity`` and ``Specificity``. A protein left without cases or
        without controls after filtering gets NaN in all three metrics.

    Raises
    ------
    KeyError
        If the sheet has no ``Group`` column.
    """
    data = pd.read_excel(excel_file)

    # Assign a fresh label list instead of mutating Index.values in place.
    data.columns = _label_missing_proteins(data.columns)

    if 'Group' not in data.columns:
        raise KeyError(
            "'Group' column not found in %r; available columns: %s"
            % (excel_file, list(data.columns))
        )

    # Boolean masks select rows by alignment, so they stay correct for any
    # row index (the positional-by-label lookup needed a default RangeIndex).
    is_case = data['Group'] == 1
    is_control = data['Group'] == 0

    nan = float('nan')
    results = []

    # Columns are addressed by position because several proteins may share the
    # 'Protein_name_is_missing' placeholder label.
    for i in range(1, data.shape[1]):
        protein = data.columns[i]
        if protein == 'Group':
            # The label column is not a measurement; never score it against itself.
            continue

        column = data.iloc[:, i]
        cases = _zscore_group_values(column, is_case, remove_outliers)
        controls = _zscore_group_values(column, is_control, remove_outliers)

        if not cases or not controls:
            # roc_auc_score needs both classes; report NaN for this protein
            # rather than aborting the whole table.
            results.append((protein, nan, nan, nan))
            continue

        labels = [1] * len(cases) + [0] * len(controls)
        variables = cases + controls

        auc_roc = roc_auc_score(labels, variables)
        cases_score_lower = auc_roc < 0.5
        if cases_score_lower:
            auc_roc = 1 - auc_roc

        # Classify at the pooled mean, in the direction the reported AUC implies.
        threshold = sum(variables) / len(variables)
        if cases_score_lower:
            predictions = [1 if val < threshold else 0 for val in variables]
        else:
            predictions = [1 if val >= threshold else 0 for val in variables]

        # labels=[0, 1] pins the 2x2 layout so the 4-way unpack is always valid.
        tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)

        results.append((protein, auc_roc, sensitivity, specificity))

    return pd.DataFrame(results, columns=['Protein', 'AUC value', 'Sensitivity', 'Specificity'])



In [4]:
# Example usage: score the proteins in ROCSampleexcel.xlsx with outlier removal
# enabled and print the resulting table.
excel_file_path ='ROCSampleexcel.xlsx'
remove_outliers_flag = True  # Set this according to your requirement

# AUC_Proteins looks up a 'Group' column in the workbook; the KeyError recorded
# in this cell's output is what that lookup raises when the column is absent.
result_df = AUC_Proteins(excel_file_path, remove_outliers_flag)
print(result_df)

KeyError: 'Group'

In [None]:
# Score the proteins in PC-RocAuc.xlsx with outlier removal disabled; the bare
# `x` below displays the returned table as the cell output.
excel_file='PC-RocAuc.xlsx'
x=AUC_Proteins(excel_file,remove_outliers=False)
x
# Optional CSV export of the table (left disabled).
#x.to_csv('PCAuc_values.csv', index=False)


In [2]:
def _fill_missing_protein_names(columns):
    """Return the column labels with unnamed proteins replaced by a placeholder."""
    placeholder = 'Protein_name_is_missing'
    filled = columns.fillna(placeholder)
    # Labels of the form "nan.<j>" (j below the column count) are treated as
    # missing names too. A set lookup replaces the former O(n^2) double loop.
    mangled = {'nan.%s' % j for j in range(filled.shape[0])}
    return [placeholder if label in mangled else label for label in filled]


def _iqr_inliers(values):
    """Keep the entries of a Series lying within 1.5 * IQR of its quartiles."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return values[(values >= lower_bound) & (values <= upper_bound)]


def AUC_Proteins(excel_file, remove_outliers: bool) -> "pd.DataFrame":
    """Compute a ROC AUC per protein column of an Excel sheet.

    The sheet is read with ``pandas.read_excel``. It must contain a ``Group``
    column in which 1 marks cases and 0 marks controls. Every column from
    position 1 onwards (except ``Group`` itself) is scored independently, and
    an AUC below 0.5 is reported as ``1 - auc``.

    Parameters
    ----------
    excel_file
        Path or file-like object, passed unchanged to ``pandas.read_excel``.
    remove_outliers : bool
        When true, values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` of their
        own group are removed before scoring.

    Returns
    -------
    pd.DataFrame
        One row per protein with the columns ``Protein`` and ``AUC value``.
        A protein left without cases or without controls after filtering gets
        NaN as its AUC.

    Raises
    ------
    KeyError
        If the sheet has no ``Group`` column.

    Notes
    -----
    The call changes global pandas display options (all rows/columns shown,
    16-decimal floats). They stay in effect after the function returns.
    """
    # Imported locally so the cell defining this function works on its own.
    import pandas as pd
    from sklearn.metrics import roc_auc_score

    # Global display settings, kept for backward compatibility: the tables
    # rendered after calling this function rely on the 16-decimal float format.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    # Full option key instead of the abbreviated 'float_format' pattern.
    pd.set_option('display.float_format', '{:.16f}'.format)

    data = pd.read_excel(excel_file)

    # Assign a fresh label list instead of mutating Index.values in place.
    data.columns = _fill_missing_protein_names(data.columns)

    if 'Group' not in data.columns:
        raise KeyError(
            "'Group' column not found in %r; available columns: %s"
            % (excel_file, list(data.columns))
        )

    # Boolean masks select rows by alignment, so they stay correct for any
    # row index (the positional-by-label lookup needed a default RangeIndex).
    is_case = data['Group'] == 1
    is_control = data['Group'] == 0

    proteins = []
    auc_values = []

    # Columns are addressed by position because several proteins may share the
    # 'Protein_name_is_missing' placeholder label.
    for i in range(1, data.shape[1]):
        protein = data.columns[i]
        if protein == 'Group':
            # The label column is not a measurement; never score it against itself.
            continue

        column = data.iloc[:, i]
        # Missing measurements are dropped because roc_auc_score rejects NaN.
        # NOTE(review): drop_duplicates collapses repeated measurement values
        # within a group before scoring; kept because existing results depend on it.
        cases = column[is_case].dropna().drop_duplicates()
        controls = column[is_control].dropna().drop_duplicates()
        if remove_outliers:
            cases = _iqr_inliers(cases)
            controls = _iqr_inliers(controls)
        cases = cases.tolist()
        controls = controls.tolist()

        proteins.append(protein)
        if not cases or not controls:
            # roc_auc_score needs both classes; report NaN for this protein
            # rather than aborting the whole table.
            auc_values.append(float('nan'))
            continue

        labels = [1] * len(cases) + [0] * len(controls)
        variables = cases + controls

        auc_roc = roc_auc_score(labels, variables)
        if auc_roc < 0.5:
            auc_roc = 1 - auc_roc
        auc_values.append(auc_roc)

    return pd.DataFrame(list(zip(proteins, auc_values)), columns=['Protein', 'AUC value'])

In [3]:
# Score the proteins in PC-RocAuc.xlsx with outlier removal disabled; the bare
# `x` below displays the returned table as the cell output.
excel_file='PC-RocAuc.xlsx'
x=AUC_Proteins(excel_file,remove_outliers=False)
x
# Optional CSV export of the table (left disabled).
#x.to_csv('PCAuc_values.csv', index=False)


Unnamed: 0,Protein,AUC value
0,NADPH-P450 Oxidoreductase,0.9583333333333331
1,Histone H2A.z,0.9444444444444444
2,IR,0.9305555555555556
3,Angiopoietin-1,0.9166666666666666
4,Elafin,0.9166666666666666
5,granzyme A,0.9166666666666666
6,TBP,0.9097222222222222
7,CATF,0.8750000000000001
8,Cystatin-S,0.875
9,"GAPDH, liver",0.8611111111111112


In [3]:
# Score the workbook again with outlier removal enabled and display the table.
# `excel_file` is not assigned in this cell, so a cell that sets it must have
# been run beforehand.
y=AUC_Proteins(excel_file,remove_outliers=True)
y

Unnamed: 0,Protein,AUC value
0,NADPH-P450 Oxidoreductase,0.9523809523809524
1,Histone H2A.z,0.9365079365079364
2,IR,1.0
3,Angiopoietin-1,0.9444444444444444
4,Elafin,0.9682539682539684
5,granzyme A,0.8928571428571428
6,TBP,0.8984375
7,CATF,0.8750000000000001
8,Cystatin-S,0.859375
9,"GAPDH, liver",0.9285714285714286
