In [196]:
import io
import os
import pkgutil
import sys

import numpy as np
import pandas as pd
from sksurv.linear_model import CoxPHSurvivalAnalysis


# State identifier typed by the user. It is interpolated into the dataset path
# and the result file names further down, so surrounding whitespace (which
# would silently produce a non-existent file name) is stripped.
state = input("Enter state: ").strip()

def read_seer_csv(csv_file, drop_cols=None):
    """Load a SEER CSV resource and remove unwanted columns.

    The file is fetched with ``pkgutil.get_data`` relative to the module named
    by ``__name__`` and parsed with ``pandas.read_csv``.

    Parameters
    ----------
    csv_file : str
        Resource path of the CSV file, relative to the current module/package.
    drop_cols : iterable of str, optional
        Column labels to remove. The caller's object is never modified.
        A ``'Patient ID'`` column, when present in the file, is always
        removed in addition to these.

    Returns
    -------
    pandas.DataFrame
        The parsed data without the dropped columns.

    Raises
    ------
    FileNotFoundError
        If ``pkgutil.get_data`` cannot provide the resource (returns ``None``).
    KeyError
        If a label in ``drop_cols`` is not a column of the file.
    """
    raw = pkgutil.get_data(__name__, csv_file)
    if raw is None:
        # Without this check the failure shows up as an opaque TypeError
        # inside io.BytesIO.
        raise FileNotFoundError(
            f"could not load resource {csv_file!r} for module {__name__!r}"
        )
    data = pd.read_csv(io.BytesIO(raw), low_memory=False)

    # Work on a private copy: appending to the caller's list would leak
    # 'Patient ID' into later calls that reuse the same list object.
    cols_to_drop = [] if drop_cols is None else list(drop_cols)
    if 'Patient ID' in data.columns and 'Patient ID' not in cols_to_drop:
        cols_to_drop.append('Patient ID')

    if cols_to_drop:
        data = data.drop(cols_to_drop, axis=1)

    return data

# Read the CSV for the chosen state; the 'County' and 'State' columns are
# discarded while loading.
seer_csv_name = f'datasets/ConditionalER_2_breast condition_{state}_modified.csv'
x_er = read_seer_csv(seer_csv_name, drop_cols=['County', 'State'])





# t-Test

In [None]:
import random

ds_collection_er = {}
ds_collection_er_2 = {}

# Each of the 30 samples holds roughly 1/200 of the rows of x_er.
sample_size = int((len(x_er.index) - 1) / 200)

# Draw the per-sample seeds from a seeded generator so a fresh kernel
# reproduces the same samples, and draw them WITHOUT replacement: 30
# independent randint(1, 500) calls frequently repeat a seed, which would
# put identical samples into the paired t-test.
seed_source = random.Random(42)
sample_seeds = seed_source.sample(range(1, 501), 30)

for i, sample_seed in enumerate(sample_seeds):
    ds_collection_er[i] = x_er.sample(n=sample_size, random_state=sample_seed)
    # Untouched copy of the sample, kept separately from the working frame.
    ds_collection_er_2[i] = ds_collection_er[i].copy()

ds_collection_er_2[0].info()

In [198]:
# Baseline feature frames: every sample with the 'ER_CondPr' and 'RepSrc'
# columns removed (drop returns a new frame, the sampled frames are untouched).
ds_collection = {
    sample_idx: ds_collection_er[sample_idx].drop(columns=['ER_CondPr', 'RepSrc'])
    for sample_idx in range(30)
}

In [199]:
# Build, for every baseline sample, the structured survival target
# (event flag + time) and the one-hot encoded feature matrix.
y_collection = {}
ds_one_h_collection = {}

for i in range(30):
    # Derive a new frame instead of mutating ds_collection[i] in place:
    # with in-place dropna/drop the 'death' and 'd.time' columns disappear
    # from ds_collection, and re-running this cell raises KeyError.
    complete_rows = ds_collection[i].dropna()

    # NOTE(review): any 'death' value other than 0/1 maps to NaN here —
    # confirm the column is strictly 0/1.
    col_event = complete_rows['death'].map({0: False, 1: True})
    col_time = complete_rows['d.time']

    # Structured array with the event indicator first and the time second.
    y_collection[i] = np.empty(dtype=[('col_event', bool),
                                      ('col_time', np.float64)],
                               shape=complete_rows.shape[0])
    y_collection[i]['col_event'] = col_event.values
    y_collection[i]['col_time'] = col_time.values

    # The target columns must not leak into the features.
    features = complete_rows.drop(['d.time', 'death'], axis=1)
    ds_one_h_collection[i] = pd.get_dummies(features)

In [None]:

# Spot check on two baseline samples: fit a Cox model (alpha=10) and print its
# score on the very rows it was fitted on.
features_1, target_1 = ds_one_h_collection[1], y_collection[1]
estimator1 = CoxPHSurvivalAnalysis(alpha=10).fit(features_1, target_1)
print(estimator1.score(features_1, target_1))

features_2, target_2 = ds_one_h_collection[2], y_collection[2]
estimator2 = CoxPHSurvivalAnalysis(alpha=10).fit(features_2, target_2)
print(estimator2.score(features_2, target_2))

In [201]:
# Fit one Cox model per baseline sample and record its score on the same
# sample it was fitted on.
estimator_collection = {}
score_collection = {}
for sample_idx in range(30):
    sample_features = ds_one_h_collection[sample_idx]
    sample_target = y_collection[sample_idx]
    fitted_model = CoxPHSurvivalAnalysis(alpha=10).fit(sample_features, sample_target)
    estimator_collection[sample_idx] = fitted_model
    score_collection[sample_idx] = fitted_model.score(sample_features, sample_target)


In [None]:
# Tabulate the baseline scores: one row per sample, keyed by the sample index.
df = pd.DataFrame(
    {
        'index': list(score_collection.keys()),
        'score': list(score_collection.values()),
    }
)
df.head()

# t-Test with StateESR (features keep ER_CondPr and RepSrc)

In [None]:
# Print the column summary of the preserved copy of sample 1, i.e. the copy
# that was taken right after sampling.
ds_collection_er_2[1].info()

In [204]:
# Build, for every sample that still carries 'ER_CondPr' and 'RepSrc', the
# structured survival target (event flag + time) and the one-hot encoded
# feature matrix.
y_collection_er = {}
ds_one_h_collection_er = {}
ds_one_h_collection_er_2 = {}
for i in range(30):
    # Derive a new frame instead of mutating ds_collection_er[i] in place:
    # with in-place dropna/drop the 'death' and 'd.time' columns disappear
    # from the sampled frames, so re-running this cell raises KeyError and
    # re-running the earlier baseline cells silently changes their input.
    complete_rows = ds_collection_er[i].dropna()

    # NOTE(review): any 'death' value other than 0/1 maps to NaN here —
    # confirm the column is strictly 0/1.
    col_event = complete_rows['death'].map({0: False, 1: True})
    col_time = complete_rows['d.time']

    # Structured array with the event indicator first and the time second.
    y_collection_er[i] = np.empty(dtype=[('col_event', bool),
                                         ('col_time', np.float64)],
                                  shape=complete_rows.shape[0])
    y_collection_er[i]['col_event'] = col_event.values
    y_collection_er[i]['col_time'] = col_time.values

    # The target columns must not leak into the features.
    features = complete_rows.drop(['d.time', 'death'], axis=1)
    ds_one_h_collection_er[i] = pd.get_dummies(features)

    # One-hot encoding of the preserved copy of the sample, all columns kept.
    ds_one_h_collection_er_2[i] = pd.get_dummies(ds_collection_er_2[i])

In [None]:

# Spot check on two samples that keep 'ER_CondPr' and 'RepSrc': fit a Cox model
# (alpha=10) and print its score on the very rows it was fitted on.
features_er_1, target_er_1 = ds_one_h_collection_er[1], y_collection_er[1]
estimator1 = CoxPHSurvivalAnalysis(alpha=10).fit(features_er_1, target_er_1)
print(estimator1.score(features_er_1, target_er_1))

features_er_2, target_er_2 = ds_one_h_collection_er[2], y_collection_er[2]
estimator2 = CoxPHSurvivalAnalysis(alpha=10).fit(features_er_2, target_er_2)
print(estimator2.score(features_er_2, target_er_2))

In [207]:
# Fit one Cox model per sample (features include 'ER_CondPr' and 'RepSrc') and
# record its score on the same sample it was fitted on.
estimator_collection_er = {}
score_collection_er = {}
for sample_idx in range(30):
    sample_features = ds_one_h_collection_er[sample_idx]
    sample_target = y_collection_er[sample_idx]
    fitted_model = CoxPHSurvivalAnalysis(alpha=10).fit(sample_features, sample_target)
    estimator_collection_er[sample_idx] = fitted_model
    score_collection_er[sample_idx] = fitted_model.score(sample_features, sample_target)


In [None]:
# Tabulate the scores of the models that keep 'ER_CondPr' and 'RepSrc':
# one row per sample, keyed by the sample index.
df_er = pd.DataFrame(
    {
        'index': list(score_collection_er.keys()),
        'score_er': list(score_collection_er.values()),
    }
)
df_er.head()

In [209]:
# Pair the two score tables on the sample index so each row compares the same
# sample, then discard the index column (chained instead of an in-place drop).
df_eval = pd.merge(df, df_er, on='index').drop(columns=['index'])

# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
os.makedirs('t-Test Results', exist_ok=True)
df_eval.to_csv(f't-Test Results/eval_results_{state}.csv')

# Preview goes last: a bare expression in the middle of a cell is discarded
# and displays nothing.
df_eval.head()

In [None]:
# Absolute per-sample gap between the two scores, plus its smallest and
# largest value.
score_gap = (df_eval['score_er'] - df_eval['score']).abs()
df_eval['score_diff'] = score_gap
print("Lo", score_gap.min())
print("Hi", score_gap.max())

In [318]:
import csv

from scipy.stats import ttest_rel

# Paired t-test: both score columns come from the same 30 samples, row by row.
statistic, pvalue = ttest_rel(df_eval['score'], df_eval['score_er'])

# Append one result row per run. newline='' is what the csv module requires
# for files handed to csv.writer; without it, platforms that translate line
# endings produce an empty row after every record.
with open('t-Test Results/ttest_result.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([state, statistic, pvalue])