In [3]:
import pandas as pd

# Download the CSV file
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTjIHwQmGlGPRC-n1Dlm8wBGEwtyZ1jS5EqbB4Os7kfl73G6TsO-pcVcY8E38qScOhh9rPz0n0Y_4ge/pub?output=csv'
df = pd.read_csv(url)

# Update the values
mapping = {
    1: 'branca',
    2: 'preta',
    3: 'parda',
    4: 'amarela',
    5: 'indígena',
    99: 'sem informação'
}

df['racacor'] = df['racacor'].map(mapping)

# Save the modified DataFrame as a new CSV file
output_file = 'modified_data.csv'
df.to_csv(output_file, index=False)

print(f"The modified data has been saved to {output_file}.")


The modified data has been saved to modified_data.csv.


In [4]:
# Update the values
mapping = {
    1: 'cirurgia oncológica',
    2: 'quimioterapia',
    3: 'radioterapia',
    4: 'diagnóstico',
    5: 'outros de oncologia',
    6: 'outras cirurgias com os CIDs de câncer "C" ou "D"'
}

df['tipo_proc_min'] = df['tipo_proc_min'].map(mapping)

In [5]:
df

Unnamed: 0,sexo,racacor,qt_apac,qt_aih,obito,estadio,idade,tipo_proc_min,cidpri_min,n_comorb,...,tempo_internacao,tempo_coorte,estadio_rec,f_org_tto_min_rec,perfil_rec,cir_proc_tto_min_rec,time_diag,macro_tto,macro_pcn,estado_estabel
0,M,parda,1,4,1,4,57,outros de oncologia,C349,6,...,50,3,Doença Avançada,Cirurgia,Cirur + Quimio,LINFADENECTOMIA,Depois,Dentro,Centro,MG
1,M,branca,2,2,1,4,51,quimioterapia,C342,4,...,7,6,Doença Avançada,Quimioterapia,Quimioterapia,,Antes,Fora,Triângulo do Sul,SP
2,M,branca,2,8,1,2,64,outros de oncologia,C341,6,...,79,2,Doença Precoce,Quimioterapia,Quimio + Radio,,Depois,Dentro,Centro,MG
3,F,branca,1,1,1,4,75,quimioterapia,C349,4,...,4,3,Doença Avançada,Quimioterapia,Quimioterapia,,Antes,Dentro,Centro,MG
4,M,sem informação,4,1,1,3,68,quimioterapia,C349,1,...,4,7,Doença Avançada,Quimioterapia,Quimio + Radio,,Antes,Dentro,Sudeste,MG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,F,sem informação,2,0,0,3,54,quimioterapia,C349,1,...,0,6,Doença Avançada,Quimioterapia,Quimioterapia,,Antes,Dentro,Centro,MG
1028,M,parda,1,2,1,3,72,quimioterapia,C349,4,...,3,3,Doença Avançada,Quimioterapia,Quimioterapia,,Antes,Dentro,Leste,MG
1029,F,branca,9,0,0,4,49,radioterapia,C348,1,...,0,32,Doença Avançada,Radioterapia,Quimio + Radio,,Antes,Dentro,Centro,MG
1030,M,sem informação,3,6,1,3,67,quimioterapia,C340,8,...,33,26,Doença Avançada,Quimioterapia,Quimioterapia,,Antes,Fora,Jequitinhonha,MG


In [6]:
print(df.loc[1])

sexo                                   M
racacor                           branca
qt_apac                                2
qt_aih                                 2
obito                                  1
estadio                                4
idade                                 51
tipo_proc_min              quimioterapia
cidpri_min                          C342
n_comorb                               4
cid_tto_min                         C342
tipo_tto_min                           2
tempo_diag_tto_min                    47
tempo_internacao                       7
tempo_coorte                           6
estadio_rec              Doença Avançada
f_org_tto_min_rec          Quimioterapia
perfil_rec                 Quimioterapia
cir_proc_tto_min_rec                 NaN
time_diag                          Antes
macro_tto                           Fora
macro_pcn               Triângulo do Sul
estado_estabel                        SP
Name: 1, dtype: object


In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
df['cir_proc_tto_min_rec'] = df['cir_proc_tto_min_rec'].replace('CIRURGIAS MULTIPLAS E SEQUENCIAIS', 'multiplas_cir_seq')

# Convert categorical variables to one-hot encoded columns
categorical_cols = ['sexo','cir_proc_tto_min_rec', 'racacor', 'estadio', 'tipo_proc_min', 'cidpri_min', 'cid_tto_min', 'tipo_tto_min', 'estadio_rec', 'f_org_tto_min_rec', 'perfil_rec', 'time_diag', 'macro_tto', 'macro_pcn', 'estado_estabel']  
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Convert numeric variables to the correct data type
numeric_cols = ['qt_apac', 'qt_aih', 'idade', 'n_comorb', 'tempo_diag_tto_min', 'tempo_internacao', 'tempo_coorte']  
df_encoded[numeric_cols] = df_encoded[numeric_cols].astype(float)

# Apply Z-score normalization to numeric columns
scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# Prepare the data
X = df_encoded.drop('obito', axis=1)  # Features
y = df_encoded['obito']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=9)

# Train the logistic regression model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.68


In [69]:
import xgboost as xgb

# Create XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='logloss')

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.73


In [41]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Define the parameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create a base model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='logloss')

# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Get the best score (using the 'accuracy' metric as we've specified)
print("Best accuracy found: ", grid_search.best_score_)

# Train the best model
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Get feature importances
importances = best_model.feature_importances_
feature_names = X_train.columns

# Print feature importances
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance:.4f}")

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")


Best parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best accuracy found:  0.7575757575757575
qt_apac: 0.0285
qt_aih: 0.0427
idade: 0.0243
n_comorb: 0.0540
tempo_diag_tto_min: 0.0239
tempo_internacao: 0.0292
tempo_coorte: 0.0628
sexo_F: 0.0268
sexo_M: 0.0000
cir_proc_tto_min_rec_LINFADENECTOMIA: 0.0000
cir_proc_tto_min_rec_LOBECTOMIA: 0.0000
cir_proc_tto_min_rec_PNEUMOMECTOMIA: 0.0000
cir_proc_tto_min_rec_RESSECÇÃO DE TUMOR DO MEDIASTINO: 0.0000
cir_proc_tto_min_rec_RESSECÇÃO EM CUNHA: 0.0000
cir_proc_tto_min_rec_multiplas_cir_seq: 0.0000
racacor_amarela: 0.0067
racacor_branca: 0.0055
racacor_indígena: 0.0000
racacor_parda: 0.0000
racacor_preta: 0.0057
racacor_sem informação: 0.0266
estadio_1: 0.0000
estadio_2: 0.0310
estadio_3: 0.0000
estadio_4: 0.0270
tipo_proc_min_cirurgia oncológica: 0.0000
tipo_proc_min_diagnóstico: 0.0000
tipo_proc_min_outras cirurgias com os CIDs de câncer "C" ou "D": 0.0000
tipo_proc_min_outros de oncologia: 0.0296
tipo_proc_min_q

In [37]:
len(y_test)

207

In [70]:
# Get feature names
feature_names = X.columns

# Get the coefficients
coefficients = model.coef_[0]

# Combine feature names and coefficients into a DataFrame
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate absolute values of coefficients for ranking
coef_df['abs_coef'] = abs(coef_df['Coefficient'])

# Sort the DataFrame by absolute value of coefficient (from high to low)
coef_df = coef_df.sort_values(by='abs_coef', ascending=False)

# Print the DataFrame
print(coef_df)


AttributeError: Coefficients are not defined for Booster type None

In [None]:
import numpy as np

# Get feature names
feature_names = X.columns

# Get the coefficients
coefficients = model.coef_[0]

# Calculate the odds ratios from the coefficients
odds_ratios = np.exp(coefficients)

# Combine feature names, coefficients and odds ratios into a DataFrame
coef_odds_df = pd.DataFrame({
    'Feature': feature_names, 
    'Coefficient': coefficients, 
    'Odds_Ratio': odds_ratios,
    'abs_coef': abs(coefficients),
    })

# Sort the DataFrame by absolute value of coefficient (from high to low)
coef_odds_df = coef_odds_df.sort_values(by='abs_coef', ascending=False)

# Use pandas option to display all rows
pd.set_option('display.max_rows', None)

# Print the DataFrame
print(coef_odds_df)


In [60]:
df=coef_odds_df

In [61]:
# Filter features with positive coefficients
positive_coef = df[df['Coefficient'] > 0]
positive_coef_sorted = positive_coef.sort_values(by='abs_coef', ascending=False)

# Filter features with negative coefficients
negative_coef = df[df['Coefficient'] < 0]
negative_coef_sorted = negative_coef.sort_values(by='abs_coef', ascending=False)

# Print features with positive coefficients
print("Features with positive coefficients:")
print(positive_coef_sorted)

# Print features with negative coefficients
print("Features with negative coefficients:")
print(negative_coef_sorted)



Features with positive coefficients:
                                              Feature  Coefficient  \
68                                    time_diag_Antes     0.740617   
73                               macro_pcn_Centro Sul     0.719079   
67                            perfil_rec_Radioterapia     0.669717   
44                                   cid_tto_min_C341     0.635375   
47                                   cid_tto_min_C348     0.560785   
35                                    cidpri_min_C343     0.526151   
17                                   racacor_indígena     0.476638   
32                                    cidpri_min_C340     0.458015   
79                                    macro_pcn_Norte     0.355486   
12  cir_proc_tto_min_rec_RESSECÇÃO DE TUMOR DO MED...     0.351232   
1                                              qt_aih     0.345481   
9                cir_proc_tto_min_rec_LINFADENECTOMIA     0.344668   
21                                          estadio_1

In [62]:
# Save features with positive coefficients to a CSV file
positive_coef_sorted.to_csv('features_positive_coef.csv', index=False)

# Save features with negative coefficients to a CSV file
negative_coef_sorted.to_csv('features_negative_coef.csv', index=False)
