In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
df = pd.read_csv('dataset_med.csv')
print(df.head())

   id   age  gender      country diagnosis_date cancer_stage family_history  \
0   1  64.0    Male       Sweden     2016-04-05      Stage I            Yes   
1   2  50.0  Female  Netherlands     2023-04-20    Stage III            Yes   
2   3  65.0  Female      Hungary     2023-04-05    Stage III            Yes   
3   4  51.0  Female      Belgium     2016-02-05      Stage I             No   
4   5  37.0    Male   Luxembourg     2023-11-29      Stage I             No   

   smoking_status   bmi  cholesterol_level  hypertension  asthma  cirrhosis  \
0  Passive Smoker  29.4                199             0       0          1   
1  Passive Smoker  41.2                280             1       1          0   
2   Former Smoker  44.0                268             1       1          0   
3  Passive Smoker  43.0                241             1       1          0   
4  Passive Smoker  19.7                178             0       0          0   

   other_cancer treatment_type end_treatment_date 

In [5]:
x = df.iloc[:, :-1]
print(x.head())

   id   age  gender      country diagnosis_date cancer_stage family_history  \
0   1  64.0    Male       Sweden     2016-04-05      Stage I            Yes   
1   2  50.0  Female  Netherlands     2023-04-20    Stage III            Yes   
2   3  65.0  Female      Hungary     2023-04-05    Stage III            Yes   
3   4  51.0  Female      Belgium     2016-02-05      Stage I             No   
4   5  37.0    Male   Luxembourg     2023-11-29      Stage I             No   

   smoking_status   bmi  cholesterol_level  hypertension  asthma  cirrhosis  \
0  Passive Smoker  29.4                199             0       0          1   
1  Passive Smoker  41.2                280             1       1          0   
2   Former Smoker  44.0                268             1       1          0   
3  Passive Smoker  43.0                241             1       1          0   
4  Passive Smoker  19.7                178             0       0          0   

   other_cancer treatment_type end_treatment_date 

In [7]:
y = df.iloc[:, -1]
print(y.head())

0    0
1    1
2    0
3    0
4    0
Name: survived, dtype: int64


In [9]:
x['diagnosis_date'] = pd.to_datetime(x['diagnosis_date'])
x['end_treatment_date'] = pd.to_datetime(x['end_treatment_date'])
x['treatment_duration_days'] = (x['end_treatment_date'] - x['diagnosis_date']).dt.days
x.drop(['diagnosis_date', 'end_treatment_date'], axis = 1, inplace = True) 
x.drop(['id'], axis = 1, inplace = True)
print(x.head())

    age  gender      country cancer_stage family_history  smoking_status  \
0  64.0    Male       Sweden      Stage I            Yes  Passive Smoker   
1  50.0  Female  Netherlands    Stage III            Yes  Passive Smoker   
2  65.0  Female      Hungary    Stage III            Yes   Former Smoker   
3  51.0  Female      Belgium      Stage I             No  Passive Smoker   
4  37.0    Male   Luxembourg      Stage I             No  Passive Smoker   

    bmi  cholesterol_level  hypertension  asthma  cirrhosis  other_cancer  \
0  29.4                199             0       0          1             0   
1  41.2                280             1       1          0             0   
2  44.0                268             1       1          0             0   
3  43.0                241             1       1          0             0   
4  19.7                178             0       0          0             0   

  treatment_type  treatment_duration_days  
0   Chemotherapy                    

In [11]:
le = LabelEncoder()
x.gender = le.fit_transform(x.gender)
x.country = le.fit_transform(x.country)
x.cancer_stage = le.fit_transform(x.cancer_stage)
x.family_history = le.fit_transform(x.family_history)
x.smoking_status = le.fit_transform(x.smoking_status)
x.treatment_type = le.fit_transform(x.treatment_type)
print(x.head())

    age  gender  country  cancer_stage  family_history  smoking_status   bmi  \
0  64.0       1       26             0               1               3  29.4   
1  50.0       0       19             2               1               3  41.2   
2  65.0       0       12             2               1               1  44.0   
3  51.0       0        1             0               0               3  43.0   
4  37.0       1       17             0               0               3  19.7   

   cholesterol_level  hypertension  asthma  cirrhosis  other_cancer  \
0                199             0       0          1             0   
1                280             1       1          0             0   
2                268             1       1          0             0   
3                241             1       1          0             0   
4                178             0       0          0             0   

   treatment_type  treatment_duration_days  
0               0                      523  
1 

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [37]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [63]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
model = XGBClassifier(random_state = 42 )
model.fit(x_train_resampled, y_train_resampled)

In [65]:
y_pred = model.predict(x_test)
print("The predicted values:", y_pred[:20])

The predicted values: [0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0]


In [43]:
print(y_test[:20])

86096     1
72172     0
575194    0
703991    1
864416    1
828393    0
527208    0
367988    0
39554     0
151073    1
574706    0
28205     1
739895    0
474227    0
196195    0
874130    0
396204    1
332059    0
407377    1
367539    0
Name: survived, dtype: int64


In [67]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7150112359550562


In [69]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83    138893
           1       0.22      0.12      0.15     39107

    accuracy                           0.72    178000
   macro avg       0.50      0.50      0.49    178000
weighted avg       0.66      0.72      0.68    178000

