# ***Making the best Model using everything***

# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import random as random
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_score, recall_score, f1_score, r2_score 
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, plot_precision_recall_curve

#Reading dataset

In [2]:
path = '/content/drive/MyDrive/Data 101 DSC/Datathon/competition data/train.csv'
data_train = pd.read_csv(path)
data_train.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,21388,6,10452,DX1 DX4,23,19.199662,YES,RURAL,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,1990,10,5807,DX1 DX5,65,18.354031,YES,URBAN,Stable,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0
2,16710,43,11802,DX6,2,24.73277,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,29293,32,6863,DX2 DX3,57,22.204759,NO,URBAN,Stable,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,1
4,24007,14,1495,DX6,66,23.883381,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


# Copying and Cleaning

In [3]:
data = data_train.copy()

In [4]:
#Duplicated Check
data.duplicated().sum()

0

In [5]:
#Null values check
data.isnull().sum(axis = 0)

ID_Patient_Care_Situation       0
Diagnosed_Condition             0
Patient_ID                      0
Treated_with_drugs              8
Patient_Age                     0
Patient_Body_Mass_Index         0
Patient_Smoker                  0
Patient_Rural_Urban             0
Patient_mental_condition        0
A                            1005
B                            1005
C                            1005
D                            1005
E                            1005
F                            1005
Z                            1005
Number_of_prev_cond          1005
Survived_1_year                 0
dtype: int64

In [6]:
data['A'].fillna(data_train['A'].mode()[0], inplace=True)
data['B'].fillna(data_train['B'].mode()[0], inplace=True)
data['C'].fillna(data_train['C'].mode()[0], inplace=True)
data['D'].fillna(data_train['D'].mode()[0], inplace=True)
data['E'].fillna(data_train['E'].mode()[0], inplace=True)
data['F'].fillna(data_train['F'].mode()[0], inplace=True)
data['Z'].fillna(data_train['Z'].mode()[0], inplace=True)
data['Number_of_prev_cond'].fillna(data_train['Number_of_prev_cond'].mode()[0], inplace=True)
data['Treated_with_drugs'].fillna(data_train['Treated_with_drugs'].mode()[0], inplace=True)

In [7]:
data.isnull().sum(axis=0)

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
Survived_1_year              0
dtype: int64

In [8]:
data['Patient_Smoker'] = data['Patient_Smoker'].apply(lambda x: 1 if x == 'YES' else 0)
data['Patient_Rural_Urban'] = data['Patient_Rural_Urban'].apply(lambda x: 1 if x == 'URBAN' else 0)
data['Patient_mental_condition'] = data['Patient_mental_condition'].apply(lambda x: 1 if x == 'Stable' else 0)

In [9]:
data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,21388,6,10452,DX1 DX4,23,19.199662,1,0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,1990,10,5807,DX1 DX5,65,18.354031,1,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0
2,16710,43,11802,DX6,2,24.73277,0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,29293,32,6863,DX2 DX3,57,22.204759,0,1,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,1
4,24007,14,1495,DX6,66,23.883381,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


# One Hot Encoding

In [10]:
data = pd.get_dummies(data,'Treated_with_drugs')
data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year,Treated_with_drugs_DX1,Treated_with_drugs_DX1 DX2,Treated_with_drugs_DX1 DX2 DX3,Treated_with_drugs_DX1 DX2 DX3 DX4,Treated_with_drugs_DX1 DX2 DX3 DX4 DX5,Treated_with_drugs_DX1 DX2 DX3 DX5,Treated_with_drugs_DX1 DX2 DX4,Treated_with_drugs_DX1 DX2 DX4 DX5,Treated_with_drugs_DX1 DX2 DX5,Treated_with_drugs_DX1 DX3,Treated_with_drugs_DX1 DX3 DX4,Treated_with_drugs_DX1 DX3 DX4 DX5,Treated_with_drugs_DX1 DX3 DX5,Treated_with_drugs_DX1 DX4,Treated_with_drugs_DX1 DX4 DX5,Treated_with_drugs_DX1 DX5,Treated_with_drugs_DX2,Treated_with_drugs_DX2 DX3,Treated_with_drugs_DX2 DX3 DX4,Treated_with_drugs_DX2 DX3 DX4 DX5,Treated_with_drugs_DX2 DX3 DX5,Treated_with_drugs_DX2 DX4,Treated_with_drugs_DX2 DX4 DX5,Treated_with_drugs_DX2 DX5,Treated_with_drugs_DX3,Treated_with_drugs_DX3 DX4,Treated_with_drugs_DX3 DX4 DX5,Treated_with_drugs_DX3 DX5,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6
0,21388,6,10452,23,19.199662,1,0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1990,10,5807,65,18.354031,1,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,16710,43,11802,2,24.73277,0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,29293,32,6863,57,22.204759,0,1,1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,24007,14,1495,66,23.883381,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [11]:
data.isnull().sum()

ID_Patient_Care_Situation                  0
Diagnosed_Condition                        0
Patient_ID                                 0
Patient_Age                                0
Patient_Body_Mass_Index                    0
Patient_Smoker                             0
Patient_Rural_Urban                        0
Patient_mental_condition                   0
A                                          0
B                                          0
C                                          0
D                                          0
E                                          0
F                                          0
Z                                          0
Number_of_prev_cond                        0
Survived_1_year                            0
Treated_with_drugs_DX1                     0
Treated_with_drugs_DX1 DX2                 0
Treated_with_drugs_DX1 DX2 DX3             0
Treated_with_drugs_DX1 DX2 DX3 DX4         0
Treated_with_drugs_DX1 DX2 DX3 DX4 DX5     0
Treated_wi

# Scaling

In [12]:
X = data.drop(columns = ['Survived_1_year'])
Y = data.Survived_1_year 
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data=scaled_features)
scaled_features.columns= X.columns
scaled_features.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Treated_with_drugs_DX1,Treated_with_drugs_DX1 DX2,Treated_with_drugs_DX1 DX2 DX3,Treated_with_drugs_DX1 DX2 DX3 DX4,Treated_with_drugs_DX1 DX2 DX3 DX4 DX5,Treated_with_drugs_DX1 DX2 DX3 DX5,Treated_with_drugs_DX1 DX2 DX4,Treated_with_drugs_DX1 DX2 DX4 DX5,Treated_with_drugs_DX1 DX2 DX5,Treated_with_drugs_DX1 DX3,Treated_with_drugs_DX1 DX3 DX4,Treated_with_drugs_DX1 DX3 DX4 DX5,Treated_with_drugs_DX1 DX3 DX5,Treated_with_drugs_DX1 DX4,Treated_with_drugs_DX1 DX4 DX5,Treated_with_drugs_DX1 DX5,Treated_with_drugs_DX2,Treated_with_drugs_DX2 DX3,Treated_with_drugs_DX2 DX3 DX4,Treated_with_drugs_DX2 DX3 DX4 DX5,Treated_with_drugs_DX2 DX3 DX5,Treated_with_drugs_DX2 DX4,Treated_with_drugs_DX2 DX4 DX5,Treated_with_drugs_DX2 DX5,Treated_with_drugs_DX3,Treated_with_drugs_DX3 DX4,Treated_with_drugs_DX3 DX4 DX5,Treated_with_drugs_DX3 DX5,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6
0,0.505858,-1.364683,1.164159,-0.521228,-1.116737,1.157859,-0.655067,0.0,-3.078066,-0.385629,-0.462854,-0.291164,1.305405,-0.231389,-0.020812,-0.924474,-0.295408,-0.142157,-0.06511,-0.031227,-0.012743,-0.026534,-0.067174,-0.02944,-0.066356,-0.134851,-0.056115,-0.032918,-0.065528,7.605521,-0.060327,-0.133177,-0.301538,-0.131057,-0.059873,-0.030347,-0.064267,-0.132333,-0.061224,-0.135681,-0.294216,-0.142157,-0.063412,-0.136506,-0.290179,-0.139156,-0.298107,-0.76934
1,-1.530762,-1.098336,-0.127553,1.626549,-1.338946,1.157859,1.526561,0.0,-3.078066,2.593164,-0.462854,-0.291164,1.305405,-0.231389,-0.020812,0.378265,-0.295408,-0.142157,-0.06511,-0.031227,-0.012743,-0.026534,-0.067174,-0.02944,-0.066356,-0.134851,-0.056115,-0.032918,-0.065528,-0.131483,-0.060327,7.508794,-0.301538,-0.131057,-0.059873,-0.030347,-0.064267,-0.132333,-0.061224,-0.135681,-0.294216,-0.142157,-0.063412,-0.136506,-0.290179,-0.139156,-0.298107,-0.76934
2,0.014709,1.099031,1.539576,-1.595117,0.337212,-0.863663,-0.655067,0.0,0.324879,-0.385629,-0.462854,-0.291164,-0.766046,-0.231389,-0.020812,-0.924474,-0.295408,-0.142157,-0.06511,-0.031227,-0.012743,-0.026534,-0.067174,-0.02944,-0.066356,-0.134851,-0.056115,-0.032918,-0.065528,-0.131483,-0.060327,-0.133177,-0.301538,-0.131057,-0.059873,-0.030347,-0.064267,-0.132333,-0.061224,-0.135681,-0.294216,-0.142157,-0.063412,-0.136506,-0.290179,-0.139156,-0.298107,1.299815
3,1.335814,0.366576,0.166106,1.217449,-0.32708,-0.863663,1.526561,0.0,0.324879,2.593164,-0.462854,3.434488,-0.766046,-0.231389,-0.020812,1.681003,-0.295408,-0.142157,-0.06511,-0.031227,-0.012743,-0.026534,-0.067174,-0.02944,-0.066356,-0.134851,-0.056115,-0.032918,-0.065528,-0.131483,-0.060327,-0.133177,-0.301538,7.630279,-0.059873,-0.030347,-0.064267,-0.132333,-0.061224,-0.135681,-0.294216,-0.142157,-0.063412,-0.136506,-0.290179,-0.139156,-0.298107,-0.76934
4,0.78083,-0.831988,-1.326662,1.677687,0.114016,1.157859,-0.655067,0.0,0.324879,-0.385629,-0.462854,-0.291164,-0.766046,-0.231389,-0.020812,-0.924474,-0.295408,-0.142157,-0.06511,-0.031227,-0.012743,-0.026534,-0.067174,-0.02944,-0.066356,-0.134851,-0.056115,-0.032918,-0.065528,-0.131483,-0.060327,-0.133177,-0.301538,-0.131057,-0.059873,-0.030347,-0.064267,-0.132333,-0.061224,-0.135681,-0.294216,-0.142157,-0.063412,-0.136506,-0.290179,-0.139156,-0.298107,1.299815


# Splitting

In [13]:
X_train = scaled_features
Y_train = Y

In [14]:
X_train.shape

(18477, 48)

In [15]:
Y_train.shape

(18477,)

# Upsampling

In [16]:
from sklearn.utils import resample
train_data = pd.concat([X_train, Y_train], axis = 1)
not_survived = train_data[train_data.Survived_1_year==0]
survived = train_data[train_data.Survived_1_year==1]
not_survived_upsampled = resample(not_survived,
                           replace = True, # Sample with replacement
                           n_samples = len(survived), # Match number in majority class
                           random_state=27)
upsampled = pd.concat([survived, not_survived_upsampled])
upsampled.Survived_1_year.value_counts()

1    11682
0    11682
Name: Survived_1_year, dtype: int64

# Omitting Irrelevant columns

In [17]:
X_train.columns

Index(['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
       'Patient_Age', 'Patient_Body_Mass_Index', 'Patient_Smoker',
       'Patient_Rural_Urban', 'Patient_mental_condition', 'A', 'B', 'C', 'D',
       'E', 'F', 'Z', 'Number_of_prev_cond', 'Treated_with_drugs_DX1 ',
       'Treated_with_drugs_DX1 DX2 ', 'Treated_with_drugs_DX1 DX2 DX3 ',
       'Treated_with_drugs_DX1 DX2 DX3 DX4 ',
       'Treated_with_drugs_DX1 DX2 DX3 DX4 DX5 ',
       'Treated_with_drugs_DX1 DX2 DX3 DX5 ',
       'Treated_with_drugs_DX1 DX2 DX4 ',
       'Treated_with_drugs_DX1 DX2 DX4 DX5 ',
       'Treated_with_drugs_DX1 DX2 DX5 ', 'Treated_with_drugs_DX1 DX3 ',
       'Treated_with_drugs_DX1 DX3 DX4 ',
       'Treated_with_drugs_DX1 DX3 DX4 DX5 ',
       'Treated_with_drugs_DX1 DX3 DX5 ', 'Treated_with_drugs_DX1 DX4 ',
       'Treated_with_drugs_DX1 DX4 DX5 ', 'Treated_with_drugs_DX1 DX5 ',
       'Treated_with_drugs_DX2 ', 'Treated_with_drugs_DX2 DX3 ',
       'Treated_with_drugs_DX2 DX3 

In [18]:
X_train = X_train.drop(columns=['ID_Patient_Care_Situation','Patient_ID','Treated_with_drugs_DX1 DX2 DX3 DX4 DX5 '])

# Reading Test Data

In [19]:
path_test = '/content/drive/MyDrive/Data 101 DSC/Datathon/competition data/test.csv'
data_test = pd.read_csv(path_test)
data_test.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,9913,2,10247,DX6,7,23.489639,NO,RURAL,Stable,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0
1,20681,40,10919,DX6,66,21.941882,NO,URBAN,Stable,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2264,18,5283,DX3,3,24.579787,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,21587,40,7442,DX1 DX4,9,17.832771,NO,URBAN,Stable,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
4,15876,46,3845,DX1 DX3,64,28.829848,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0


# Copying and Processing data

In [20]:
df_test = data_test.copy()

In [21]:
df_test.isnull().sum(axis =0)

ID_Patient_Care_Situation      0
Diagnosed_Condition            0
Patient_ID                     0
Treated_with_drugs             5
Patient_Age                    0
Patient_Body_Mass_Index        0
Patient_Smoker                 0
Patient_Rural_Urban            0
Patient_mental_condition       0
A                            230
B                            230
C                            230
D                            230
E                            230
F                            230
Z                            230
Number_of_prev_cond          230
dtype: int64

In [22]:
df_test['A'].fillna(data_test['A'].mode()[0], inplace=True)
df_test['B'].fillna(data_test['B'].mode()[0], inplace=True)
df_test['C'].fillna(data_test['C'].mode()[0], inplace=True)
df_test['D'].fillna(data_test['D'].mode()[0], inplace=True)
df_test['E'].fillna(data_test['E'].mode()[0], inplace=True)
df_test['F'].fillna(data_test['F'].mode()[0], inplace=True)
df_test['Z'].fillna(data_test['Z'].mode()[0], inplace=True)
df_test['Number_of_prev_cond'].fillna(data_test['Number_of_prev_cond'].mode()[0], inplace=True)
df_test['Treated_with_drugs'].fillna(data_test['Treated_with_drugs'].mode()[0], inplace=True)

In [23]:
df_test.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,9913,2,10247,DX6,7,23.489639,NO,RURAL,Stable,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0
1,20681,40,10919,DX6,66,21.941882,NO,URBAN,Stable,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2264,18,5283,DX3,3,24.579787,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,21587,40,7442,DX1 DX4,9,17.832771,NO,URBAN,Stable,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
4,15876,46,3845,DX1 DX3,64,28.829848,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0


In [24]:
df_test['Patient_Smoker'] = data_test['Patient_Smoker'].apply(lambda x: 1 if x == 'YES' else 0)
df_test['Patient_Rural_Urban'] = data_test['Patient_Rural_Urban'].apply(lambda x: 1 if x == 'URBAN' else 0)
df_test['Patient_mental_condition'] = data_test['Patient_mental_condition'].apply(lambda x: 1 if x == 'Stable' else 0)

In [25]:
df_test.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,9913,2,10247,DX6,7,23.489639,0,0,1,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0
1,20681,40,10919,DX6,66,21.941882,0,1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2264,18,5283,DX3,3,24.579787,0,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,21587,40,7442,DX1 DX4,9,17.832771,0,1,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
4,15876,46,3845,DX1 DX3,64,28.829848,1,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0


In [26]:
df_test.isnull().sum(axis =0)

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
dtype: int64

In [27]:
final_df_test= pd.get_dummies(df_test,'Treated_with_drugs')

In [28]:
final_df_test.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Treated_with_drugs_DX1,Treated_with_drugs_DX1 DX2,Treated_with_drugs_DX1 DX2 DX3,Treated_with_drugs_DX1 DX2 DX3 DX4,Treated_with_drugs_DX1 DX2 DX3 DX5,Treated_with_drugs_DX1 DX2 DX4,Treated_with_drugs_DX1 DX2 DX4 DX5,Treated_with_drugs_DX1 DX2 DX5,Treated_with_drugs_DX1 DX3,Treated_with_drugs_DX1 DX3 DX4,Treated_with_drugs_DX1 DX3 DX4 DX5,Treated_with_drugs_DX1 DX3 DX5,Treated_with_drugs_DX1 DX4,Treated_with_drugs_DX1 DX4 DX5,Treated_with_drugs_DX1 DX5,Treated_with_drugs_DX2,Treated_with_drugs_DX2 DX3,Treated_with_drugs_DX2 DX3 DX4,Treated_with_drugs_DX2 DX3 DX4 DX5,Treated_with_drugs_DX2 DX3 DX5,Treated_with_drugs_DX2 DX4,Treated_with_drugs_DX2 DX4 DX5,Treated_with_drugs_DX2 DX5,Treated_with_drugs_DX3,Treated_with_drugs_DX3 DX4,Treated_with_drugs_DX3 DX4 DX5,Treated_with_drugs_DX3 DX5,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6
0,9913,2,10247,7,23.489639,0,0,1,1.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,20681,40,10919,66,21.941882,0,1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2264,18,5283,3,24.579787,0,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,21587,40,7442,9,17.832771,0,1,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,15876,46,3845,64,28.829848,1,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
final_df_test.columns

Index(['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
       'Patient_Age', 'Patient_Body_Mass_Index', 'Patient_Smoker',
       'Patient_Rural_Urban', 'Patient_mental_condition', 'A', 'B', 'C', 'D',
       'E', 'F', 'Z', 'Number_of_prev_cond', 'Treated_with_drugs_DX1 ',
       'Treated_with_drugs_DX1 DX2 ', 'Treated_with_drugs_DX1 DX2 DX3 ',
       'Treated_with_drugs_DX1 DX2 DX3 DX4 ',
       'Treated_with_drugs_DX1 DX2 DX3 DX5 ',
       'Treated_with_drugs_DX1 DX2 DX4 ',
       'Treated_with_drugs_DX1 DX2 DX4 DX5 ',
       'Treated_with_drugs_DX1 DX2 DX5 ', 'Treated_with_drugs_DX1 DX3 ',
       'Treated_with_drugs_DX1 DX3 DX4 ',
       'Treated_with_drugs_DX1 DX3 DX4 DX5 ',
       'Treated_with_drugs_DX1 DX3 DX5 ', 'Treated_with_drugs_DX1 DX4 ',
       'Treated_with_drugs_DX1 DX4 DX5 ', 'Treated_with_drugs_DX1 DX5 ',
       'Treated_with_drugs_DX2 ', 'Treated_with_drugs_DX2 DX3 ',
       'Treated_with_drugs_DX2 DX3 DX4 ',
       'Treated_with_drugs_DX2 DX3 DX4 DX5 

In [30]:
final_df_test = final_df_test.drop(columns=['ID_Patient_Care_Situation', 'Patient_ID'])

In [31]:
final_df_test.shape

(4620, 45)

In [32]:
X_test = final_df_test.copy()

In [33]:
X_test.shape

(4620, 45)

In [62]:
X = X_test
Y = data.Survived_1_year 
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data=scaled_features)
scaled_features.columns= X.columns
scaled_features.head()

Unnamed: 0,Diagnosed_Condition,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Treated_with_drugs_DX1,Treated_with_drugs_DX1 DX2,Treated_with_drugs_DX1 DX2 DX3,Treated_with_drugs_DX1 DX2 DX3 DX4,Treated_with_drugs_DX1 DX2 DX3 DX5,Treated_with_drugs_DX1 DX2 DX4,Treated_with_drugs_DX1 DX2 DX4 DX5,Treated_with_drugs_DX1 DX2 DX5,Treated_with_drugs_DX1 DX3,Treated_with_drugs_DX1 DX3 DX4,Treated_with_drugs_DX1 DX3 DX4 DX5,Treated_with_drugs_DX1 DX3 DX5,Treated_with_drugs_DX1 DX4,Treated_with_drugs_DX1 DX4 DX5,Treated_with_drugs_DX1 DX5,Treated_with_drugs_DX2,Treated_with_drugs_DX2 DX3,Treated_with_drugs_DX2 DX3 DX4,Treated_with_drugs_DX2 DX3 DX4 DX5,Treated_with_drugs_DX2 DX3 DX5,Treated_with_drugs_DX2 DX4,Treated_with_drugs_DX2 DX4 DX5,Treated_with_drugs_DX2 DX5,Treated_with_drugs_DX3,Treated_with_drugs_DX3 DX4,Treated_with_drugs_DX3 DX4 DX5,Treated_with_drugs_DX3 DX5,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6
0,-1.597647,-1.34575,0.003536,-0.852325,-0.664453,0.0,0.335733,-0.382253,2.208588,3.321333,1.272418,-0.231499,-0.032915,2.96781,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,1.281214
1,0.922868,1.675745,-0.402112,-0.852325,1.504998,0.0,0.335733,2.616066,-0.452778,-0.301084,-0.785905,-0.231499,-0.032915,0.372205,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,1.281214
2,-0.536378,-1.550597,0.28925,-0.852325,1.504998,0.0,0.335733,-0.382253,-0.452778,-0.301084,-0.785905,-0.231499,-0.032915,-0.925597,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,3.450366,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051
3,0.922868,-1.243327,-1.479059,-0.852325,1.504998,0.0,0.335733,-0.382253,2.208588,3.321333,-0.785905,-0.231499,-0.032915,1.670007,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,6.938943,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051
4,1.320844,1.573321,1.403138,1.173262,-0.664453,0.0,0.335733,-0.382253,-0.452778,-0.301084,1.272418,-0.231499,-0.032915,0.372205,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,6.938943,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051


In [63]:
X_test = scaled_features.copy()

In [64]:
X_test.head()

Unnamed: 0,Diagnosed_Condition,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Treated_with_drugs_DX1,Treated_with_drugs_DX1 DX2,Treated_with_drugs_DX1 DX2 DX3,Treated_with_drugs_DX1 DX2 DX3 DX4,Treated_with_drugs_DX1 DX2 DX3 DX5,Treated_with_drugs_DX1 DX2 DX4,Treated_with_drugs_DX1 DX2 DX4 DX5,Treated_with_drugs_DX1 DX2 DX5,Treated_with_drugs_DX1 DX3,Treated_with_drugs_DX1 DX3 DX4,Treated_with_drugs_DX1 DX3 DX4 DX5,Treated_with_drugs_DX1 DX3 DX5,Treated_with_drugs_DX1 DX4,Treated_with_drugs_DX1 DX4 DX5,Treated_with_drugs_DX1 DX5,Treated_with_drugs_DX2,Treated_with_drugs_DX2 DX3,Treated_with_drugs_DX2 DX3 DX4,Treated_with_drugs_DX2 DX3 DX4 DX5,Treated_with_drugs_DX2 DX3 DX5,Treated_with_drugs_DX2 DX4,Treated_with_drugs_DX2 DX4 DX5,Treated_with_drugs_DX2 DX5,Treated_with_drugs_DX3,Treated_with_drugs_DX3 DX4,Treated_with_drugs_DX3 DX4 DX5,Treated_with_drugs_DX3 DX5,Treated_with_drugs_DX4,Treated_with_drugs_DX4 DX5,Treated_with_drugs_DX5,Treated_with_drugs_DX6
0,-1.597647,-1.34575,0.003536,-0.852325,-0.664453,0.0,0.335733,-0.382253,2.208588,3.321333,1.272418,-0.231499,-0.032915,2.96781,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,1.281214
1,0.922868,1.675745,-0.402112,-0.852325,1.504998,0.0,0.335733,2.616066,-0.452778,-0.301084,-0.785905,-0.231499,-0.032915,0.372205,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,1.281214
2,-0.536378,-1.550597,0.28925,-0.852325,1.504998,0.0,0.335733,-0.382253,-0.452778,-0.301084,-0.785905,-0.231499,-0.032915,-0.925597,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,3.450366,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051
3,0.922868,-1.243327,-1.479059,-0.852325,1.504998,0.0,0.335733,-0.382253,2.208588,3.321333,-0.785905,-0.231499,-0.032915,1.670007,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,-0.144114,-0.083515,-0.029437,-0.069171,6.938943,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051
4,1.320844,1.573321,1.403138,1.173262,-0.664453,0.0,0.335733,-0.382253,-0.452778,-0.301084,1.272418,-0.231499,-0.032915,0.372205,-0.287183,-0.134423,-0.060772,-0.036061,-0.014714,-0.058951,-0.020811,-0.069171,6.938943,-0.083515,-0.029437,-0.069171,-0.144114,-0.053121,-0.132745,-0.292449,-0.137724,-0.067574,-0.032915,-0.057073,-0.149499,-0.057073,-0.120386,-0.289824,-0.134423,-0.069171,-0.123132,-0.289385,-0.125822,-0.308296,-0.78051


# Reading Submission file

Reading the submission file provides us with the dataframe that we can use as our predictions

In [34]:
sample_submission_data_path = '/content/drive/MyDrive/Data 101 DSC/Datathon/competition data/sampleSubmission.csv' 
y_test = pd.read_csv(sample_submission_data_path)

In [35]:
y_test.shape

(4620, 2)

In [36]:
y_test.columns

Index(['ID_Patient_Care_Situation', 'Survived_1_year'], dtype='object')

In [37]:
Y_test = y_test.drop(columns=['ID_Patient_Care_Situation'])

In [38]:
Y_test.shape

(4620, 1)

In [39]:
Y_test

Unnamed: 0,Survived_1_year
0,0
1,1
2,0
3,1
4,0
...,...
4615,1
4616,0
4617,1
4618,0


# Modelling

In [66]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,Y_train)
print("Accuracy of train:",dt.score(X_train,Y_train))
print("Accuracy of test:",dt.score(X_test,Y_test))

Accuracy of train: 0.9994587865995562
Accuracy of test: 0.49523809523809526


In [67]:
final_model = tree.DecisionTreeClassifier()
final_model = final_model.fit(X = X_train,y = Y_train)
predictions_tree_model = final_model.predict(X_test)
print(classification_report(Y_test, predictions_tree_model, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.49      0.36      0.41      2310
           1       0.50      0.63      0.56      2310

    accuracy                           0.49      4620
   macro avg       0.49      0.49      0.48      4620
weighted avg       0.49      0.49      0.48      4620



# Submission

In [68]:
predictions_tree_model

array([1, 1, 1, ..., 0, 1, 1])

In [69]:
pred_df = pd.DataFrame(predictions_tree_model)
pred_df

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
4615,0
4616,1
4617,0
4618,1


In [70]:
pred_df.value_counts()

1    2945
0    1675
dtype: int64

Making a submission file using the above DataFrame

In [71]:
df_submission_try2 = pd.DataFrame(columns=["ID_Patient_Care_Situation","Survived_1_year"])
df_submission_try2["ID_Patient_Care_Situation"] = y_test.ID_Patient_Care_Situation
df_submission_try2["Survived_1_year"] = pred_df
df_submission_try2

Unnamed: 0,ID_Patient_Care_Situation,Survived_1_year
0,9913,1
1,20681,1
2,2264,1
3,21587,1
4,15876,0
...,...,...
4615,18001,0
4616,24900,1
4617,10998,0
4618,25628,1


# Exporting

In [72]:
df_submission_try2.to_csv('/content/drive/MyDrive/Data 101 DSC/Datathon/try2_tree_submission.csv', index=False)