# Kaggle Upload
How to upload predictions to Kaggle

## 1. Training Classifier on Hospital Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
hospital_df = pd.read_csv("C:/Users/Vic/Desktop/Data Scienece/Datasets-20231016/Dataset_Hospital_Visits.csv", index_col=None)

In [3]:
hospital_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88421 entries, 0 to 88420
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PatientId        88421 non-null  float64
 1   AppointmentID    88421 non-null  int64  
 2   Sex              88421 non-null  object 
 3   ScheduledDate    88421 non-null  object 
 4   AppointmentDate  88421 non-null  object 
 5   Age              79614 non-null  float64
 6   Community        77708 non-null  object 
 7   SocialWelfare    75902 non-null  object 
 8   Hipertension     80400 non-null  object 
 9   Diabetes         88421 non-null  object 
 10  Alcoholism       73532 non-null  object 
 11  Handcap          88421 non-null  object 
 12  SMS_received     88421 non-null  object 
 13  No-show          88421 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 9.4+ MB


In [4]:
# naive dummy coding approach. Your's should be better ;-)
hospital_df['Sex'] = pd.factorize(hospital_df['Sex'])[0]
hospital_df['Community'] = pd.factorize(hospital_df['Community'])[0]
hospital_df['SocialWelfare'] = pd.factorize(hospital_df['SocialWelfare'])[0]
hospital_df['Hipertension'] = pd.factorize(hospital_df['Hipertension'])[0]
hospital_df['Diabetes'] = pd.factorize(hospital_df['Diabetes'])[0]
hospital_df['Alcoholism'] = pd.factorize(hospital_df['Alcoholism'])[0]
hospital_df['Handcap'] = pd.factorize(hospital_df['Handcap'])[0]
hospital_df['SMS_received'] = pd.factorize(hospital_df['SMS_received'])[0]
hospital_df['ScheduledDate'] = pd.factorize(hospital_df['ScheduledDate'])[0]
hospital_df['AppointmentDate'] = pd.factorize(hospital_df['AppointmentDate'])[0]

In [5]:
hospital_df.head()

Unnamed: 0,PatientId,AppointmentID,Sex,ScheduledDate,AppointmentDate,Age,Community,SocialWelfare,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,47385270000000.0,5387604,0,0,0,,0,0,0,0,0,0,0,No
1,65574950000000.0,5655266,1,1,1,4.0,-1,-1,-1,0,0,0,0,No
2,126547300000.0,5745855,0,2,2,19.0,1,0,0,0,0,0,0,No
3,26817690000000.0,5700247,0,3,3,55.0,2,0,1,0,0,0,0,No
4,78135650000000.0,5656211,0,4,4,0.0,3,-1,0,0,0,0,0,No


In [6]:
# remove all nan (just for this example! You should find other ways of dealing with Nan's)
hospital_df = hospital_df.dropna()

# apply train test split
X = hospital_df.drop(["No-show"], axis=1)
y = hospital_df["No-show"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=1)

# instantiate plain decision tree classifier
clf = DecisionTreeClassifier()

# train plain decision tree classifier
clf.fit(X_train, y_train)

# predict for all appointment ids
y_pred = clf.predict(X_test)

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.83      0.80      0.81     12704
         Yes       0.30      0.33      0.31      3219

    accuracy                           0.71     15923
   macro avg       0.56      0.57      0.56     15923
weighted avg       0.72      0.71      0.71     15923



## 2. Use Classifier on Test-set

In [8]:
# get test-data
test_df = pd.read_csv("C:/Users/Vic/Desktop/Data Scienece/Datasets-20231016/test.csv")

In [9]:
hospital_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79614 entries, 1 to 88420
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PatientId        79614 non-null  float64
 1   AppointmentID    79614 non-null  int64  
 2   Sex              79614 non-null  int64  
 3   ScheduledDate    79614 non-null  int64  
 4   AppointmentDate  79614 non-null  int64  
 5   Age              79614 non-null  float64
 6   Community        79614 non-null  int64  
 7   SocialWelfare    79614 non-null  int64  
 8   Hipertension     79614 non-null  int64  
 9   Diabetes         79614 non-null  int64  
 10  Alcoholism       79614 non-null  int64  
 11  Handcap          79614 non-null  int64  
 12  SMS_received     79614 non-null  int64  
 13  No-show          79614 non-null  object 
dtypes: float64(2), int64(11), object(1)
memory usage: 9.1+ MB


In [10]:
test_df.head()

Unnamed: 0,PatientId,AppointmentID,Sex,ScheduledDate,AppointmentDate,Age,Community,SocialWelfare,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
0,92252820000000.0,5620835,F,2016-04-26T09:30:02Z,2016-05-10T00:00:00Z,30.0,SANTO ANTÔNIO,no,no,no,no,no,yes
1,66493800000000.0,5741692,M,2016-05-30T07:20:39Z,2016-05-31T00:00:00Z,51.0,CRUZAMENTO,no,no,no,,no,no
2,9986664000000.0,5673005,M,2016-05-09T08:47:24Z,2016-05-11T00:00:00Z,37.0,CENTRO,no,no,no,no,no,no
3,222566100000000.0,5579701,M,2016-04-13T15:26:41Z,2016-05-03T00:00:00Z,62.0,JARDIM DA PENHA,no,no,no,no,no,yes
4,388769000000000.0,5652332,M,2016-05-03T09:57:33Z,2016-05-24T00:00:00Z,69.0,PRAIA DO SUÁ,no,no,no,no,no,yes


In [11]:
# apply all data transformations (typically via pipeline)

# naive dummy coding approach. Your's should be better ;-)
test_df['Sex'] = pd.factorize(test_df['Sex'])[0]
test_df['Community'] = pd.factorize(test_df['Community'])[0]
test_df['SocialWelfare'] = pd.factorize(test_df['SocialWelfare'])[0]
test_df['Hipertension'] = pd.factorize(test_df['Hipertension'])[0]
test_df['Diabetes'] = pd.factorize(test_df['Diabetes'])[0]
test_df['Alcoholism'] = pd.factorize(test_df['Alcoholism'])[0]
test_df['Handcap'] = pd.factorize(test_df['Handcap'])[0]
test_df['SMS_received'] = pd.factorize(test_df['SMS_received'])[0]
test_df['ScheduledDate'] = pd.factorize(test_df['ScheduledDate'])[0]
test_df['AppointmentDate'] = pd.factorize(test_df['AppointmentDate'])[0]

In [12]:
# make predictions
predictions = clf.predict(test_df)

In [13]:
predictions

array(['No', 'No', 'Yes', ..., 'Yes', 'No', 'Yes'], dtype=object)

In [14]:
# create dataframe with IDs and predictions
submission = pd.DataFrame({'AppointmentID': test_df['AppointmentID'], 'No-show': predictions})

In [15]:
submission

Unnamed: 0,AppointmentID,No-show
0,5620835,No
1,5741692,No
2,5673005,Yes
3,5579701,Yes
4,5652332,No
...,...,...
22101,5728772,No
22102,5721034,No
22103,5413621,Yes
22104,5746319,No


In [16]:
# Naming sould have the following format
filepath = 'C:/Users/Vic/Desktop/Data Scienece/Datasets-20231016/Hospital_Predictions_1.csv'
submission.to_csv(filepath, index=False)