# The problem 

We are trying to predict if a given patient will go to their appointment or not. 
This is a binary classification problem.

## Import the dataset

In [2]:
import pandas as pd

df = pd.read_csv('KaggleV2-May-2016.csv')

Now let's take a look at the data

In [3]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [4]:
df.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


We have 14 columns, and 110527 rows.

In [5]:
df.duplicated().sum()

0

We have no duplicated rows.
Now let's fix that column name and check for missing values.

In [6]:
df.rename(columns={'Hipertension':'Hypertension', 'Handcap':'Handicap'}, inplace=True)
df.columns = df.columns.str.lower().str.replace('-', '_')

df.head()

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [7]:
df.query('age < 0')

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
99832,465943200000000.0,5775010,F,2016-06-06T08:58:13Z,2016-06-06T00:00:00Z,-1,ROMÃO,0,0,0,0,0,0,No


In [8]:
df.drop(df.query('age < 0').index, inplace=True)

df['appointmentid'].count()

110526

The data is now clean.

# The analysis


Let's prepare the data

In [12]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

X = df.drop(['no_show', 'appointmentid', 'patientid'], axis=1)

y = df['no_show'].map({'No': 0, 'Yes': 1})

encoder = LabelEncoder()
features = X.columns
X = X.apply(encoder.fit_transform)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X = pd.DataFrame(X, columns=features)
X

Unnamed: 0,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received
0,0.0,0.267907,0.000000,0.607843,0.4875,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.265609,0.000000,0.549020,0.4875,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.265947,0.000000,0.607843,0.5625,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.267589,0.000000,0.078431,0.6750,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.265551,0.000000,0.549020,0.4875,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
110521,0.0,0.320454,0.961538,0.549020,0.5375,0.0,0.0,0.0,0.0,0.0,1.0
110522,0.0,0.310844,0.961538,0.500000,0.5375,0.0,0.0,0.0,0.0,0.0,1.0
110523,0.0,0.207983,0.961538,0.205882,0.5375,0.0,0.0,0.0,0.0,0.0,1.0
110524,0.0,0.206660,0.961538,0.372549,0.5375,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
X_reduced = tsne.fit_transform(X)
plt.figure(figsize=(13,10))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap="jet")

KeyboardInterrupt: 