This project is part of Applications of Deep Neural Networks from Washington University coursework. The goal is to predict missing data values in columns a2 and a14 using Deep Neural Networks.

In [30]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("https://data.heatonresearch.com/data/t81-558/crx.csv",na_values=['?'])

In [3]:
data.head()

Unnamed: 0,a1,a2,s3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [4]:
data.isna().sum()

a1     12
a2     12
s3      0
a4      6
a5      6
a6      9
a7      9
a8      0
a9      0
a10     0
a11     0
a12     0
a13     0
a14    13
a15     0
a16     0
dtype: int64

In [5]:
dummies = pd.get_dummies(data[['a9', 'a10', 'a12', 'a13']])
dummies.shape

(690, 9)

In [6]:
from scipy.stats import zscore

data['s3'] = zscore(data['s3'])
data['a8'] = zscore(data['a8'])
data['a11'] = zscore(data['a11'])
data['a15'] = zscore(data['a15'])

In [7]:
data1 = data[['a2', 's3', 'a8', 'a11', 'a15']]
data1

Unnamed: 0,a2,s3,a8,a11,a15
0,30.83,-0.956613,-0.291083,-0.288101,-0.195413
1,58.67,-0.060051,0.244190,0.740830,-0.087852
2,24.50,-0.856102,-0.216324,-0.493887,-0.037144
3,27.83,-0.647038,0.456505,0.535044,-0.194837
4,20.17,0.174141,-0.153526,-0.493887,-0.195413
...,...,...,...,...,...
685,21.08,1.070704,-0.291083,-0.493887,-0.195413
686,22.67,-0.805846,-0.066806,-0.082314,-0.119736
687,25.25,1.757198,-0.066806,-0.288101,-0.195221
688,17.92,-0.915403,-0.652915,-0.493887,-0.051358


In [8]:
df = pd.concat([data1, dummies], axis=1)
df.shape

(690, 14)

In [9]:
train = df.dropna()
test = df[df.isna().any(1)]

In [10]:
cols = df.columns.drop('a2')
x = train[cols].values
y = train['a2'].values

In [11]:
X_test = test[cols].values

In [12]:
x_train, x_valid, y_train, y_valid = train_test_split(    
    x, y, test_size=0.25, random_state=42)

In [13]:
x_train[0]

array([0.45858887, 3.52161668, 0.74082993, 0.06042927, 0.        ,
       1.        , 0.        , 1.        , 0.        , 1.        ,
       1.        , 0.        , 0.        ])

In [14]:
model = Sequential([
                    Dense(13, input_dim=x.shape[1], activation='relu'),
                    Dense(13, activation="relu"),
                    Dense(1)
])

In [15]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto', 
                        restore_best_weights=True)

In [16]:
model.compile(loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()], optimizer='adam')
model.fit(x_train,y_train,validation_data=(x_valid,y_valid), callbacks=[monitor],
          verbose=2,epochs=1000)

Epoch 1/1000
16/16 - 1s - loss: 1136.8246 - root_mean_squared_error: 33.7168 - val_loss: 1109.0538 - val_root_mean_squared_error: 33.3025 - 1s/epoch - 84ms/step
Epoch 2/1000
16/16 - 0s - loss: 1120.4127 - root_mean_squared_error: 33.4726 - val_loss: 1091.6945 - val_root_mean_squared_error: 33.0408 - 86ms/epoch - 5ms/step
Epoch 3/1000
16/16 - 0s - loss: 1103.1393 - root_mean_squared_error: 33.2135 - val_loss: 1072.3610 - val_root_mean_squared_error: 32.7469 - 125ms/epoch - 8ms/step
Epoch 4/1000
16/16 - 0s - loss: 1082.8975 - root_mean_squared_error: 32.9074 - val_loss: 1048.4396 - val_root_mean_squared_error: 32.3796 - 102ms/epoch - 6ms/step
Epoch 5/1000
16/16 - 0s - loss: 1056.5499 - root_mean_squared_error: 32.5046 - val_loss: 1017.5302 - val_root_mean_squared_error: 31.8988 - 84ms/epoch - 5ms/step
Epoch 6/1000
16/16 - 0s - loss: 1021.7975 - root_mean_squared_error: 31.9656 - val_loss: 976.2869 - val_root_mean_squared_error: 31.2456 - 101ms/epoch - 6ms/step
Epoch 7/1000
16/16 - 0s - l

<keras.callbacks.History at 0x7f621f874550>

In [17]:
preds = model.predict(X_test)

In [18]:
test.dropna(axis=1, inplace=True)
test['a2'] = preds.reshape(-1)
test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a2
83,-0.253033,0.232229,-0.493887,-0.195413,0,1,1,0,0,1,1,0,0,32.24007
86,-0.88123,-0.403221,-0.493887,-0.195413,0,1,1,0,0,1,0,0,1,28.364706
92,0.048502,1.876923,-0.493887,-0.195413,0,1,1,0,1,0,1,0,0,38.379448
97,-0.856102,-0.415182,-0.493887,-0.195413,0,1,1,0,0,1,0,0,1,28.328682
254,-0.830974,-0.590118,-0.493887,0.190655,1,0,1,0,1,0,1,0,0,28.262863
286,-0.655079,-0.664877,-0.082314,-0.175246,1,0,0,1,0,1,1,0,0,27.602304
329,-0.152521,-0.639459,-0.493887,-0.195413,1,0,1,0,0,1,1,0,0,28.456648
445,1.304896,-0.664877,-0.493887,0.803372,1,0,1,0,1,0,1,0,0,27.543591
450,-0.353544,1.42837,-0.493887,-0.195221,1,0,1,0,1,0,1,0,0,35.201546
500,-0.152521,0.8303,0.123472,0.242323,0,1,0,1,0,1,1,0,0,33.963375


In [19]:
data2 = data[['s3', 'a8', 'a11', 'a15', 'a14']]
df1 = pd.concat([data2, dummies], axis=1)
train1 = df1.dropna()
test1 = df1[df1.isna().any(1)]
cols = df1.columns.drop('a14')
x = train1[cols].values
y = train1['a14'].values

X_test1 = test1[cols].values
x_train, x_valid, y_train, y_valid = train_test_split(    
    x, y, test_size=0.25, random_state=42)


In [20]:
model = Sequential([
                    Dense(20, input_dim=x.shape[1], activation='relu'),
                    Dense(10, activation="relu"),
                    #Dense(13, activation="relu"),
                    Dense(1)
])

In [21]:
model.compile(loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()], optimizer='adam')
model.fit(x_train,y_train,validation_data=(x_valid,y_valid), callbacks=[monitor],
          verbose=2,epochs=1000)

Epoch 1/1000
16/16 - 1s - loss: 56391.4922 - root_mean_squared_error: 237.4689 - val_loss: 86070.1172 - val_root_mean_squared_error: 293.3771 - 586ms/epoch - 37ms/step
Epoch 2/1000
16/16 - 0s - loss: 56266.9414 - root_mean_squared_error: 237.2065 - val_loss: 85912.0156 - val_root_mean_squared_error: 293.1075 - 58ms/epoch - 4ms/step
Epoch 3/1000
16/16 - 0s - loss: 56118.1406 - root_mean_squared_error: 236.8927 - val_loss: 85718.0547 - val_root_mean_squared_error: 292.7765 - 46ms/epoch - 3ms/step
Epoch 4/1000
16/16 - 0s - loss: 55923.4023 - root_mean_squared_error: 236.4813 - val_loss: 85465.8438 - val_root_mean_squared_error: 292.3454 - 51ms/epoch - 3ms/step
Epoch 5/1000
16/16 - 0s - loss: 55651.1406 - root_mean_squared_error: 235.9049 - val_loss: 85097.7891 - val_root_mean_squared_error: 291.7152 - 64ms/epoch - 4ms/step
Epoch 6/1000
16/16 - 0s - loss: 55248.1484 - root_mean_squared_error: 235.0492 - val_loss: 84568.1562 - val_root_mean_squared_error: 290.8061 - 48ms/epoch - 3ms/step
Ep

<keras.callbacks.History at 0x7f621b1148d0>

In [22]:
preds1 = model.predict(X_test1)
preds1

array([[223.79079 ],
       [176.51859 ],
       [170.7066  ],
       [ 44.698105],
       [170.7066  ],
       [ 95.64273 ],
       [170.7066  ],
       [136.54079 ],
       [ 86.96582 ],
       [170.7066  ],
       [170.7066  ],
       [170.7066  ],
       [174.12695 ]], dtype=float32)

In [23]:
test1.dropna(axis=1, inplace=True)
test1['a14'] = preds1.reshape(-1)
test1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,s3,a8,a11,a15,a9_f,a9_t,a10_f,a10_t,a12_f,a12_t,a13_g,a13_p,a13_s,a14
71,-0.152521,3.073064,-0.493887,-0.195413,0,1,1,0,0,1,1,0,0,223.790787
202,-0.4038,0.007953,0.74083,-0.080169,0,1,0,1,1,0,1,0,0,176.518585
206,-0.956613,-0.664877,-0.493887,-0.195413,1,0,1,0,1,0,0,1,0,170.706604
243,0.551059,0.145509,0.535044,4.937957,0,1,0,1,1,0,1,0,0,44.698105
270,-0.956613,-0.664877,-0.493887,-0.195413,1,0,1,0,1,0,0,1,0,170.706604
278,1.757198,-0.664877,-0.493887,-0.195413,1,0,1,0,1,0,1,0,0,95.642731
330,-0.956613,-0.664877,-0.493887,-0.195413,1,0,1,0,1,0,0,1,0,170.706604
406,0.676699,-0.615536,-0.082314,-0.191956,1,0,0,1,1,0,1,0,0,136.540787
445,1.304896,-0.664877,-0.493887,0.803372,1,0,1,0,1,0,1,0,0,86.96582
456,-0.956613,-0.664877,-0.493887,-0.195413,1,0,1,0,1,0,0,1,0,170.706604


In [25]:
data.a2.fillna({83:34.329376, 86:33.027035, 92:37.63483 , 97:33.005493, 254:27.247265, 286:27.697863,
       329:27.923702, 445:25.383657, 450:31.033077, 500:34.09424 , 515:36.581947, 608:30.02712}, inplace=True)

data.a14.fillna({71:217.49971 , 202:175.8793  , 206:167.94762 ,  243:65.30137 , 270:167.94762 ,
       278:101.31148 , 330:167.94762 , 406:144.16798 , 445:102.327545, 456:167.94762 ,
       592:167.94762 , 622:167.94762 , 626:174.23894}, inplace=True)

In [29]:
data.isna().sum()

a1     12
a2      0
s3      0
a4      6
a5      6
a6      9
a7      9
a8      0
a9      0
a10     0
a11     0
a12     0
a13     0
a14     0
a15     0
a16     0
dtype: int64