# Loading packages

In [1]:
!pip install xlrd
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import os
import xlrd
from google.colab import files
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler, scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import load_boston
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers, regularizers




Using TensorFlow backend.


# Titanic dataset

In [0]:
data = files.urllib.request.urlretrieve('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls', filename=None)[0]
df = pd.read_excel(data)

## Data preprocessing

In [0]:
df.drop(columns=['cabin', 'boat', 'body', 'home.dest', 'name'], inplace=True) 
#getting rid of features that have too many nans, as was decided in EDA.

In [0]:
#na fill
for column in df.columns:
  if (df[column].dtypes in ['float64', 'int64']) and sum(df[column].isnull()):
    df[column].fillna(df[column].median(), inplace=True)  
  if (df[column].dtypes == 'object') and sum(df[column].isnull()):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null object
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1309 non-null float64
embarked    1309 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 92.1+ KB


In [6]:
df.ticket = df.ticket.str.replace(' \d+', '').str.replace('[0-9][0-9]+', '').replace([np.nan, ''], 'simple')
df.ticket.unique() # I will drop this feature due to high range of categories

array(['simple', 'PC', 'W.E.P.', 'WE/P', 'F.C.', 'P/PP', 'C.A.',
       'C.A./SOTON', 'S.O.P.', 'F.C.C.', 'SC/AH', 'W./C.', 'S.O.C.',
       'SC/PARIS', 'S.W./PP', 'W/C', 'SCO/W', 'SO/C', 'SC/AH Basle',
       'SC/Paris', 'SC', 'S.O./P.P.', 'S.C./PARIS', 'SC/A.3', 'SW/PP',
       'CA', 'SOTON/O2', 'C', 'SOTON/O.Q.', 'A/4', 'PP', 'A/5', 'A./5.',
       'A/5.', 'SOTON/OQ', 'S.P.', 'SC/A4', 'AQ/3.', 'STON/O2.', 'A..',
       'LINE', 'STON/O.', 'AQ/4', 'A/S', 'A.5.', 'A4.', 'A/4.', 'Fa',
       'S.C./A.4.', 'LP', 'CA.', 'STON/OQ.'], dtype=object)

In [0]:
df.drop(columns='ticket', inplace=True)

In [0]:
binarizer = LabelBinarizer()
df.sex = binarizer.fit_transform(df.sex)
df = pd.get_dummies(df)

In [0]:
scaler = StandardScaler()
df[['age', 'fare']] = scaler.fit_transform(df[['age', 'fare']])

In [10]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,-0.039005,0,0,3.442584,0,0,1
1,1,1,1,-2.215952,1,2,2.286639,0,0,1
2,1,0,0,-2.131977,1,2,2.286639,0,0,1
3,1,0,1,0.038512,1,2,2.286639,0,0,1
4,1,0,0,-0.349075,1,2,2.286639,0,0,1


## Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df.survived, test_size=0.3, random_state=42)

## Neural Network building

### TensorFlow

In [12]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(916, 9)
(916,)


In [0]:
learning_rate = 0.01
n_epochs = 10

X = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.random_normal([9, 1], 0.0, 0.01, tf.float32))
bias = tf.Variable(tf.zeros([1, 1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.losses.sigmoid_cross_entropy(y, logits)
      #tf.losses.hinge_loss(logits=logits, labels=y)
      #tf.reduce_mean(tf.square(1-logits)) 
      #tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))
accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, y), tf.float32))

In [14]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape((-1, 9)), y:y_train.values[j].reshape(-1, 1)})
      total_loss += l
      pred = sess.run(predictions, {X: X_test})
      score = sess.run(accuracy, {predictions: pred, y: y_test.values.reshape(-1, 1)})
      print('Epoch {0}: {1}, Accuracy {0}: {2}'.format(i, total_loss/n_samples, score/len(pred)))
    sess.close()

Epoch 0: 0.0008732878224818467, Accuracy 0: 0.7888040712468194
Epoch 1: 0.001092605976038104, Accuracy 1: 0.7913486005089059
Epoch 2: 0.0011802422427714652, Accuracy 2: 0.7888040712468194
Epoch 3: 0.0012103497461460563, Accuracy 3: 0.7938931297709924
Epoch 4: 0.0012170357475114182, Accuracy 4: 0.7938931297709924
Epoch 5: 0.0012150733491739331, Accuracy 5: 0.7964376590330788
Epoch 6: 0.0012105845207730755, Accuracy 6: 0.7964376590330788
Epoch 7: 0.0012059188305550788, Accuracy 7: 0.8015267175572519
Epoch 8: 0.0012018498374905648, Accuracy 8: 0.8015267175572519
Epoch 9: 0.0011985296765789715, Accuracy 9: 0.8015267175572519


### Keras

In [15]:
adam = optimizers.Adam(lr=0.03)
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(9,)))
model.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=1, epochs=10,
          validation_data = (X_test, y_test))

Train on 916 samples, validate on 393 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5c26cd7f28>

# Thyroid dataset

In [0]:
data2 = files.urllib.request.urlretrieve('https://www.openml.org/data/get_csv/57/dataset_57_hypothyroid.arff', filename=None)[0]
df2 = pd.read_csv(data2)

## Data preprocessing

In [0]:
df2.replace('?', np.nan, inplace=True)
df2.drop(columns=['TBG', 'TBG_measured'], inplace=True)

In [0]:
for i in df2:
  try:
    df2[i] = df2[i].astype(float)
  except:
    continue

In [19]:
df2.age[1364] = df2.age.median()
df2.age.fillna(df2.age.median(), inplace=True)
df2.sex.fillna(df2.sex.mode().iloc[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
for column in df2.columns:
  if sum(df2[column].isnull()):
    df2[column].fillna(df2[column].median(), inplace=True)

In [21]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 28 columns):
age                          3772 non-null float64
sex                          3772 non-null object
on_thyroxine                 3772 non-null object
query_on_thyroxine           3772 non-null object
on_antithyroid_medication    3772 non-null object
sick                         3772 non-null object
pregnant                     3772 non-null object
thyroid_surgery              3772 non-null object
I131_treatment               3772 non-null object
query_hypothyroid            3772 non-null object
query_hyperthyroid           3772 non-null object
lithium                      3772 non-null object
goitre                       3772 non-null object
tumor                        3772 non-null object
hypopituitary                3772 non-null object
psych                        3772 non-null object
TSH_measured                 3772 non-null object
TSH                          3772 non-null

In [0]:
scale = StandardScaler()
df2[['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']] = scale.fit_transform(df2[['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']])

In [0]:
binarizer = LabelBinarizer()
encoder = LabelEncoder()
df2.Class = encoder.fit_transform(df2.Class)
for column in df2.columns:
  if len(df2[column].unique()) == 2:
    #print( column )
    df2[column] = binarizer.fit_transform(df2[column])
 

In [24]:
encoder.classes_

array(['compensated_hypothyroid', 'negative', 'primary_hypothyroid',
       'secondary_hypothyroid'], dtype=object)

In [0]:
df2 = pd.get_dummies(df2, columns=['referral_source'])

In [26]:
df2.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T4U_measured,T4U,FTI_measured,FTI,Class,referral_source_STMW,referral_source_SVHC,referral_source_SVHD,referral_source_SVI,referral_source_other
0,-0.560204,0,0,0,0,0,0,0,0,0,...,1,0.791301,1,-0.035561,1,0,1,0,0,0
1,-1.508795,0,0,0,0,0,0,0,0,0,...,0,-0.072687,0,-0.09932,1,0,0,0,0,1
2,-0.296707,1,0,0,0,0,0,0,0,0,...,1,-0.450682,1,0.315109,1,0,0,0,0,1
3,0.968081,0,1,0,0,0,0,0,0,0,...,0,-0.072687,0,-0.09932,1,0,0,0,0,1
4,0.968081,0,0,0,0,0,0,0,0,0,...,1,-0.666679,1,-1.278846,1,0,0,0,1,0


## Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df2.drop(columns='Class'), df2.Class, test_size=0.3, stratify=df2.Class, random_state=42)

In [0]:
X_train['y'] = y_train

In [0]:
major_class_indicies = X_train[X_train.y == 1].index
X_train = X_train.drop(np.random.choice(major_class_indicies, int(len(major_class_indicies)/3), replace=False), axis=0) #undersampling
y_train = X_train.y
X_train = X_train.drop(columns='y')

In [0]:
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

## Neural Network building

### TensorFlow

In [31]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(1828, 31)
(1828, 4)


In [0]:
learning_rate = 0.03
n_epochs = 10

X = tf.placeholder(tf.float32, [None, 31])
y = tf.placeholder(tf.float32, [None, 4])
weights = tf.Variable(tf.random_normal([31, 4], 0.0, 0.01, tf.float32))
bias = tf.Variable(tf.zeros([1, 4]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.losses.softmax_cross_entropy(y, logits=logits)
#tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
 
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.nn.softmax(logits)
correct_preds = tf.reduce_sum(tf.cast(tf.equal(tf.argmax(predictions, 1), tf.argmax(y, 1)), tf.float32))
#accuracy = tf.reduce_sum(correct_preds)

In [33]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape(-1, 31), y:y_train.values[j].reshape(-1, 4)})
        total_loss += l
      pred = sess.run(predictions, {X: X_test})
      score = sess.run(correct_preds, {predictions: pred, y:y_test})
      #score = sess.run(accuracy, {correct_preds: correct_preds})
      print('Epoch {0}: {1}, Accuracy {0}: {2}, ROC_AUC {0}: {3}'.format(i, total_loss/n_samples, score/len(pred), roc_auc_score(y_test, pred)))
    sess.close()

Epoch 0: 0.2939386291159237, Accuracy 0: 0.941696113074205, ROC_AUC 0: 0.9228878918125631
Epoch 1: 0.22031901383214622, Accuracy 1: 0.941696113074205, ROC_AUC 1: 0.9427000630326413
Epoch 2: 0.20069680912859886, Accuracy 2: 0.9434628975265018, ROC_AUC 2: 0.9511429565387326
Epoch 3: 0.1900917276793822, Accuracy 3: 0.9443462897526502, ROC_AUC 3: 0.9547014509564062
Epoch 4: 0.18320103781981312, Accuracy 4: 0.9425795053003534, ROC_AUC 4: 0.9567583536878133
Epoch 5: 0.1782875675085322, Accuracy 5: 0.9434628975265018, ROC_AUC 5: 0.9597464981138175
Epoch 6: 0.17456887747156347, Accuracy 6: 0.9434628975265018, ROC_AUC 6: 0.9618964119584356
Epoch 7: 0.17163118717408857, Accuracy 7: 0.9443462897526502, ROC_AUC 7: 0.9627270605388619
Epoch 8: 0.16923375740545576, Accuracy 8: 0.946113074204947, ROC_AUC 8: 0.9642586205752788
Epoch 9: 0.16722677665668667, Accuracy 9: 0.9469964664310954, ROC_AUC 9: 0.9650779588900544


### Keras

In [34]:
adam = optimizers.Adam(lr=0.03)
model = Sequential()
model.add(Dense(4, activation='softmax', input_shape=(31,)))
model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=1, epochs=10,
          validation_data = (X_test, y_test))

Train on 1828 samples, validate on 1132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5c26d71860>

In [35]:
roc_auc_score(y_test, model.predict(X_test))

0.8979196505503092

# Boston houses dataset

In [0]:
data = load_boston()
df_feat = pd.DataFrame(data.data, columns=data.feature_names)
df_targ = pd.DataFrame(data.target)

In [37]:
print( df_feat.shape )
print( df_targ.shape )

(506, 13)
(506, 1)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(df_feat, df_targ, test_size=0.3, random_state=42)

### Simple NN

In [40]:
model = Sequential()
model.add(Dense(13, activation='relu', kernel_initializer='normal', input_shape=(13,)))
model.add(Dense(1, kernel_initializer='normal', input_shape=(13,)))
model.compile('adam', loss='mean_squared_error', metrics=['mae'])

model.fit(X_train, y_train,
          batch_size=1, epochs=15,
          validation_data=(X_test, y_test))

Train on 354 samples, validate on 152 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f5c23d92208>

### Multi-layer NN

In [41]:
model = Sequential()
model.add(Dense(13, activation='elu', kernel_initializer='normal', activity_regularizer=regularizers.l2(0.02), input_shape=(13,)))
model.add(Dense(5, activation='elu', kernel_initializer='normal', activity_regularizer=regularizers.l2(0.02), input_shape=(13,)))
model.add(Dense(3, activation='relu', kernel_initializer='normal', activity_regularizer=regularizers.l2(0.02), input_shape=(5,)))
model.add(Dense(1, kernel_initializer='normal', activity_regularizer=regularizers.l2(0.02), input_shape=(3,)))
model.compile('adam', loss='mean_squared_error', metrics=['mae'])

model.fit(X_train, y_train,
          batch_size=1, epochs=20,
          validation_data=(X_test, y_test))

Train on 354 samples, validate on 152 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f5c23990550>

Due to MAE metric, multi-layer NN outperforms better, although requires more training time