# Loading packages

In [37]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import seaborn as sns
from google.colab import files
!pip install xlrd
import xlrd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler, scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Titanic dataset

In [0]:
data = files.urllib.request.urlretrieve('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls', filename=None)[0]
df = pd.read_excel(data)

## Data preprocessing

Firstly, let's get rid of features that have too many nans, as was decided in EDA.

In [0]:
df.drop(columns=['cabin', 'boat', 'body', 'home.dest', 'name'], inplace=True)

In [0]:
for column in df.columns:
  if (df[column].dtypes in ['float64', 'int64']) and sum(df[column].isnull()):
    df[column].fillna(df[column].median(), inplace=True)  
  if (df[column].dtypes == 'object') and sum(df[column].isnull()):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
sex         1309 non-null object
age         1309 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1309 non-null float64
embarked    1309 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 92.1+ KB


In [33]:
df.ticket = df.ticket.str.replace(' \d+', '').str.replace('[0-9][0-9]+', '').replace([np.nan, ''], 'simple')
df.ticket.unique() # I will drop this feature due to high range of categories

array(['simple', 'PC', 'W.E.P.', 'WE/P', 'F.C.', 'P/PP', 'C.A.',
       'C.A./SOTON', 'S.O.P.', 'F.C.C.', 'SC/AH', 'W./C.', 'S.O.C.',
       'SC/PARIS', 'S.W./PP', 'W/C', 'SCO/W', 'SO/C', 'SC/AH Basle',
       'SC/Paris', 'SC', 'S.O./P.P.', 'S.C./PARIS', 'SC/A.3', 'SW/PP',
       'CA', 'SOTON/O2', 'C', 'SOTON/O.Q.', 'A/4', 'PP', 'A/5', 'A./5.',
       'A/5.', 'SOTON/OQ', 'S.P.', 'SC/A4', 'AQ/3.', 'STON/O2.', 'A..',
       'LINE', 'STON/O.', 'AQ/4', 'A/S', 'A.5.', 'A4.', 'A/4.', 'Fa',
       'S.C./A.4.', 'LP', 'CA.', 'STON/OQ.'], dtype=object)

In [0]:
df.drop(columns='ticket', inplace=True)

In [0]:
binarizer = LabelBinarizer()
df.sex = binarizer.fit_transform(df.sex)
df = pd.get_dummies(df)

In [0]:
scaler = StandardScaler()
df[['age', 'fare']] = scaler.fit_transform(df[['age', 'fare']])

In [0]:
df = pd.scaler.fit_transform(df, df.age)

In [43]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,-0.039005,0,0,3.442584,0,0,1
1,1,1,1,-2.215952,1,2,2.286639,0,0,1
2,1,0,0,-2.131977,1,2,2.286639,0,0,1
3,1,0,1,0.038512,1,2,2.286639,0,0,1
4,1,0,0,-0.349075,1,2,2.286639,0,0,1


## Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df.survived, test_size=0.3, random_state=42)

## Neural Network building

### TensorFlow

In [45]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(916, 9)
(916,)


In [0]:
learning_rate = 0.03
n_epochs = 15

X = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.random_normal([9, 1], 0.0, 0.01, tf.float32))
bias = tf.Variable(tf.zeros([1, 1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))
accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, y), tf.float32))

In [47]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape((-1, 9)), y:y_train.values[j].reshape(-1, 1)})
      total_loss += l
      pred = sess.run(predictions, {X: X_test})
      score = sess.run(accuracy, {predictions: pred, y: y_test.values.reshape(-1, 1)})
      print('Epoch {0}: {1}, Accuracy {0}: {2}'.format(i, total_loss/n_samples, score/len(pred)))
    sess.close()

Epoch 0: 0.0009389549623930819, Accuracy 0: 0.7735368956743003
Epoch 1: 0.0011231239445865415, Accuracy 1: 0.7938931297709924
Epoch 2: 0.0012035272267187527, Accuracy 2: 0.7913486005089059
Epoch 3: 0.0012388312660450498, Accuracy 3: 0.7964376590330788
Epoch 4: 0.001254234121355948, Accuracy 4: 0.7964376590330788
Epoch 5: 0.0012605677004984893, Accuracy 5: 0.8015267175572519
Epoch 6: 0.0012626537329244823, Accuracy 6: 0.8015267175572519
Epoch 7: 0.0012627413179156042, Accuracy 7: 0.8040712468193384
Epoch 8: 0.001261928456318951, Accuracy 8: 0.8040712468193384
Epoch 9: 0.0012607665561692684, Accuracy 9: 0.8040712468193384
Epoch 10: 0.0012595346400831465, Accuracy 10: 0.8040712468193384
Epoch 11: 0.0012583664931584654, Accuracy 11: 0.806615776081425
Epoch 12: 0.0012573245831452083, Accuracy 12: 0.806615776081425
Epoch 13: 0.0012564238762751416, Accuracy 13: 0.806615776081425
Epoch 14: 0.0012556632012779535, Accuracy 14: 0.806615776081425


# Thyroid dataset

In [0]:
data2 = files.urllib.request.urlretrieve('https://www.openml.org/data/get_csv/57/dataset_57_hypothyroid.arff', filename=None)[0]
df2 = pd.read_csv(data2)

## Data preprocessing

In [0]:
df2.replace('?', np.nan, inplace=True)
df2.drop(columns='TBG', inplace=True)

In [0]:
for i in df2:
  try:
    df2[i] = df2[i].astype(float)
  except:
    continue

In [61]:
df2.age[1364] = df2.age.median()
df2.age.fillna(df2.age.median(), inplace=True)
df2.sex.fillna(df2.sex.mode().iloc[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
for column in df2.columns:
  if sum(df2[column].isnull()):
    df2[column].fillna(df2[column].median(), inplace=True)

In [65]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 29 columns):
age                          3772 non-null float64
sex                          3772 non-null object
on_thyroxine                 3772 non-null object
query_on_thyroxine           3772 non-null object
on_antithyroid_medication    3772 non-null object
sick                         3772 non-null object
pregnant                     3772 non-null object
thyroid_surgery              3772 non-null object
I131_treatment               3772 non-null object
query_hypothyroid            3772 non-null object
query_hyperthyroid           3772 non-null object
lithium                      3772 non-null object
goitre                       3772 non-null object
tumor                        3772 non-null object
hypopituitary                3772 non-null object
psych                        3772 non-null object
TSH_measured                 3772 non-null object
TSH                          3772 non-null

## Train-test split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df2.drop(columns='Class'), df2.Class, test_size=0.3, stratify=df2.Class, random_state=42)

## Neural Network building

### TensorFlow

# Закодировать переменные перед построением сети и отшакалировать их

In [68]:
n_samples = X_train.shape[0]
print( X_train.shape )
print( y_train.shape )

(2640, 28)
(2640,)


In [0]:
learning_rate = 0.03
n_epochs = 15

X = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 1])
weights = tf.Variable(tf.random_normal([9, 1], 0.0, 0.01, tf.float32))
bias = tf.Variable(tf.zeros([1, 1]))
logits = tf.add(tf.matmul(X, weights), bias)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
predictions = tf.rint(tf.nn.sigmoid(logits))
accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, y), tf.float32))

In [0]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())    
    for i in range(n_epochs):
      total_loss=0
      for j in range(n_samples): 
        _, l = sess.run([optimizer, loss], feed_dict={X: X_train.values[j].reshape((-1, 9)), y:y_train.values[j].reshape(-1, 1)})
      total_loss += l
      pred = sess.run(predictions, {X: X_test})
      score = sess.run(accuracy, {predictions: pred, y: y_test.values.reshape(-1, 1)})
      print('Epoch {0}: {1}, Accuracy {0}: {2}'.format(i, total_loss/n_samples, score/len(pred)))
    sess.close()

Epoch 0: 0.0009389549623930819, Accuracy 0: 0.7735368956743003
Epoch 1: 0.0011231239445865415, Accuracy 1: 0.7938931297709924
Epoch 2: 0.0012035272267187527, Accuracy 2: 0.7913486005089059
Epoch 3: 0.0012388312660450498, Accuracy 3: 0.7964376590330788
Epoch 4: 0.001254234121355948, Accuracy 4: 0.7964376590330788
Epoch 5: 0.0012605677004984893, Accuracy 5: 0.8015267175572519
Epoch 6: 0.0012626537329244823, Accuracy 6: 0.8015267175572519
Epoch 7: 0.0012627413179156042, Accuracy 7: 0.8040712468193384
Epoch 8: 0.001261928456318951, Accuracy 8: 0.8040712468193384
Epoch 9: 0.0012607665561692684, Accuracy 9: 0.8040712468193384
Epoch 10: 0.0012595346400831465, Accuracy 10: 0.8040712468193384
Epoch 11: 0.0012583664931584654, Accuracy 11: 0.806615776081425
Epoch 12: 0.0012573245831452083, Accuracy 12: 0.806615776081425
Epoch 13: 0.0012564238762751416, Accuracy 13: 0.806615776081425
Epoch 14: 0.0012556632012779535, Accuracy 14: 0.806615776081425
