# tensor flow MLP

In [88]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

In [27]:
%store -r train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques
%store -r best_feats



### configurations
* save_plots -> True|False
* random_seed_state -> number, sets random state for model and for stratified splits 
* classify_bedrock_only -> True|False
* pickle_model -> True|False, wether model should be serialised and saved
* pickle_model_name -> string, name of serialised model
* grid_search -> True|False, if set to true then grid search is performed to identify optimum hyperparamaters for model 
* scale -> True|False if set to True then features scaled to all have mean value 0 and standard deviation 1
* pickle_file_path -> string,  filepath for serialised model to be saved to

In [28]:
save_plots = False
random_seed_state = 42
classify_bedrock_only = False
pickle_model = False
pickle_model_name = 'grouped'
pickle_file_path = '../../../model'
grid_search = True
scale = True
useBestFeats = False

### if only bedrock sites are classified then classes are label encoded, if bedrock sites alone are not being classified then the class sites would have already been label encoded in the 1 data_preproccessing notebook 

In [29]:
if classify_bedrock_only:
    train_data_formodel['class'], uniques = pd.factorize(train_data_formodel['class'])
    train_data_formodel = train_data_formodel[train_data_formodel['Geology']=='Bedrock']

### The class column is stored as the variable y 

In [92]:
y = np.array(train_data_formodel['class'])

In [73]:
y = pd.DataFrame(data = train_data_formodel['class'], columns = ['class'])

In [31]:
train_data_formodel.columns.values[9:-1]

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

### The variables identified as best by the 2 feature_selection notebook are used as features

In [36]:
if useBestFeats:
    train_data_feats = train_data_formodel[best_feats]
else:
    train_data_feats = train_data_formodel[train_data_formodel.columns.values[9:-1]]

### the dimensions of the class and features are checked

In [34]:
print(X.shape)
print(y.shape)
print(len(train_data_formodel['class'].unique()))

(1158, 53)
(1158,)
25


In [67]:
scaled_feats_df = pd.DataFrame(data = my_scaler.fit_transform(train_data_feats), columns = train_data_feats.columns.values)

Subset data into that for training and evaluation and then that for predicting and plotting to graphically evaluate model performancem

### Create tensorflow objects for the features, thes contain indexes to the feature columns within the dataframe

In [68]:
tf_column_objects = []
for col in train_data_feats.columns.values:
    tf_column_objects.append(tf.feature_column.numeric_column(col))

In [93]:
if scale:
    my_scaler = StandardScaler()
    X = np.array(my_scaler.fit_transform(train_data_feats))
else:
    X = np.array(train_data_feats)

In [74]:
y.head()

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0


In [69]:
scaled_feats_df.head()

Unnamed: 0,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,...,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238
0,1.818554,-0.651984,-1.493072,0.242674,1.472687,0.275474,-0.284403,-1.489943,0.806942,-0.133532,...,-0.264463,-0.390974,-0.504395,-0.951738,-1.236944,0.084932,-0.540698,-0.536089,-0.608341,-1.401523
1,0.995105,-0.783879,-1.699639,0.242674,1.97383,0.289635,-0.279239,-2.175133,0.204106,-0.257266,...,-0.264463,-0.390974,-0.504395,-0.587038,-0.493813,0.368419,-1.210056,-0.719445,-0.39058,-1.461231
2,2.699824,-0.915775,-1.693309,0.718602,0.257396,0.349975,-0.270839,-2.622551,-0.009259,0.020547,...,-0.613126,-0.243987,-0.504395,-0.222338,-1.236944,-0.482043,-0.540698,-0.298805,-1.043861,-1.401523
3,0.927315,2.029893,-1.567873,0.242674,2.222072,-0.303571,0.30038,2.178149,1.617655,0.783982,...,-0.264463,-0.537961,-1.206954,-0.769388,-1.236944,1.218882,-1.210056,-0.104663,-1.043861,-1.520939
4,2.23327,0.22732,-1.498826,0.235404,-0.016805,0.293065,-0.285865,-1.993681,-0.765547,0.393186,...,-0.264463,-0.537961,-0.504395,-0.404688,-0.493813,-0.482043,-0.540698,-0.158591,-0.826101,-1.162692


### Split up data for training and testing

In [94]:
X_train, X_test, y_train, y_test = train_test_split(scaled_feats_df, y, test_size=0.3)

In [90]:
X_test.head()

Unnamed: 0,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,...,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238
865,0.32518,-0.739914,-0.284742,-0.215687,1.397065,0.196186,-0.268245,1.894228,2.041198,-0.395065,...,0.0842,-0.243987,0.198164,-0.404688,0.249318,0.368419,2.136733,-0.21252,0.915981,-0.744737
249,0.020125,-0.651984,0.657755,-0.298852,0.230123,0.228334,-0.269519,-2.018666,0.267292,0.399337,...,-0.264463,-0.537961,-0.504395,-0.951738,-1.236944,-0.482043,-1.210056,0.03555,-0.608341,0.150882
304,-0.382627,-0.080436,0.054166,0.306989,0.280876,-0.16934,0.028075,-1.2975,0.553311,0.798468,...,1.478851,1.37287,0.900723,0.689412,0.249318,0.368419,0.12866,0.520904,0.04494,0.125012
710,-0.28493,0.22732,-0.593729,0.390154,0.215735,-0.059668,-0.079749,0.522483,0.666233,0.520203,...,0.432863,-0.390974,0.198164,0.142362,0.992448,-0.76553,1.467376,-0.374304,0.480461,0.927084
181,-0.464374,0.007494,1.531205,0.354888,-0.715212,-0.674931,0.140496,0.447936,-0.278376,2.598219,...,3.919491,2.842739,1.603282,3.607012,1.735579,-0.198555,-0.540698,-0.126235,0.2627,0.125012


In [91]:
y_test.head()

Unnamed: 0,class
950,21
283,5
338,7
792,17
185,4


In [83]:
X_train.head()

Unnamed: 0,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,...,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238
846,0.026107,-0.95974,0.354522,-0.354646,0.699897,0.27351,-0.271569,-0.663236,1.311635,-0.154909,...,-0.613126,-0.537961,-1.206954,-0.404688,-0.493813,0.084932,-0.540698,-0.55766,0.915981,0.270297
898,-0.731546,-0.87181,-0.584523,-0.447811,-0.653677,-0.658241,0.940053,-1.225547,-1.173692,-0.350347,...,3.919491,0.012781,2.305841,3.424662,1.735579,-0.482043,-0.540698,-0.190948,0.2627,-0.028242
833,-0.522195,1.018694,-0.585674,-0.667304,-0.252124,-2.85762,3.058914,3.58273,-0.362093,-0.101059,...,0.432863,0.637935,2.305841,0.324712,0.249318,-0.198555,0.798018,-0.352733,-0.826101,0.091174
112,-0.201189,0.271285,-1.741642,-1.158397,-1.35152,0.486827,-0.285207,-0.535988,-1.935113,-0.321327,...,0.432863,-0.537961,0.198164,-0.587038,-0.493813,0.084932,1.467376,-0.417447,-1.043861,-1.341815
736,-0.23309,-0.124401,1.328666,0.95757,-0.21884,0.47792,-0.285052,-2.190151,-0.450412,-0.547343,...,-1.310451,-0.831935,-1.206954,-0.587038,-1.236944,-0.198555,-1.210056,-0.363519,-1.043861,-0.207366


In [84]:
y_train.head()

Unnamed: 0,class
930,21
983,22
917,21
116,2
818,18


In [95]:
input_function = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train, batch_size=10,num_epochs=5, shuffle=True)
input_function

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [49]:


classifier = tf.estimator.DNNClassifier(hidden_units=[10, 200, 200, 10], n_classes= len(train_data_formodel['class'].unique()), feature_columns = tf_column_objects)



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp1014terf', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6cfbf33a20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [50]:
classifier.train(input_fn=input_function,steps=50)

INFO:tensorflow:Calling model_fn.


ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.framework.ops.Tensor'>

make predictions for test data and for data that will be used to assess model performance graphically

In [None]:
pred_fn_eval = tf.estimator.inputs.pandas_input_fn(x = X_test, batch_size = len(X_test), shuffle = False)


In [None]:
predictions = list(classifier.predict(input_fn=pred_fn_eval))

In [None]:
f1_score = f1_score(y_test, predictions)

In [None]:
print(f1_score)