In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit,train_test_split

import keras
import tensorflow as tf
from keras.models import Sequential,load_model,Model
from keras.optimizers import *
from keras.utils import to_categorical
from keras.layers import *
from keras.callbacks import *
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
# Create an explicit TensorFlow 1.x session; it is handed to Keras below.
sess = tf.Session()


# Input data files are available in the "Data/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("Data"))
# Private Keras backend helper; its return value is discarded here.
# NOTE(review): presumably called to probe for GPUs - confirm it is still needed.
K.tensorflow_backend._get_available_gpus()
# Make Keras use the session created above.
K.set_session(sess)

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['y_train.csv', 'sample_submission.csv', 'X_train.csv', 'X_test.csv']


In [2]:
# Read the three competition CSVs from Data/ into DataFrames.
xtrain = pd.read_csv("Data/X_train.csv")
ytrain = pd.read_csv("Data/y_train.csv")
test = pd.read_csv("Data/X_test.csv")

In [3]:
### feature extraction of orientation, angular_velocity, linear_acceleration, velocity_to_acceleration and velocity_linear_acceleration
def feature_extraction(raw_frame):
    """Add summed-axis and interaction columns to ``raw_frame`` and return it.

    The frame is modified in place and the same object is returned. It must
    contain ``orientation_{X,Y,Z,W}``, ``angular_velocity_{X,Y,Z}`` and
    ``linear_acceleration_{X,Y,Z}``.

    Columns added:
        orientation                  -- orientation_X + _Y + _Z + _W
        angular_velocity             -- angular_velocity_X + _Y + _Z
        linear_acceleration          -- linear_acceleration_X + _Y + _Z
        velocity_to_acceleration     -- angular_velocity / linear_acceleration
        velocity_linear_acceleration -- linear_acceleration * angular_velocity
    """
    raw_frame['orientation'] = (
        raw_frame['orientation_X']
        + raw_frame['orientation_Y']
        + raw_frame['orientation_Z']
        + raw_frame['orientation_W']
    )
    raw_frame['angular_velocity'] = (
        raw_frame['angular_velocity_X']
        + raw_frame['angular_velocity_Y']
        + raw_frame['angular_velocity_Z']
    )
    # Fixed: the Z axis was missing and the Y axis was counted twice.
    raw_frame['linear_acceleration'] = (
        raw_frame['linear_acceleration_X']
        + raw_frame['linear_acceleration_Y']
        + raw_frame['linear_acceleration_Z']
    )
    # No guard against a zero denominator; pandas yields inf/NaN in that case.
    raw_frame['velocity_to_acceleration'] = (
        raw_frame['angular_velocity'] / raw_frame['linear_acceleration']
    )
    raw_frame['velocity_linear_acceleration'] = (
        raw_frame['linear_acceleration'] * raw_frame['angular_velocity']
    )
    return raw_frame

In [4]:
# Add the derived sensor columns to both the training and the test readings.
xtrain, test = [feature_extraction(readings) for readings in (xtrain, test)]

In [5]:
### more feature extraction with mean, mode, std, variance, min, max and so on...

def feature_extraction_more(raw_frame):
    """Collapse each signal column into per-``series_id`` summary statistics.

    Every column from position 3 onwards is aggregated; the first three
    columns are skipped (presumably identifier columns - verify against the
    input CSV). The result has one row per ``series_id`` and, for each
    aggregated column ``c``, the columns ``c_mean``, ``c_std``, ``c_var``,
    ``c_sem``, ``c_max``, ``c_min``, ``c_max_to_min`` (max / min),
    ``c_max_minus_min``, ``c_std_to_var`` (the product std * var, despite the
    name), ``c_mean_abs_change`` (mean absolute first difference) and
    ``c_abs_max`` (largest absolute value).

    ``c_max_to_min`` is not guarded against a zero minimum.
    """
    # Group once and reuse it: the grouping does not depend on the column,
    # so rebuilding it for every statistic was redundant work.
    grouped = raw_frame.groupby('series_id')
    frame = pd.DataFrame([])
    for col in raw_frame.columns[3:]:
        signal = grouped[col]
        frame[col + '_mean'] = signal.mean()
        frame[col + '_std'] = signal.std()
        frame[col + '_var'] = signal.var()
        frame[col + '_sem'] = signal.sem()
        frame[col + '_max'] = signal.max()
        frame[col + '_min'] = signal.min()
        frame[col + '_max_to_min'] = frame[col + '_max'] / frame[col + '_min']
        frame[col + '_max_minus_min'] = frame[col + '_max'] - frame[col + '_min']
        frame[col + '_std_to_var'] = frame[col + '_std'] * frame[col + '_var']
        frame[col + '_mean_abs_change'] = signal.apply(lambda x: np.mean(np.abs(np.diff(x))))
        frame[col + '_abs_max'] = signal.apply(lambda x: np.max(np.abs(x)))
    return frame

In [6]:
# Aggregate each series of readings into one row of summary statistics.
train_df, test_df = [feature_extraction_more(readings) for readings in (xtrain, test)]

In [7]:
# Report (rows, columns) of both aggregated feature tables.
for caption, table in (("train shape", train_df), ("test shape", test_df)):
    print(caption, table.shape)

train shape (3810, 165)
test shape (3816, 165)


In [8]:
# Standardise the features with statistics learned from the training set only.
scaler = preprocessing.StandardScaler()
train_df = scaler.fit_transform(train_df)
# Reuse the training mean/std for the test set. Re-fitting on the test data
# (as the original fit_transform did) would put the two sets on different scales.
test_df = scaler.transform(test_df)

In [9]:
### label encoding
# Replace the surface names with integer codes (the column is overwritten in
# place), then one-hot encode those codes.
le = preprocessing.LabelEncoder()
ytrain['surface'] = le.fit_transform(ytrain['surface'])
train_label = to_categorical(ytrain['surface'])
train_label.shape

(3810, 9)

In [10]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    train_df,
    train_label,
    test_size=0.10,
    random_state=14,
)
tuple(part.shape for part in (train_x, val_x, train_y, val_y))

((3429, 165), (381, 165), (3429, 9), (381, 9))

In [11]:
# Append a trailing axis of size 1 so every sample is (n_features, 1),
# matching the input_shape given to the LSTM model.
train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], 1))
val_x = val_x.reshape((val_x.shape[0], val_x.shape[1], 1))
test_df = test_df.reshape((test_df.shape[0], test_df.shape[1], 1))

In [12]:
# Confirm the new 3-D shapes.
tuple(arr.shape for arr in (train_x, val_x, test_df))

((3429, 165, 1), (381, 165, 1), (3816, 165, 1))

In [13]:
# Input width (feature columns) and output width (one-hot label columns)
# used by the network definitions.
nb_features, nb_out = train_df.shape[1], train_label.shape[1]
(nb_features, nb_out)

(165, 9)

In [14]:
# https://www.kaggle.com/ist597/simple-keras-lstm-classifier-98-74
# Recurrent classifier over the aggregated features: each sample is fed as a
# sequence of nb_features steps carrying a single value per step.
model = Sequential()
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, input_shape=((nb_features), 1)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
# One softmax unit per one-hot label column.
model.add(Dense(nb_out, activation='softmax'))

model.summary()

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               66560     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 70,985
Trainable params: 70,985
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the LSTM for a fixed 10 epochs, scoring the held-out split after each one.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(val_x, val_y),
    epochs=10,
    batch_size=32,
    verbose=1,
)

Train on 3429 samples, validate on 381 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Pick the most probable class for each test row, map the integer codes back
# to surface names and write the submission file.
prediction = model.predict(test_df).argmax(axis=1)
submission = pd.read_csv("Data/sample_submission.csv").assign(
    surface=le.inverse_transform(prediction)
)
submission.to_csv('lstm_38.csv', index=False)

In [17]:
# Drop the trailing axis again: the dense network takes flat
# (n_samples, n_features) input.
train_x = train_x.reshape((train_x.shape[0], train_x.shape[1]))
val_x = val_x.reshape((val_x.shape[0], val_x.shape[1]))
test_df = test_df.reshape((test_df.shape[0], test_df.shape[1]))
tuple(arr.shape for arr in (train_x, val_x, test_df))

((3429, 165), (381, 165), (3816, 165))

In [18]:
## https://www.kaggle.com/kabure/titanic-eda-keras-nn-pipelines
## Fully connected classifier over the flat feature vectors
model = Sequential()

# First hidden layer; input_dim fixes the expected number of features
model.add(Dense(165, 
                activation='relu',  
                input_dim = nb_features,
                kernel_initializer='uniform'))

# Dropout layer to help prevent overfitting
model.add(Dropout(0.50))

# Second hidden layer
model.add(Dense(60,
                kernel_initializer='uniform',
                activation='relu'))

# Another Dropout layer
model.add(Dropout(0.50))

# Output layer: softmax over the nb_out classes (multi-class, not binary)
model.add(Dense(nb_out, activation='softmax'))

# Print the layer summary
model.summary()

sgd = SGD(lr = 0.01, momentum = 0.9)

# Compile with categorical cross-entropy to match the one-hot targets
model.compile(optimizer = sgd, 
                   loss = 'categorical_crossentropy', 
                   metrics = ['accuracy'])
# Stop once val_loss has not improved for 5 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
# Keep only the weights with the lowest val_loss on disk.
# NOTE(review): the file is named 'cnn.hdf' although this network has no convolutional layers.
save_best = ModelCheckpoint('cnn.hdf', save_best_only=True, 
                               monitor='val_loss', mode='min')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 165)               27390     
_________________________________________________________________
dropout_2 (Dropout)          (None, 165)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 60)                9960      
_________________________________________________________________
dropout_3 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 9)                 549       
Total params: 37,899
Trainable params: 37,899
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train for at most 50 epochs; the callbacks stop early on stalled val_loss
# and checkpoint the best epoch.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(val_x, val_y),
    callbacks=[early_stopping, save_best],
    epochs=50,
    batch_size=32,
    verbose=1,
)

Train on 3429 samples, validate on 381 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


In [20]:
# Pick the most probable class for each test row, map the integer codes back
# to surface names and write the submission file.
prediction = model.predict(test_df).argmax(axis=1)
submission = pd.read_csv("Data/sample_submission.csv").assign(
    surface=le.inverse_transform(prediction)
)
submission.to_csv('cnn_74.csv', index=False)

In [21]:
# Restore the trailing axis of size 1 so every sample is (n_features, 1)
# for the model with the pooling and GRU layers.
train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], 1))
val_x = val_x.reshape((val_x.shape[0], val_x.shape[1], 1))
test_df = test_df.reshape((test_df.shape[0], test_df.shape[1], 1))

In [22]:
## Dense + pooling + GRU classifier over (nb_features, 1) inputs
model = Sequential()

# First Dense layer acts on the last axis, giving 165 units per step
model.add(Dense(165,activation='relu',input_shape = (nb_features,1),kernel_initializer='uniform'))
# Halve the number of steps
model.add(MaxPooling1D(pool_size=2))
# Dropout layer to help prevent overfitting
model.add(Dropout(0.50))
# Second per-step hidden layer
model.add(Dense(128,kernel_initializer='uniform',activation='relu'))
# Another Dropout layer
model.add(Dropout(0.50))
# GRU collapses the step axis into a single 64-unit vector per sample
model.add(GRU(64))
model.add(Dropout(0.50))
model.add(Dense(32,kernel_initializer='uniform',activation='relu'))
model.add(Dropout(0.50))
# Output layer: softmax over the nb_out classes (multi-class, not binary)
model.add(Dense(nb_out, activation='softmax'))
# Print the layer summary
model.summary()
sgd = SGD(lr = 0.01, momentum = 0.9)
# Compile with categorical cross-entropy to match the one-hot targets
model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
# Stop once val_loss has not improved for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')
# NOTE(review): 'cnn.hdf' is the same path the earlier dense model checkpoints to,
# so this run overwrites that file - consider a distinct file name.
save_best = ModelCheckpoint('cnn.hdf', save_best_only=True,monitor='val_loss', mode='min')



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 165, 165)          330       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 82, 165)           0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 82, 165)           0         
_________________________________________________________________
dense_7 (Dense)              (None, 82, 128)           21248     
_________________________________________________________________
dropout_5 (Dropout)          (None, 82, 128)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
__________

In [23]:
# Train for at most 10 epochs; the callbacks stop early on stalled val_loss
# and checkpoint the best epoch.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(val_x, val_y),
    callbacks=[early_stopping, save_best],
    epochs=10,
    batch_size=32,
    verbose=1,
)

Train on 3429 samples, validate on 381 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Pick the most probable class for each test row, map the integer codes back
# to surface names and write the submission file.
prediction = model.predict(test_df).argmax(axis=1)
submission = pd.read_csv("Data/sample_submission.csv").assign(
    surface=le.inverse_transform(prediction)
)
submission.to_csv('gru_33.csv', index=False)