In [None]:

import os

import imageio
import pandas as pd
from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np
import keras.utils
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.model_selection import train_test_split


In [19]:
data_dir = "../data"

# Read train.csv
# TODO: rm n-rows
train = pd.read_csv(os.path.join(data_dir, "train.csv"), sep=',')
pet_ids = train["PetID"]

# Read 1 image per pet
# TODO: there are more than 1 image per pet in the folder.
# image_paths = [os.path.join(data_dir, "train_images", pet_id + "-1.jpg") for pet_id in pet_ids]
# images = [imageio.imread(path) if os.path.isfile(path) else None for path in tqdm(image_paths)]
# train = train.assign(image_1=images)


# "Age"
# "Fee"
selected_columns = ["Type",
                    "Breed1",
                    "Breed2",
                    "Gender",
                    "Color1",
                    "Color2",
                    "Color3",
                    "MaturitySize",
                    "FurLength",
                    "Vaccinated",
                    "Dewormed",
                    "Sterilized",
                    "Health",
                    "State",
                    "MaturitySize"]

# "VideoAmt"
# "PhotoAmt"
# "Quantity"

label_column = "AdoptionSpeed"




### Adding sentiment

1) Given pet_id, we read sentiment analysis results file

2) From the file we extract sentiment magnitude and score


In [22]:
# Sentiment reading

sentiment_train_dir = "../data/train_sentiment"

import json

magnitudes = []
scores = []

for pet_id in pet_ids:
    file_name = "{}.json".format(pet_id)
    file_loc = os.path.join(sentiment_train_dir, file_name)
    # check if there exists sentiment analysis for the pet, else set scores 0.0
    if os.path.isfile(file_loc):
        with open(file_loc) as json_file:  
            data = json.load(json_file)
            magnitude = data["documentSentiment"]["magnitude"]
            score = data["documentSentiment"]["score"]
    else:
        magnitude = 0.0
        score = 0.0

    magnitudes.append(magnitude)
    scores.append(score)

    
train["sent_magnitude"] = magnitudes
train["sent_score"] = score
    

In [23]:
y = train[label_column]

X = pd.get_dummies(train[selected_columns], columns=selected_columns)

# Normalize age:
#to_normalize = ["Age", "Fee", "Quantity"] # Without sentiment score
to_normalize = ["Age", "Fee", "Quantity", "sent_magnitude", "sent_score"] # With sentiment score
for to_norm in to_normalize:
    X[to_norm] = (train[to_norm] - train[to_norm].mean()) / train[to_norm].std()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

inputs = Input(shape=(len(list(X_train)),))
x = Dense(150, activation='relu')(inputs)

#x = BatchNormalization()(x)

x = Dense(250, activation='relu')(x)
x = Dense(350, activation='relu')(x)
x = Dense(450, activation='relu')(x)
x = Dense(700, activation='relu')(x)
x = Dense(500)(x)
predictions = Dense(5, activation='softmax')(x)

model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, keras.utils.to_categorical(y_train),
          batch_size=1200,
          epochs=15,
          validation_split=0.04,

          callbacks=[EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=5,
                                   verbose=0, mode='auto')])

test_predictions = model.predict(X_test)
test_predictions = [np.argmax(pred) for pred in test_predictions]
print(test_predictions)

train_pred = [np.argmax(pred) for pred in model.predict(X_train)]

print("Kappa on train: {}".format(round(cohen_kappa_score(y_train, train_pred, weights="quadratic"), 4)))
print("Accuracy on train: {}".format(round(accuracy_score(y_train, train_pred), 4)))
print("________________")
print("Kappa on test: {}".format(round(cohen_kappa_score(y_test, test_predictions, weights="quadratic"), 4)))
print("Accuracy on test: {}".format(round(accuracy_score(y_test, test_predictions), 4)))


Train on 11514 samples, validate on 480 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
[4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 3, 2, 3, 4, 4, 4, 2, 4, 3, 3, 4, 4, 4, 4, 2, 4, 4, 1, 3, 3, 4, 2, 1, 4, 2, 3, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 3, 3, 1, 4, 4, 4, 4, 4, 2, 4, 4, 3, 2, 4, 1, 2, 4, 4, 2, 4, 4, 3, 4, 1, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 3, 2, 4, 2, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 2, 4, 3, 4, 4, 1, 1, 1, 4, 4, 4, 4, 3, 3, 4, 4, 3, 3, 3, 2, 3, 4, 4, 1, 4, 4, 4, 4, 3, 2, 4, 3, 2, 1, 2, 4, 3, 4, 4, 4, 3, 3, 1, 1, 2, 1, 4, 3, 4, 4, 1, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 2, 1, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 2, 1, 4, 4, 1, 2, 2, 4, 2, 4, 3, 2, 2, 3, 1, 4, 1, 4, 3, 2, 2, 4, 4, 1, 4, 4, 2, 4, 1, 4, 4, 4, 4, 4, 2, 4, 4, 1, 4, 3, 4, 4, 1, 1, 0, 4, 3, 2, 1, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 1, 2, 4, 4, 1, 1, 3, 3, 3, 3, 1, 3, 4, 4, 4, 3, 2, 2, 1, 4, 4, 4, 3, 2, 

Kappa on train: 0.4692
Accuracy on train: 0.5391
________________
Kappa on test: 0.2356
Accuracy on test: 0.3625
