In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np

train_df = pd.read_csv('data/train.csv')
origin_df = pd.read_csv('data/trainn.csv')
# Encoder our prognosis labels as integers for easier decoding later
enc = OrdinalEncoder()
train_df['prognosis'] = enc.fit_transform(train_df[['prognosis']])

# We split out our own test set so we can calculate an example MAP@K
X_train, X_test, y_train, y_test = train_test_split(train_df.drop('prognosis', axis=1), train_df['prognosis'], random_state=42)

# initialize data
test_data = catboost_pool = Pool(X_train, 
                                 y_train)

model = CatBoostClassifier(iterations=5,
                           depth=2,
                           learning_rate=1,
                           border_count=None,
                           verbose=True)
# train the model
model.fit(X_train, y_train)
# make the prediction using the resulting model
preds_class = model.predict(test_data)

# get probabilities for all possible prognoses.
predictions = model.predict_proba(X_test)

0:	learn: 2.2002076	total: 47.3ms	remaining: 189ms
1:	learn: 2.0968816	total: 48.3ms	remaining: 72.4ms
2:	learn: 2.0326714	total: 48.9ms	remaining: 32.6ms
3:	learn: 1.9707335	total: 49.7ms	remaining: 12.4ms
4:	learn: 1.9408150	total: 50.5ms	remaining: 0us


# Generating Top K for all rows

In [2]:
# Now let's look at doing the above for a whole set of predictions at once:
sorted_prediction_ids = np.argsort(-predictions, axis=1)
top_3_prediction_ids = sorted_prediction_ids[:,:3]

# Because enc.inverse_transform expects a specific shape (a 2D array with 1 column) we can save the original shape to reshape to after decoding
original_shape = top_3_prediction_ids.shape
top_3_predictions = enc.inverse_transform(top_3_prediction_ids.reshape(-1, 1))
top_3_predictions = top_3_predictions.reshape(original_shape)
top_3_predictions[:10] # Spot check our first 10 values

array([['Dengue', 'Chikungunya', 'Tungiasis'],
       ['Chikungunya', 'Rift_Valley_fever', 'Tungiasis'],
       ['Tungiasis', 'Dengue', 'Rift_Valley_fever'],
       ['Zika', 'West_Nile_fever', 'Japanese_encephalitis'],
       ['Chikungunya', 'Dengue', 'Tungiasis'],
       ['Tungiasis', 'Dengue', 'Rift_Valley_fever'],
       ['West_Nile_fever', 'Zika', 'Japanese_encephalitis'],
       ['Zika', 'Dengue', 'Lyme_disease'],
       ['Dengue', 'Chikungunya', 'Rift_Valley_fever'],
       ['Zika', 'West_Nile_fever', 'Japanese_encephalitis']], dtype=object)

## Calculating MAP@K on our validation set
So now we have our top K (3) predictions, but what is our MAP@K?
To calculate this we'll use the mapk function from the ml_metrics library. The function is pasted below to avoid having to install the package.

In [3]:
# Sourced from the ml_metrics package at https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [4]:
# Our MAP@K score here is ~0.3456
mapk(y_test.values.reshape(-1, 1), top_3_prediction_ids, k=3)

0.27306967984934083

In [7]:
test_df = pd.read_csv('data/test.csv')

# Make predictions
predictions = model.predict_proba(test_df)

# Get the sorted indices of predictions and take the top 3
sorted_prediction_ids = np.argsort(-predictions, axis=1)
top_3_prediction_ids = sorted_prediction_ids[:,:3]

# Because enc.inverse_transform expects a specific shape (a 2D array with 1 column) we can save the original shape to reshape to after decoding
original_shape = top_3_prediction_ids.shape
top_3_predictions = enc.inverse_transform(top_3_prediction_ids.reshape(-1, 1))
top_3_predictions = top_3_predictions.reshape(original_shape)
top_3_predictions[:10] # Spot check our first 10 values

array([['Chikungunya', 'Rift_Valley_fever', 'Tungiasis'],
       ['Dengue', 'Chikungunya', 'Japanese_encephalitis'],
       ['Rift_Valley_fever', 'Dengue', 'Japanese_encephalitis'],
       ['Rift_Valley_fever', 'West_Nile_fever', 'Zika'],
       ['Lyme_disease', 'Zika', 'Malaria'],
       ['Yellow_Fever', 'Zika', 'Malaria'],
       ['Japanese_encephalitis', 'Malaria', 'Plague'],
       ['Zika', 'West_Nile_fever', 'Yellow_Fever'],
       ['Yellow_Fever', 'Chikungunya', 'Japanese_encephalitis'],
       ['Yellow_Fever', 'Zika', 'Malaria']], dtype=object)

In [8]:
# Now to get our array of labels into a single column for our submission we can just join on on a space across axis 1
test_df['prognosis'] = np.apply_along_axis(lambda x: np.array(' '.join(x), dtype="object"), 1, top_3_predictions)
test_df['prognosis'][:10] # Spot check our first 10 values

0           Chikungunya Rift_Valley_fever Tungiasis
1          Dengue Chikungunya Japanese_encephalitis
2    Rift_Valley_fever Dengue Japanese_encephalitis
3            Rift_Valley_fever West_Nile_fever Zika
4                         Lyme_disease Zika Malaria
5                         Yellow_Fever Zika Malaria
6              Japanese_encephalitis Malaria Plague
7                 Zika West_Nile_fever Yellow_Fever
8    Yellow_Fever Chikungunya Japanese_encephalitis
9                         Yellow_Fever Zika Malaria
Name: prognosis, dtype: object

In [9]:
submission = pd.concat([test_df.id, test_df.prognosis], axis=1)
submission.to_csv('submission.csv', columns=['id', 'prognosis'], index=False)
