In [None]:
#invite everyone to the Kaggle partay

import numpy as np
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
import os

# Show which files are available in the input directory
input_files = os.listdir("../input")
print(input_files)

## Loading and getting a feel for the data

In [None]:
# Load the training data from the input directory
train = pd.read_csv("../input/train.csv")

# Preview the first rows
train.head()

In [None]:
# Summary statistics for the training data
train.describe()

First impressions and expectations:

__AnimalID__: probably doesn't make a difference to the outcome

__Name__: probably doesn't make a huge difference, although investigate to see if there are any common names which are more or less likely to have a certain outcome.  There are some missing values which could be down to error in records, or possibly these were animals who were never named. There may be a correlation between whether an animal has a name or not to whether it had a good outcome. 

__DateTime__: Is this referring to date when the outcome type of the animal was logged? There could be correlation between more adoptions in summer or winter months possibly. 

__OutcomeType__: target variable

__OutcomeSubtype__: I predict that this will have a strong correlation with outcome type. 

__AnimalType__: I think this will also have a strong correlation with outcome type. 

__SexuponOutcome__: I think this will probably have an effect on the outcome type. Investigate whether this is the case with all animals. 

__AgeuponOutcome__: I think that this will have an effect upon outcome type. I would think that more senior animals have a harder time getting adopted and that younger ones are more easily adopted. 

__Breed__: I imagine that this also has an effect on outcome type. 

__Color__: this most likely also has an effect on outcome type. Maybe some categories could be formed within this group as black/brown may be very similar to black for example. 

## Creating a baseline model

Before we get into digging deep into the data or thinking up any fancy models, let's create a baseline model so we have somewhere to start from and something to compare future findings to. 

See if there are any missing values:

In [None]:
def missing_values(data): 
    """Summarise the missing values in each column of data.

    Parameters
    ----------
    data : pandas DataFrame to inspect.

    Returns
    -------
    DataFrame indexed by column name with two columns:
    "Percentage missing" (0-100, i.e. already multiplied by 100) and
    "Number of missing values", sorted so the column with the most
    missing data comes first.
    """
    #Compute the null mask once and reuse it for both statistics
    is_missing = data.isnull()
    number_of_missing_values = is_missing.sum()
    #mean() of a boolean mask is the fraction of nulls; scale it so the column really is a percentage
    percentage_missing = is_missing.mean() * 100
    missing_values_table = pd.concat([percentage_missing, number_of_missing_values], axis=1, keys=["Percentage missing", "Number of missing values"])
    return missing_values_table.sort_values("Percentage missing", ascending=False)
    

In [None]:
# Tabulate the missing values per column of the training data
missing_values(train)

Age and sex upon outcome have less than 0.1% of values missing, so it's easier to just drop these rows - it's not worth using any fancy algorithms just to impute these values, since such a small share of the data likely won't make any difference overall. 

The names have about 29% of data missing. I think it would be logical that these are animals which had been given no name, so I will substitute 'nameless' for the NaN values and create an extra column marking which animals had no name, as a lack of name may contribute to their final outcome. 

The outcome subtype has over 50% of values missing. I predicted that outcome subtype would have a big influence on the final outcome, but this is a large amount of missing data so I will drop it for now, and keep it in mind to possibly revisit at a later stage. 

### Making amendments to adjust for missing values

In [None]:
#Remove every row that has no recorded sex or no recorded age
train.dropna(subset=["SexuponOutcome", "AgeuponOutcome"], inplace=True)

In [None]:
#Remove the outcome subtype column altogether
train.drop(columns="OutcomeSubtype", inplace=True)

In [None]:
#Fill missing name values
#Assign the filled column back: fillna(inplace=True) called on train["Name"] acts on a selected
#column and, depending on the pandas version, can leave train itself unchanged
train["Name"] = train["Name"].fillna("nameless")

In [None]:
#Create a new column corresponding to whether the animal had a name or not
#An animal counts as unnamed when its name is missing or holds the "nameless" placeholder
is_nameless = train["Name"].isnull() | (train["Name"] == "nameless")
indexes_of_nameless = train.loc[is_nameless].index
train["HadAName"] = "yes"
#.loc with a boolean mask is used because .at only reads or writes one scalar cell at a time
train.loc[is_nameless, "HadAName"] = "no"

In [None]:
#Confirm that there are no missing values left
#(the expectation is that every count in the table is now zero)
missing_values(train)

In [None]:
#make a copy of this data to be used later if I need to start from almost-scratch again
train_raw = train.copy()

#Check everything looks fine and dandy
#(a notebook only renders the last expression of a cell, so the preview has to come last)
train.head()

### Seeing which features can be quickly used in the baseline algorithm

In [None]:
# List the dtype of every column
train.dtypes

For the baseline model, I will use animal type, sex upon outcome and had a name because these can be transformed into categorical variables and then encoded to create a baseline model very quickly as they don't need to be parsed or have any special attention. A baseline model isn't supposed to have a fabulous accuracy, but instead it's supposed to be built asap so that you have something to improve upon and compare future models against. 

In [None]:
baseline_model_features_names = ["AnimalType", "SexuponOutcome", "HadAName"]

#One-hot encode the chosen features. drop_first removes one level per feature, since that level is
#linearly dependent on the remaining ones and would only add unneeded extra columns
baseline_model_X = pd.get_dummies(train[baseline_model_features_names].copy(), drop_first=True)

#Target column, taken as a separate copy
baseline_model_y = train["OutcomeType"].copy()
baseline_model_X.head()

In [None]:
#Need to transform the categorical labels into numeric data
#List the distinct outcome labels that have to be encoded
baseline_model_y.unique()

In [None]:
#Integer code assigned to each outcome label (the position in this list is the code)
outcome_labels = ["Return_to_owner", "Euthanasia", "Adoption", "Transfer", "Died"]
outcomes_dict = {label: code for code, label in enumerate(outcome_labels)}

#Replace the text labels of the target by their integer codes
baseline_model_y = baseline_model_y.map(outcomes_dict)
baseline_model_y.head()

In [None]:
#Hold out part of the data as a validation set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    baseline_model_X,
    baseline_model_y,
    random_state=36,
)

In [None]:
#A random forest is used because it is usually decent at predicting different categories. This kernel is more about data
#exploration, feature selection and feature engineering than about finding a specific model or its parameters, because the
#way the data is presented can often impact the accuracy much more than the model selection, and that is the skill I want to refine.

#Two metrics are reported: log loss (since this is what Kaggle judges this competition on) and accuracy (since it is easier to interpret).

def RF_scores(X, y):
    """Cross-validate a random forest on X and y and describe its scores.

    Returns a sentence holding the log loss of the out-of-fold class
    probabilities (5-fold) and the mean of the 5 cross-validation scores,
    reported as the accuracy.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=36)
    out_of_fold_probabilities = cross_val_predict(forest, X, y, cv=5, method="predict_proba")
    fold_accuracies = cross_val_score(forest, X, y, cv=5)
    summary = "The log loss is {0} and the accuracy is {1}"
    return summary.format(log_loss(y, out_of_fold_probabilities), np.mean(fold_accuracies))

In [None]:
# Score the baseline features with the random forest
RF_scores(baseline_model_X, baseline_model_y)

## Revising the features and making them into numerical data so that they can be used

In [None]:
#Refreshing myself
# Restore train from the train_raw copy saved after the missing values were handled
train = train_raw.copy()
train.head()

In [None]:
#AnimalID is dropped because I don't think it has any relevance and it is a mess to sort out; it may be worth coming back to later.
#Name is dropped too: there are so many names to sort through that, for now, having a name or not will have to do.
train.drop(columns=["AnimalID", "Name"], inplace=True)
train.head()

### Investigate whether the remaining features are suitable to be transformed into numerical data:

In [None]:
# Count how many rows fall into each SexuponOutcome category
Counter(train["SexuponOutcome"])

Sex upon outcome has only 5 categories, so it is a good candidate to be encoded. If a feature has too many categories, encoding it creates too many columns, which would slow down the model. 

In [None]:
# Number of distinct breeds recorded for each animal type
train.groupby("AnimalType")["Breed"].nunique()

There are 60 unique cat breeds and 1320 unique dog breeds in the dataset. This is way too many to work with! First I will separate the data by species, then I will try to narrow the number of breeds down further and see if breed has any effect on the outcome for the animals. 

### Investigating cat breeds

In [None]:
#Index labels of every dog row, plus a working copy of the data for the cat analysis
is_dog = (train["AnimalType"] == "Dog").to_numpy()
dog_indexes = train.index[is_dog]
cats_data = train.copy()

In [None]:
#Keep only the cat rows. The explicit .copy() makes cats_data an independent frame, so that
#changing its Breed values later does not act on a slice of another frame
cats_data = cats_data.loc[cats_data["AnimalType"] == "Cat"].copy()
cats_data.head()

In [None]:
#Calculate the share of animals which fall into the most popular breeds
def percentage_in_most_popular(data, number_of_breeds_included):
    """Return the share of rows in data that belong to the most common breeds.

    Parameters
    ----------
    data : DataFrame with a "Breed" column.
    number_of_breeds_included : how many of the most common breeds to count.

    Returns
    -------
    A fraction between 0 and 1 (it is not multiplied by 100), or 0.0 when
    data holds no breed values at all.
    """
    breeds_counted = Counter(data["Breed"])
    list_of_n_most_common_breeds = breeds_counted.most_common()[:number_of_breeds_included]
    number_of_animals_in_breeds = sum(count for _, count in list_of_n_most_common_breeds)
    #The denominator has to come from the frame that was passed in, not from a global frame
    total_no_animals = data["Breed"].count()
    if total_no_animals == 0:
        return 0.0
    return number_of_animals_in_breeds / total_no_animals


In [None]:
#Coverage obtained by keeping the 0, 1, 2, ... most common cat breeds, up to all of them
number_of_cat_breeds_in_train = train.groupby("AnimalType")["Breed"].nunique()["Cat"]
percentage_covered_by_number_of_breeds_array = [
    percentage_in_most_popular(cats_data, breeds)
    for breeds in np.arange(number_of_cat_breeds_in_train + 1)
]

In [None]:
#Coverage of the cats as more and more of the most common breeds are included
numbers_of_breeds = np.arange(train.groupby("AnimalType")["Breed"].nunique()["Cat"] + 1)

fig, ax = plt.subplots()
ax.plot(numbers_of_breeds, percentage_covered_by_number_of_breeds_array)
ax.set_title("Line plot showing percentage of cats covered by x amount of breeds")
ax.set_xlabel("Number of breeds")
ax.set_ylabel("Percentage of animals covered")
plt.show()

In [None]:
#Same curve, restricted to the first 8 breeds so the elbows are easier to see
numbers_of_breeds = np.arange(train.groupby("AnimalType")["Breed"].nunique()["Cat"] + 1)

fig, ax = plt.subplots()
ax.plot(numbers_of_breeds, percentage_covered_by_number_of_breeds_array)
ax.set(
    title="Line plot showing percentage of cats covered by x amount of breeds - Zoomed in",
    xlabel="Number of breeds",
    ylabel="Percentage of animals covered",
    xlim=(0, 8),
)
plt.show()

In [None]:
# Share of cats covered by the 4 most common breeds (position n of the list holds the coverage of the top n breeds)
percentage_covered_by_number_of_breeds_array[4]

Looking at the second elbow in the graph, ~ 95% of animals are accounted for if we only use the top 4 breeds of cat. This seems sufficient and greatly reduces the number of breeds, lowering the complexity and the time needed to run any algorithms. Therefore, I shall use the top 4 breeds of cat and label the rest as 'other', and I will now investigate whether breed of cat actually has any correlation to the cats' final outcome. 

In [None]:
#Scratch work: collect the index labels of the rows that belong to the most common cat breed
indexes_of_breed = [3,3,4,5]
for a in np.arange(1):
    ranked_breeds = Counter(cats_data["Breed"]).most_common()
    breed_name = ranked_breeds[a][0]
    rows_of_breed = (cats_data["Breed"] == breed_name).to_numpy()
    indexes_of_next_breed = np.array(cats_data.index[rows_of_breed])

indexes_of_next_breed

In [None]:
#Keep the most common cat breeds and relabel every other breed as 'other'

cat_breeds = train.groupby("AnimalType")["Breed"].unique()["Cat"]
number_of_cat_breeds = train.groupby("AnimalType")["Breed"].nunique()["Cat"]

#Work on an independent frame so the assignment below cannot act on a slice of another frame
cats_data = cats_data.copy()

#Rank the breeds once, before anything is relabelled: re-counting inside a loop while the frame
#is being changed makes the result depend on the order in which the breeds are visited.
#'other' is left out of the ranking so that running this cell again gives the same result.
number_of_breeds_to_keep = 4
breeds_ranked = Counter(cats_data.loc[cats_data["Breed"] != "other", "Breed"]).most_common(number_of_breeds_to_keep)
top_cat_breeds = [breed for breed, _ in breeds_ranked]

#.loc with a boolean mask sets all the rows at once (.at only handles a single scalar cell)
is_rare_breed = ~cats_data["Breed"].isin(top_cat_breeds)
cats_data.loc[is_rare_breed, "Breed"] = "other"

In [None]:
# Preview the first 10 rows of the cat data to check the Breed column
cats_data.head(10)

Now that I have simplified the breed data, let's see if there's any correlation between breed and outcomes. 

In [None]:
#Integer code for each outcome label, then the encoded outcomes of the cats
outcomes_dict = dict(zip(["Return_to_owner", "Euthanasia", "Adoption", "Transfer", "Died"], range(5)))
y = cats_data["OutcomeType"].map(outcomes_dict)

In [None]:

#Count the cats for every (breed, outcome) pair; reset_index turns the counts into a column labelled 0
news = cats_data.groupby(["Breed", "OutcomeType"]).size()
cats_outcomes_breeds = news.reset_index()

cats_outcomes_breeds

In [None]:
# Outcome counts (column 0) for the rows whose breed is 'other'
cats_outcomes_breeds.groupby("Breed").get_group("other")[0]

In [None]:
#Plot the proportion of each breed that had each outcome

#Outcome names as stored in the data, and the matching labels shown in the legend (same order)
outcome_names = ["Adoption", "Died", "Euthanasia", "Return_to_owner", "Transfer"]
labels = ["Adoption", "Died", "Euthanasia", "Return to owner", "Transfer"]

def outcome_counts_for_breed(breed):
    """Return the number of cats per outcome for one breed, ordered like outcome_names.

    An outcome that never occurred for the breed is filled in with 0, so every
    pie has one wedge per outcome and the shared legend lines up with all of them.
    """
    breed_rows = cats_outcomes_breeds.loc[cats_outcomes_breeds["Breed"] == breed]
    return breed_rows.set_index("OutcomeType")[0].reindex(outcome_names, fill_value=0)

domestic_longhair = outcome_counts_for_breed("Domestic Longhair Mix")
domestic_mediumhair = outcome_counts_for_breed("Domestic Medium Hair Mix")
domestic_shorthair = outcome_counts_for_breed("Domestic Shorthair Mix")
other = outcome_counts_for_breed("other")

fig, ax = plt.subplots(2, 2, figsize=(10,10))

#One pie per panel, filled row by row
panels = [
    (domestic_longhair, "Domestic Longhair"),
    (domestic_mediumhair, "Domestic Mediumhair"),
    (domestic_shorthair, "Domestic Shorthair"),
    (other, "Other"),
]
for axis, (counts, title) in zip(ax.flat, panels):
    wedges, texts, autotexts = axis.pie(counts, autopct="%.0f%%")
    axis.set_title(title)

plt.suptitle("Pi charts showing the percentage of each outcome of the top 4 breeds of cat")
#The wedges of the last pie stand in for all four, which is valid because every pie uses the same outcome order
plt.legend(wedges, labels, loc="upper right", bbox_to_anchor=(1.1, 0.3, 0.5, 1), fontsize="large")
plt.show()

These plots show me that the breed is likely related to outcome. If you're a domestic shorthair, you're more likely to be transferred than if you were any other breed, and if you're a domestic longhair, you're more likely to be euthanised than any other breed. Now that I've condensed the breed information and seen that it makes sense for breed to be related to outcome, I will transform it into a categorical variable and add it into the algorithm to see if it improves the score. 

In [None]:
# Preview the simplified cat data once more
cats_data.head()