# Čišćenje i priprema podataka

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training and test sets from the CSV files under data/.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

### Starost životinja

Ovaj stupac je tipa string i sadrži podatke poput '1 year', '3 weeks', '1 month', '3 days'.
Želimo podatke prikazati u danima.

In [3]:
def age_to_days(item):
    """Convert age descriptions such as '1 year' or '3 weeks' to days.

    Args:
        item: A single age string, or an iterable (list, NumPy array) of
            age strings. Non-string entries such as NaN are allowed.

    Returns:
        A list of ints with exactly one entry per input element. A month
        counts as 30 days and a year as 365 days. Non-string entries and
        strings without a recognised unit are mapped to 0, so the result
        always stays aligned with the input.

    Raises:
        ValueError: If the leading token of a recognised age string is not
            an integer.
    """
    # A lone string is handled as a one-element sequence.
    if isinstance(item, str):
        item = [item]

    # Substring -> number of days; 'days', 'weeks', ... match as well.
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    ages_in_days = []
    for value in item:
        days = 0
        # isinstance (rather than an exact type check) also accepts str
        # subclasses such as numpy.str_ coming from NumPy string arrays.
        if isinstance(value, str):
            for unit, factor in days_per_unit:
                if unit in value:
                    days = int(value.split(' ')[0]) * factor
                    break
        ages_in_days.append(days)
    return ages_in_days

In [4]:
# Show how many columns the training and the test set have.
len(train_data.columns), len(test_data.columns)

(10, 8)

In [5]:
# Replace the age strings of the training set with ages in days.
age_upon_outcome = train_data['AgeuponOutcome'].values
age_in_days = age_to_days(age_upon_outcome)
train_data['AgeuponOutcome'] = age_in_days

# Apply the same conversion to the test set.
age_upon_outcome = test_data['AgeuponOutcome'].values
age_in_days = age_to_days(age_upon_outcome)
test_data['AgeuponOutcome'] = age_in_days

### Spol životinje

Stupac SexuponOutcome sadrži informacije o spolu životinje, ali i o tome je li životinja sterilizirana ili kastrirana. Smatramo da bi svaka od tih informacija bila važna za treniranje modela pa ih želimo razdvojiti u dva zasebna atributa.

In [6]:
def sex_and_intact(items):
    """Split combined values like 'Intact Male' into sex and intact flag.

    Args:
        items: Iterable of values shaped like '<status> <sex>' (for example
            'Neutered Male'). Entries may also be 'Unknown' or non-strings
            such as NaN.

    Returns:
        A tuple ``(animal_sex, intact_animal)`` of two lists with one entry
        per input element. ``animal_sex`` holds the sex token, and
        ``intact_animal`` holds 1 when the status token is 'Intact' and 0
        otherwise. Entries that are not a two-token string (including
        'Unknown' and NaN) yield the string 'NaN' in both lists.
    """
    animal_sex = []
    intact_animal = []
    for value in items:
        # isinstance also accepts str subclasses such as numpy.str_.
        tokens = value.split(' ') if isinstance(value, str) else []
        if len(tokens) == 2:
            intact, sex = tokens
            intact_animal.append(1 if intact == 'Intact' else 0)
            animal_sex.append(sex)
        else:
            # Anything that is not '<status> <sex>' is treated as unknown
            # instead of failing on tuple unpacking.
            animal_sex.append('NaN')
            intact_animal.append('NaN')

    return animal_sex, intact_animal

In [7]:
# Derive the separate 'Sex' and 'Intact' columns for the training set.
sex_upon_outcome = train_data['SexuponOutcome'].values
animal_sex, intact_animal = sex_and_intact(sex_upon_outcome)
train_data['Sex'] = animal_sex
train_data['Intact'] = intact_animal

# Derive the same two columns for the test set.
sex_upon_outcome = test_data['SexuponOutcome'].values
animal_sex, intact_animal = sex_and_intact(sex_upon_outcome)
test_data['Sex'] = animal_sex
test_data['Intact'] = intact_animal

Svojstvo spola pripremamo za učenje modela tako što vrijednostima pridružujemo odgovarajuće vektore. Vektor (1, 0) označava mužjaka, vektor (0, 1) ženku, a vektor (0, 0) nepoznat podatak. Vektore realiziramo dodavanjem novih stupaca, po jedan stupac za svaku komponentu.

In [8]:
def parse_animal_sex(data_set):
    """Add 0/1 indicator columns 'Male' and 'Female' to data_set in place.

    A row gets 1 in the column whose name equals its 'Sex' value; any
    other 'Sex' value leaves both indicators at 0. Nothing is returned.
    """
    for label in ('Male', 'Female'):
        matches = data_set['Sex'] == label
        data_set[label] = matches.astype('int64')

In [9]:
# Add the 'Male'/'Female' indicator columns to both data sets (in place).
parse_animal_sex(train_data)
parse_animal_sex(test_data)

### Ime životinje

In [10]:
""" Set value of Name column to 1, if animal has name,
    to 0 if animal is nameless """
def hasName(names):
    """Return a list with 1 for every present name and 0 for missing ones.

    A name counts as missing when ``pd.isnull`` reports it as null.
    """
    return [0 if pd.isnull(name) else 1 for name in names]

In [11]:
# Store the has-a-name flag (1/0) as a new 'hasName' column in both sets.
train_data['hasName'] = hasName(train_data['Name'])
test_data['hasName'] = hasName(test_data['Name'])

### Vrsta životinje

Kako bismo mogli uključiti vrstu životinje u model, vektorom (1, 0) ili (0, 1) označit ćemo je li ona pas ili mačka.

In [12]:
def parse_animal_type(data_set):
    """Add 0/1 indicator columns 'Dog' and 'Cat' to data_set in place.

    A row gets 1 in the column whose name equals its 'AnimalType' value;
    any other value leaves both indicators at 0. Nothing is returned.
    """
    for species in ('Dog', 'Cat'):
        matches = data_set['AnimalType'] == species
        data_set[species] = matches.astype('int64')

In [13]:
# Add the 'Dog'/'Cat' indicator columns to both data sets (in place).
parse_animal_type(train_data)
parse_animal_type(test_data)

In [14]:
""" Set value of Color column to 1 if animal is one colored,
    else set it to 0 """
def isOneColored(colors):
    """Return a list with 1 for single-colour entries and 0 otherwise.

    An entry counts as multi-coloured when its text contains a '/'.
    """
    return [0 if '/' in color else 1 for color in colors]

In [15]:
# Store the single-colour flag (1/0) as a new 'isOneColored' column.
train_data['isOneColored'] = isOneColored(train_data['Color'])
test_data['isOneColored'] = isOneColored(test_data['Color'])

### Vremenske oznake 

Kako bismo mogli istražiti postoji li neki trend u udomljavanju životinja koji je povezan s datumima, godišnjim dobima i sl., razdvojili smo vremensku oznaku na dan, mjesec i godinu. Dalje, radimo detaljniju podjelu vremenskih oznaka. Dodajemo podjelu na dane u tjednu, doba dana i godišnje doba.

In [16]:
import datetime

# Indicator column for each value of datetime.date.weekday() (0 = Monday).
_WEEKDAY_COLUMNS = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday')

# Months that lie entirely inside one season.
_WHOLE_SEASON_MONTHS = {
    1: 'Winter', 2: 'Winter',
    4: 'Spring', 5: 'Spring',
    7: 'Summer', 8: 'Summer',
    10: 'Autumn', 11: 'Autumn',
}

# Months in which the season changes:
# month -> (season before the boundary day, boundary day, season from it on).
_SEASON_CHANGE_MONTHS = {
    3: ('Winter', 21, 'Spring'),
    6: ('Spring', 21, 'Summer'),
    9: ('Summer', 23, 'Autumn'),
    12: ('Autumn', 21, 'Winter'),
}

# Columns created by split_time, in the order they are added to the frame.
_TIME_COLUMNS = (
    ('Date', 'Month', 'Year', 'Hour', 'Summer', 'Spring', 'Winter', 'Autumn')
    + _WEEKDAY_COLUMNS
    + ('time_6_14', 'time_14_22', 'time_22_6')
)


def _season_column(month, day):
    """Return the season column name for the given month and day of month."""
    if month in _WHOLE_SEASON_MONTHS:
        return _WHOLE_SEASON_MONTHS[month]
    before, boundary_day, after = _SEASON_CHANGE_MONTHS[month]
    return before if day < boundary_day else after


def __date_spliter(row):
    """Fill the date/time feature fields of a single row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose 'DateTime' entry
            is a string shaped like 'YYYY-MM-DD HH:MM:SS'.

    Returns:
        The same row with 'Date', 'Month', 'Year' and 'Hour' set, and with
        1 written to the matching weekday, time-of-day and season fields.
        Fields that do not match are not touched.
    """
    date_text, time_text = row['DateTime'].split(' ')[:2]
    year, month, date = (int(part) for part in date_text.split('-'))
    hour = int(time_text.split(':')[0])

    row['Date'] = date
    row['Month'] = month
    row['Year'] = year
    row['Hour'] = hour

    # Day of the week.
    row[_WEEKDAY_COLUMNS[datetime.date(year, month, date).weekday()]] = 1

    # Time of the day. The night bucket uses explicit boolean operators:
    # the previous bitwise form `hour >= 0 & hour < 6` was parsed as
    # `hour >= (0 & hour) < 6` and accepted every non-negative hour.
    if 6 <= hour < 14:
        row['time_6_14'] = 1
    elif 14 <= hour < 22:
        row['time_14_22'] = 1
    elif 22 <= hour < 24 or 0 <= hour < 6:
        row['time_22_6'] = 1

    # Season.
    row[_season_column(month, date)] = 1

    return row


def split_time(data):
    """Add date, weekday, time-of-day and season columns derived from 'DateTime'.

    Args:
        data: DataFrame with a 'DateTime' column of strings shaped like
            'YYYY-MM-DD HH:MM:SS'.

    Returns:
        The same DataFrame object, modified in place: the numeric columns
        'Date', 'Month', 'Year' and 'Hour' plus 0/1 indicator columns for
        the season, the day of the week and the time-of-day bucket.
    """
    # Create every feature column with 0 first so that each row only has
    # to switch on the indicators that apply to it.
    for column in _TIME_COLUMNS:
        data[column] = 0
    data.update(data.apply(__date_spliter, axis=1))
    return data

In [17]:
# Add the date, weekday, time-of-day and season columns to both data sets.
train_data = split_time(train_data)
test_data = split_time(test_data)

### Boja

In [18]:
# Print how many distinct values the 'Color' column of the training set has.
print(len(train_data['Color'].value_counts()))

366


Dakle, imamo 366 različitih vrijednosti u stupcu za boju. Sljedeća linija koda ispisuje zastupljenost boja sortiranu silazno.

In [19]:
# Show the 20 most frequent colour values, most frequent first.
train_data['Color'].value_counts().sort_values(ascending=False).head(20)

Black/White            2824
Black                  2292
Brown Tabby            1635
Brown Tabby/White       940
White                   931
Brown/White             884
Orange Tabby            841
Tan/White               773
Tricolor                752
Blue/White              702
Black/Tan               672
White/Black             643
Brown                   639
Tan                     628
White/Brown             569
Tortie                  530
Calico                  517
Orange Tabby/White      455
Brown Brindle/White     450
Blue                    450
Name: Color, dtype: int64

Uočili smo nekoliko ključnih riječi i svojstvo boje odlučili podijeliti na vektor ("white", "black", "brown", "blue", "tan", "tabby", "red", "calico", "orange", "chocolate", "gray", "tortie", "tricolor").

Dodajemo nove stupce u naš skup podataka:

In [20]:
# Default indicator columns (without the 'color_' prefix), in output order.
_DEFAULT_COLOR_COLUMNS = (
    "white", "black", "brown", "blue", "tan", "tabby",
    "red", "calico", "orange", "chocolate", "gray", "tortie",
    "tricolor", 'other', 'mix',
)

# Bookkeeping columns that are never matched against the colour text.
_SPECIAL_COLOR_COLUMNS = ('other', 'mix')


def __create_color_one_hot(row, colors=None):
    """Set the colour indicator fields of a single row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a 'Color' string.
        colors: Colour keywords to look for. Defaults to the standard
            keywords (every default column except 'other' and 'mix').

    Returns:
        The same row with 'color_<keyword>' set to 1 for every keyword
        contained (case-insensitively) in the colour text, 'color_mix' set
        to 1 when the text contains '/', and 'color_other' set to 1 when
        no keyword matched.
    """
    COLOR = 'color_'
    if colors is None:
        colors = [name for name in _DEFAULT_COLOR_COLUMNS
                  if name not in _SPECIAL_COLOR_COLUMNS]

    description = row['Color']
    if '/' in description:
        row[COLOR + 'mix'] = 1

    lowered = description.lower()
    color_in_colors = False
    for color in colors:
        if color.lower() in lowered:
            row[COLOR + color] = 1
            color_in_colors = True
    if not color_in_colors:
        row[COLOR + 'other'] = 1
    return row


def create_color_one_hot(data, colors=None):
    """Add 0/1 colour indicator columns derived from the 'Color' column.

    Args:
        data: DataFrame with a 'Color' column of strings. The indicator
            columns are also added to this frame (initialised to 0).
        colors: Names of the indicator columns to create, without the
            'color_' prefix. Every name except 'other' and 'mix' is also
            used as the keyword searched for in the colour text, so a
            custom list drives both the columns and the matching. Defaults
            to the standard 13 keywords plus 'other' and 'mix'.

    Returns:
        A new DataFrame holding the filled indicator columns. The columns
        'color_other' and 'color_mix' are always present.
    """
    COLOR = 'color_'
    columns = list(_DEFAULT_COLOR_COLUMNS) if colors is None else list(colors)
    # The row helper always writes these two flags, so make sure they exist
    # up front; otherwise rows would come back with differing fields.
    for special in _SPECIAL_COLOR_COLUMNS:
        if special not in columns:
            columns.append(special)

    for color in columns:
        data[COLOR + color] = 0

    keywords = [name for name in columns if name not in _SPECIAL_COLOR_COLUMNS]
    data = data.apply(__create_color_one_hot, axis=1, args=(keywords,))
    return data

In [21]:
# Add the colour indicator columns to both data sets.
train_data = create_color_one_hot(train_data)
test_data = create_color_one_hot(test_data)

### Pasmina 

In [22]:
# Print how many distinct values the 'Breed' column of the training set has.
print(len(train_data['Breed'].value_counts()))

1380


Broj vrsta zasad je prevelik. Promatramo koje se vrste najčešće pojavljuju i grupiramo ostale zajedno.
Prije svega, uklanjamo riječ 'Mix' svuda gdje se pojavljuje i odvajamo pasmine ako je između njih znak '/'.
Sve vrste pasa "razbijamo" na osnovne vrste. Npr. Terrier i Terrier Mix broje se samo pod Terrier, a Terrier/Poodle Mix broji se i kao Terrier i kao Poodle (bez Mix).
Funkcija count_and_print_breeds ispisuje koliko ima koje vrste. 

In [23]:
def count_and_print_breeds(data, feature_name='Breed'):
    """Print how often each base breed occurs in a breed column.

    Every value is split on '/', and a trailing 'Mix' is removed from each
    part, so 'Terrier Mix' contributes 'Terrier' and 'Terrier/Poodle Mix'
    contributes 'Terrier' and 'Poodle'. A base breed is then counted once
    for every row whose text contains it as a substring.

    Args:
        data: DataFrame containing the column ``feature_name``.
        feature_name: Name of the column holding the breed strings.

    Returns:
        None. The list of ``(breed, count)`` pairs, sorted by descending
        count, is printed.
    """
    # Missing values carry no breed information and cannot be searched.
    values = data[feature_name].dropna().tolist()

    # Collect the base breeds. A dict keeps first-seen order, which makes
    # the order of equal counts deterministic.
    counts = {}
    for value in values:
        for part in value.split('/'):
            part = part.strip()
            if part.endswith('Mix'):
                # Drop the suffix together with the space in front of it.
                part = part[:-3].rstrip()
            if part:
                counts[part] = 0

    # Plain substring search over the extracted values (no iterrows).
    for value in values:
        for breed in counts:
            if breed in value:
                counts[breed] += 1

    breeds_count = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

    print(breeds_count)
    
#count_and_print_breeds(train_data)

Nakon pokretanja gornje funkcije, u rezultatima smo uočili učestalo pojavljivanje nekih pasmina, ali na malo drugačije načine. Tako se npr. Terrier pojavljuje u Silky Terrier, Welsh Terrier, Soft Coated Wheaten Terrier itd. Takve smo slučajeve odlučili svrstati sve u istu skupinu (Terr, Retriever, Bulldog).

In [24]:
"""
def count_and_print_breeds(data, feature_name='Breed'):
    # Get all different breeds
    series = data[feature_name]
    distinct_values = series.unique()
    breeds = []
    for v in distinct_values:
        if 'Mix' in v:
            if '/' in v:
                bree = v.split('/')
                for b in bree:
                    breeds.append(b)
            else:
                breeds.append(v[:-3].rstrip())
        elif '/' in v:
            bree = v.split('/')
            for b in bree:
                breeds.append(b)
        else:
            breeds.append(v)
    breeds = list(set(breeds))
    for index in range(len(breeds)-1,0, -1):
        if "Terr" in breeds[index]:
            del (breeds[index])
    breeds.append("Terr")
    for index in range(len(breeds)-1,0, -1):
        if "Retriever" in breeds[index]:
            del (breeds[index])
    breeds.append("Retriever")
    for index in range(len(breeds)-1,0, -1):
        if "Bulldog" in breeds[index]:
            del (breeds[index])
    breeds.append("Bulldog")
    breeds_count = {}
    for b in breeds:
        for row in data.iterrows():
            if b in row[1][feature_name]:
                if b in breeds_count:
                    breeds_count[b] += 1
                else:
                    breeds_count[b] = 1
    
    breeds_count = sorted(breeds_count.items(), key=lambda kv: kv[1], reverse=True)
    i = 0
    sum_all = 0
    sum_10 = 0
    sum_15 = 0
    sum_20 = 0
    for k,v in breeds_count:
        sum_all += v
        if i < 10:
            sum_10 += v
        if i < 15:
            sum_15 += v
        if i < 20:
            sum_20 += v
        i += 1
    print(breeds_count)
    print('all\t' + str(sum_all))
    print('sum_10\t' + str(sum_10))
    print('sum_15\t' + str(sum_15))
    print('sum_20\t' + str(sum_20))
"""

'\ndef count_and_print_breeds(data, feature_name=\'Breed\'):\n    # Get all different breeds\n    series = data[feature_name]\n    distinct_values = series.unique()\n    breeds = []\n    for v in distinct_values:\n        if \'Mix\' in v:\n            if \'/\' in v:\n                bree = v.split(\'/\')\n                for b in bree:\n                    breeds.append(b)\n            else:\n                breeds.append(v[:-3].rstrip())\n        elif \'/\' in v:\n            bree = v.split(\'/\')\n            for b in bree:\n                breeds.append(b)\n        else:\n            breeds.append(v)\n    breeds = list(set(breeds))\n    for index in range(len(breeds)-1,0, -1):\n        if "Terr" in breeds[index]:\n            del (breeds[index])\n    breeds.append("Terr")\n    for index in range(len(breeds)-1,0, -1):\n        if "Retriever" in breeds[index]:\n            del (breeds[index])\n    breeds.append("Retriever")\n    for index in range(len(breeds)-1,0, -1):\n        if "Bu

Dobili smo sljedeće rezultate:

[('Domestic Shorthair', 8971), ('Retriever', 2442), ('Chihuahua Shorthair', 2398), ('Pit Bull', 2392), ('Terr', 1630), ('German Shepherd', 962), ('Domestic Medium Hair', 884), ('Dachshund', 787), ('Australian Cattle Dog', 640), ('Domestic Longhair', 548), ('Boxer', 437), ('Border Collie', 432), ('Miniature Poodle', 431), ('Siamese', 430), ('Beagle', 311), ('Catahoula', 272), ('Australian Shepherd', 272), ('Yorkshire', 267), ('Miniature Schnauzer', 266), ('Staffordshire', 251), ('Pointer', 239), ('Bulldog', 230), ('Siberian Husky', 221), ('Rottweiler', 209), ('Chihuahua Longhair', 193), ('Shih Tzu', 190), ('Great Pyrenees', 172), ('Anatol Shepherd', 152), ('Chow Chow', 150), ('Pug', 143), ('Australian Kelpie', 133), ('Maltese', 131), ('Black', 130), ('Cardigan Welsh Corgi', 127), ... ]

all	30516
sum_10	21654
sum_15	23695
sum_20	25023

Dakle, ako uzmemo 20 skupina, oko 82% ovako podijeljenih podataka bit će obuhvaćeno tim skupinama. Radit ćemo vektor od prvih 20 ovako dobivenih vrsta, stupac koji označava da vrsta životinje nije ni u jednoj od odabranih skupina ('Other') i stupac koji označava da je životinja mješanac ('Mix').

In [25]:
# Default indicator columns (without the 'Breed_' prefix), in output order.
_DEFAULT_BREED_COLUMNS = (
    'Domestic Shorthair', 'Chihuahua Shorthair', 'Terr',
    'Pit Bull', 'Retriever', 'German Shepherd',
    'Domestic Medium Hair', 'Dachshund', 'Australian Cattle Dog',
    'Domestic Longhair', 'Border Collie', 'Miniature Poodle',
    'Boxer', 'Siamese', 'Beagle', 'Catahoula', 'Yorkshire',
    'Miniature Schnauzer', 'Staffordshire',
    'Australian Shepherd', 'Other', 'Mix',
)

# Bookkeeping columns that are never matched against the breed text.
_SPECIAL_BREED_COLUMNS = ('Other', 'Mix')


def __create_breed_one_hot(row, breeds=None):
    """Set the breed indicator fields of a single row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a 'Breed' string.
        breeds: Breed keywords to look for. Defaults to the standard
            keywords (every default column except 'Other' and 'Mix').

    Returns:
        The same row with 'Breed_<keyword>' set to 1 for every keyword
        contained (case-sensitively) in the breed text, 'Breed_Mix' set to
        1 when the text contains 'Mix' or '/', and 'Breed_Other' set to 1
        when no keyword matched.
    """
    BREED = 'Breed_'
    if breeds is None:
        breeds = [name for name in _DEFAULT_BREED_COLUMNS
                  if name not in _SPECIAL_BREED_COLUMNS]

    description = row['Breed']
    # Both an explicit 'Mix' and a 'A/B' cross mark a mixed-breed animal.
    if 'Mix' in description or '/' in description:
        row[BREED + 'Mix'] = 1

    breed_in_breeds = False
    for breed in breeds:
        if breed in description:
            row[BREED + breed] = 1
            breed_in_breeds = True
    if not breed_in_breeds:
        row[BREED + 'Other'] = 1
    return row


def create_breed_one_hot(data, breeds=None):
    """Add 0/1 breed indicator columns derived from the 'Breed' column.

    Args:
        data: DataFrame with a 'Breed' column of strings. The indicator
            columns are also added to this frame (initialised to 0).
        breeds: Names of the indicator columns to create, without the
            'Breed_' prefix. Every name except 'Other' and 'Mix' is also
            used as the keyword searched for in the breed text, so a custom
            list drives both the columns and the matching. Defaults to the
            standard 20 breed groups plus 'Other' and 'Mix'.

    Returns:
        A new DataFrame holding the filled indicator columns. The columns
        'Breed_Other' and 'Breed_Mix' are always present.
    """
    BREED = 'Breed_'
    columns = list(_DEFAULT_BREED_COLUMNS) if breeds is None else list(breeds)
    # The row helper always writes these two flags, so make sure they exist
    # up front; otherwise rows would come back with differing fields.
    for special in _SPECIAL_BREED_COLUMNS:
        if special not in columns:
            columns.append(special)

    for breed in columns:
        data[BREED + breed] = 0

    keywords = [name for name in columns if name not in _SPECIAL_BREED_COLUMNS]
    data = data.apply(__create_breed_one_hot, axis=1, args=(keywords,))
    return data

In [26]:
# Add the breed indicator columns to both data sets.
train_data = create_breed_one_hot(train_data)
test_data = create_breed_one_hot(test_data)

### Ishod 

Svakom smo ishodu dodijelile broj od 0 do 4. Kako ovdje taj broj predstavlja samo krajnji ishod, nećemo te podatke koristiti za neko dodatno računanje, a u računanju greške važno je samo jesmo li ili nismo pogodili klasu, nismo imali potrebu svrstavati ih u vektor.

In [27]:
def parse_outcome(outcome):
    """Map an outcome label to its integer class id.

    The ids are Adoption=0, Died=1, Euthanasia=2, Return_to_owner=3 and
    Transfer=4.

    Raises:
        Exception: If ``outcome`` is none of the five known labels.
    """
    # The position of a label in this tuple is its class id.
    known_outcomes = ("Adoption", "Died", "Euthanasia",
                      "Return_to_owner", "Transfer")
    for class_id, label in enumerate(known_outcomes):
        if outcome == label:
            return class_id

    raise Exception("Unknown outcome type")

In [28]:
# Replace the outcome labels of the training set with their class ids.
train_data['OutcomeType'] = train_data['OutcomeType'].apply(parse_outcome)

In [29]:
# Save the cleaned training set as CSV, without writing the row index.
train_data.to_csv('data/clean_train.csv', index=False)

In [30]:
# Save the cleaned test set as CSV, without writing the row index.
test_data.to_csv('data/clean_test.csv', index=False)

In [31]:
# Drop the listed columns from both frames in place. The training set
# additionally loses 'AnimalID' and 'OutcomeSubtype'; the test set loses 'ID'.
train_data.drop(['AnimalID','AnimalType','Sex','Name','DateTime','Breed','SexuponOutcome','OutcomeSubtype','Color'], axis=1, inplace=True)
test_data.drop(['ID','AnimalType','Sex','Name','DateTime','Breed','SexuponOutcome','Color'], axis=1, inplace=True)

In [35]:
# Save the reduced frames as CSV, without writing the row index.
train_data.to_csv('data/input_train.csv', index=False)
test_data.to_csv('data/input_test.csv', index=False)