# Pre-processing Doggy Data to deploy for training

In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Load the merged master CSV from the Exploration folder into a DataFrame
master_csv_path = '../Exploration/cleaned_data/doggy_master_merged.csv'
doggy_raw_df = pd.read_csv(master_csv_path)

In [3]:
# Preview the top of the frame (head() returns 5 rows by default)
doggy_raw_df.head()

Unnamed: 0.1,Unnamed: 0,Borough,humans/household,dfriendlyareas_by_zip,avg_income,Breed,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value,expectancy_value,average_weight,size_cat,lifetime_cost
0,0,Bronx,2.750349,3.0,64590.15519,Boxer,0.4,0.4,0.8,0.8,1.0,0.6,21.545638,Small 9-35lb,Low
1,1,Manhattan,1.886575,1.0,407561.3494,Maltese,0.8,0.2,0.6,0.6,1.0,0.8,3.175147,Toy >9lb,Medium
2,2,Manhattan,1.886575,2.0,269713.3825,Yorkshire Terrier,1.0,0.2,0.6,0.2,0.8,0.8,3.175147,Toy >9lb,Medium
3,3,Brooklyn,2.551367,2.0,32736.8459,German Shepherd Dog,0.4,0.8,0.6,1.0,0.6,0.4,31.751466,Small 9-35lb,Low
4,4,Brooklyn,2.551367,1.0,115964.9035,Cavalier King Charles Spaniel,0.4,0.4,0.4,1.0,0.8,0.8,7.030682,Toy >9lb,Medium


In [4]:
# Summarize the frame: row count, column names, non-null counts and dtypes
doggy_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275755 entries, 0 to 275754
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                275755 non-null  int64  
 1   Borough                   275755 non-null  object 
 2   humans/household          275755 non-null  float64
 3   dfriendlyareas_by_zip     275755 non-null  float64
 4   avg_income                275755 non-null  float64
 5   Breed                     275755 non-null  object 
 6   grooming_frequency_value  275755 non-null  float64
 7   shedding_value            275755 non-null  float64
 8   energy_level_value        275755 non-null  float64
 9   trainability_value        275755 non-null  float64
 10  demeanor_value            275755 non-null  float64
 11  expectancy_value          275755 non-null  float64
 12  average_weight            275755 non-null  float64
 13  size_cat                  275755 non-null  o

In [6]:
# Keep only the columns used downstream. This drops the leftover
# 'Unnamed: 0' index column plus 'humans/household' and 'average_weight'.
kept_columns = [
    'Borough', 'dfriendlyareas_by_zip', 'avg_income', 'Breed',
    'grooming_frequency_value', 'shedding_value', 'energy_level_value',
    'trainability_value', 'demeanor_value', 'expectancy_value',
    'size_cat', 'lifetime_cost',
]

# .copy() makes the subset an independent frame, so columns added to it
# later cannot trigger pandas' SettingWithCopyWarning.
doggy_raw_df = doggy_raw_df[kept_columns].copy()

doggy_raw_df.columns

Index(['Borough', 'dfriendlyareas_by_zip', 'avg_income', 'Breed',
       'grooming_frequency_value', 'shedding_value', 'energy_level_value',
       'trainability_value', 'demeanor_value', 'expectancy_value', 'size_cat',
       'lifetime_cost'],
      dtype='object')

In [8]:
# Rename columns with an explicit old -> new mapping rather than a positional
# list, so a change in column order upstream cannot silently mislabel data.
# 'Borough', 'Breed' and 'lifetime_cost' keep their names and are not listed.
column_renames = {
    'dfriendlyareas_by_zip': 'dog_friendly_area',
    'avg_income': 'income_area',
    'grooming_frequency_value': 'grooming_frequency',
    'shedding_value': 'shedding',
    'energy_level_value': 'energy_level',
    'trainability_value': 'trainability',
    'demeanor_value': 'demeanor',
    'expectancy_value': 'life_expectancy',
    'size_cat': 'size',
}
doggy_raw_df = doggy_raw_df.rename(columns=column_renames)

doggy_raw_df.head(3)

Unnamed: 0,Borough,dog_friendly_area,income_area,Breed,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,size,lifetime_cost
0,Bronx,3.0,64590.15519,Boxer,0.4,0.4,0.8,0.8,1.0,0.6,Small 9-35lb,Low
1,Manhattan,1.0,407561.3494,Maltese,0.8,0.2,0.6,0.6,1.0,0.8,Toy >9lb,Medium
2,Manhattan,2.0,269713.3825,Yorkshire Terrier,1.0,0.2,0.6,0.2,0.8,0.8,Toy >9lb,Medium


In [12]:
# Convert income_area from numerical to categorical ('low' / 'middle' / 'high')
LOW_INCOME_CEILING = 50000      # incomes in [0, 50000) are 'low'
MIDDLE_INCOME_CEILING = 100000  # incomes in [50000, 100000) are 'middle'

income_area = doggy_raw_df['income_area']

# Vectorized bucketing: one boolean mask per category, evaluated in order.
# Anything matching no mask (negative or NaN income) falls back to 'Other'.
income_conditions = [
    (income_area >= 0) & (income_area < LOW_INCOME_CEILING),
    (income_area >= LOW_INCOME_CEILING) & (income_area < MIDDLE_INCOME_CEILING),
    income_area >= MIDDLE_INCOME_CEILING,
]
income_labels = ['low', 'middle', 'high']
categorized_income = np.select(income_conditions, income_labels, default='Other')

# Sanity check: one category per input row
print(len(income_area), len(categorized_income))

doggy_raw_df['income_cat'] = categorized_income

doggy_raw_df[['income_area', 'income_cat']].head(20)

275755 275755


Unnamed: 0,income_area,income_cat
0,64590.15519,middle
1,407561.3494,high
2,269713.3825,high
3,32736.8459,low
4,115964.9035,high
5,459611.1391,high
6,80795.01054,middle
7,81070.33657,middle
8,67897.0631,middle
9,173862.6105,high


In [14]:
# Drop the numeric 'income_area' column now that 'income_cat' replaces it.
# Reassign instead of using inplace=True so the frame is not mutated in place.
doggy_raw_df = doggy_raw_df.drop(columns='income_area')

doggy_raw_df.head(1)

Unnamed: 0,Borough,dog_friendly_area,Breed,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,size,lifetime_cost,income_cat
0,Bronx,3.0,Boxer,0.4,0.4,0.8,0.8,1.0,0.6,Small 9-35lb,Low,middle


In [17]:
# Categorize 'dog_friendly_area' from numerical into a 'Yes' / 'No' flag
friendly_counts = doggy_raw_df['dog_friendly_area']

# Vectorized: values below 1 -> 'No', everything else -> 'Yes'.
# NOTE: NaN < 1 evaluates to False, so a missing value would be labelled 'Yes'.
friendly_area = np.where(friendly_counts < 1, 'No', 'Yes')

# Sanity check: one flag per input row
print(len(friendly_counts), len(friendly_area))

doggy_raw_df['dog_friendly'] = friendly_area

# Fixed seed so the spot-check shows the same rows on every run
doggy_raw_df[['dog_friendly_area', 'dog_friendly']].sample(20, random_state=42)

275755 275755


Unnamed: 0,dog_friendly_area,dog_friendly
36322,2.0,Yes
67244,1.0,Yes
99260,1.0,Yes
636,2.0,Yes
106964,2.0,Yes
191836,0.0,No
273179,1.0,Yes
176942,2.0,Yes
42811,4.0,Yes
66779,0.0,No


In [18]:
# Drop the numeric 'dog_friendly_area' column now that 'dog_friendly' replaces it.
# Reassign instead of using inplace=True so the frame is not mutated in place.
doggy_raw_df = doggy_raw_df.drop(columns='dog_friendly_area')

doggy_raw_df.head(1)

Unnamed: 0,Borough,Breed,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,size,lifetime_cost,income_cat,dog_friendly
0,Bronx,Boxer,0.4,0.4,0.8,0.8,1.0,0.6,Small 9-35lb,Low,middle,Yes


In [19]:
# List the current column names
doggy_raw_df.columns

Index(['Borough', 'Breed', 'grooming_frequency', 'shedding', 'energy_level',
       'trainability', 'demeanor', 'life_expectancy', 'size', 'lifetime_cost',
       'income_cat', 'dog_friendly'],
      dtype='object')

In [20]:
# Reorder the columns, with 'Breed' placed last
column_order = [
    'Borough', 'dog_friendly', 'income_cat',
    'grooming_frequency', 'shedding', 'energy_level',
    'trainability', 'demeanor', 'size',
    'life_expectancy', 'lifetime_cost', 'Breed',
]
doggy_raw_df = doggy_raw_df[column_order]

doggy_raw_df.head(1)

Unnamed: 0,Borough,dog_friendly,income_cat,grooming_frequency,shedding,energy_level,trainability,demeanor,size,life_expectancy,lifetime_cost,Breed
0,Bronx,Yes,middle,0.4,0.4,0.8,0.8,1.0,Small 9-35lb,0.6,Low,Boxer
