# **OtterBots Group Project**

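Data analysis of dog breeds, combining AKC breed size data (height and weight), obedience/intelligence scores, and AKC breed groups.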



---


## **Import Libraries**

In [None]:
# import packages
import numpy as np
import pandas as pd



---


## **Pre-Processing Data**



*   Create dataframes from the csv files
*   Fix any issues with the dataframes for use in this project

Source of the CSV files:

[Kaggle - Canine Intelligence and Size](https://www.kaggle.com/datasets/thedevastator/canine-intelligence-and-size?select=AKC+Breed+Info.csv)

[GitHub - akcdata by tmfilho](https://github.com/tmfilho/akcdata)


In [None]:
# Raw-file URLs of the three CSVs, all served from the same repository branch
DATA_REPO_URL = "https://raw.githubusercontent.com/OtterBots/DoggieData/main"
breed_info_url = f"{DATA_REPO_URL}/AKC_Breed_Info.csv"
intel_url = f"{DATA_REPO_URL}/dog_intelligence.csv"
groups_url = f"{DATA_REPO_URL}/akc-data-latest.csv"

In [None]:
# create dataframes
# df1 and df2: the first CSV column is used as the row index
df1 = pd.read_csv(breed_info_url, index_col=0)
df2 = pd.read_csv(intel_url, index_col=0)
# df3: read without index_col; its 'Unnamed: 0' column is renamed to 'Breed' afterwards
df3 = pd.read_csv(groups_url)

In [None]:
# Label df3's 'Unnamed: 0' column as 'Breed', then keep only the two columns this project uses
df3 = (
    df3
    .rename(columns={'Unnamed: 0': 'Breed'})
    .loc[:, ['Breed', 'group']]
)

*   Rename entries where a dog breed has many different spellings across the dataframes

In [None]:
# Fix different spellings of the same breed to increase useable entries.
#
# Every pattern below is a plain literal and is applied with regex=False.
# The previous version escaped parentheses as a regular expression and relied
# on the default of Series.str.replace, which changed from regex=True to
# regex=False in pandas 2.0 -- on newer pandas the escaped patterns silently
# matched nothing, so those breeds were lost in the inner merge on 'Breed'.

df1_spelling_fixes = {
    # Mismatched spelling of breeds that are affected by merger of df1 and df2 = dd
    'Airdale Terrier': 'Airedale Terrier',
    'Cocker Spaniel-American': 'Cocker Spaniel',
    'Cocker Spaniel-English': 'English Cocker Spaniel',
    'Collie (Rough) & (Smooth)': 'Collie',
    'Old English Sheepdog (Bobtail)': 'Old English Sheepdog',
    'Shetland Sheepdog (Sheltie)': 'Shetland Sheepdog',
    # Breeds affected by merger of dd and df3 = dd
    'Chinese Shar Pei': 'Chinese Shar-Pei',
    'Curly Coated Retriever': 'Curly-Coated Retriever',
    'Flat Coated Retriever': 'Flat-Coated Retriever',
    'Otter Hound': 'Otterhound',
    'Soft-Coated Wheaten Terrier': 'Soft Coated Wheaten Terrier',
    # Breeds that don't make an impact since they are not in df2
    'Anatolin Sheepdog': 'Anatolian Shepherd Dog',
}

df2_spelling_fixes = {
    'Chinese Shar Pei': 'Chinese Shar-Pei',
    'Curly Coated Retriever': 'Curly-Coated Retriever',
    'Soft-coated Wheaten Terrier': 'Soft Coated Wheaten Terrier',
}

# Apply the fixes in the order listed (literal substring replacement, as before)
for old_name, new_name in df1_spelling_fixes.items():
    df1['Breed'] = df1['Breed'].str.replace(old_name, new_name, regex=False)

for old_name, new_name in df2_spelling_fixes.items():
    df2['Breed'] = df2['Breed'].str.replace(old_name, new_name, regex=False)

*   Merge the dataframes on the 'Breed' column

In [None]:
# Join the three frames on 'Breed' (default inner join: only breeds present in all three remain)
dd = df1.merge(df2, on='Breed').merge(df3, on='Breed')



*   Replace all 'na' and 'not found' entries with NaN



In [None]:
# Turn both missing-value placeholders ('na' and 'not found') into NaN in one pass
dd = dd.replace({'na': np.nan, 'not found': np.nan})



*   Drop entry ('Alaskan Malamute') because it is missing height and weight data



In [None]:
# Remove rows with no 'height_low_inches' value and renumber the index from 0
dd = dd.dropna(subset=['height_low_inches']).reset_index(drop=True)



*   Convert height and weight columns to floats



In [None]:
# Cast each height/weight column to float, going through str first as before
for size_col in ('height_low_inches', 'height_high_inches',
                 'weight_low_lbs', 'weight_high_lbs'):
    dd[size_col] = dd[size_col].astype(str).astype(float)



*   Remove '%' from the 'obey' column
*   Convert 'obey' column to floats



In [None]:
# Strip the '%' character from 'obey', then convert the remaining text to float
dd['obey'] = dd['obey'].str.replace('%', '').astype(float)

In [None]:
# View summary information for the dataframe to be used
dd.info()

In [None]:
# TODO: decide how to handle the rows where 'obey' is NaN
# Display the rows whose 'obey' value is missing
dd.loc[dd['obey'].isna()]

### **Adding Features**


*   Average weight
*   Average height
*   Average reps
*   Height to weight ratio





In [None]:
# Append the derived columns: midpoints of each low/high pair, plus height per unit weight
dd = dd.assign(
    weight_avg=(dd['weight_high_lbs'] + dd['weight_low_lbs']) / 2,
    height_avg=(dd['height_high_inches'] + dd['height_low_inches']) / 2,
    reps_avg=(dd['reps_lower'] + dd['reps_upper']) / 2,
    # computed from the two averages created just above
    height_to_weight=lambda frame: frame['height_avg'] / frame['weight_avg'],
)



---


## **Data Exploration**