In [43]:
import pandas as pd
import numpy as np

In [2]:
dogs=pd.read_csv('/content/drive/MyDrive/DataCamp/Python Track/Categorical Data/ShelterDogs1.csv')
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,1664,Nikó,0.67,male,"Labrador Retriever, Poodle Mix",11/18/19,11/19/19,7/15/19,white,long,medium,no,,yes,maybe,yes,yes,,flat
1,1652,Dabasi Baltazár,3.33,male,Komondor,6/29/19,7/15/19,07-03-19,white,long,large,yes,,yes,maybe,no,no,no,both flat and garden
2,136,Colin,8.5,male,Scotch Collie,05-11-19,05-11-19,5/25/19,tricolor,long,medium,yes,,yes,maybe,yes,yes,no,both flat and garden
3,1169,Borzas,2.58,male,Unknown Mix,12-12-18,12-12-18,05-02-19,gray,long,medium,,yes,yes,maybe,yes,yes,,flat
4,3362,Örs,7.08,male,Puli Mix,11/22/16,11/22/16,2/24/19,black,long,medium,yes,,yes,maybe,yes,yes,,garden


In [3]:
dogs.columns

Index(['ID', 'name', 'age', 'sex', 'breed', 'date_found', 'adoptable_from',
       'posted', 'color', 'coat', 'size', 'neutered', 'housebroken',
       'likes_people', 'likes_children', 'get_along_males',
       'get_along_females', 'get_along_cats', 'keep_in'],
      dtype='object')

In [4]:
dogs.shape

(2937, 19)

# **Renaming categorie**s


---

We will work with a copy of the dogs dataframe. Bacause we can't undo once it is changed.

In [5]:
dogs2=dogs.copy()

In [6]:
print(dogs2['likes_children'].value_counts(dropna=False))

NaN       1718
maybe      596
Maybe?     357
yes        219
no          47
Name: likes_children, dtype: int64


In [7]:
print(dogs2['likes_children'].dtype)

object


### **Update NaN values to 'no'**
We are filling the blank rows of the column with 'no'. So, it will merge with the existing 'no'.

In [8]:
dogs2.loc[dogs2['likes_children'].isna(),'likes_children']='no'
print(dogs2['likes_children'].value_counts(dropna=False))

no        1765
maybe      596
Maybe?     357
yes        219
Name: likes_children, dtype: int64


### **Set "maybe" to be "no"**

In [9]:
dogs2.loc[dogs2['likes_children']=='maybe','likes_children']='no'
print(dogs2['likes_children'].value_counts(dropna=False))

no        2361
Maybe?     357
yes        219
Name: likes_children, dtype: int64


### **We can also rename 'Maybe?' to 'maybe' with this above code. But now we will try a different code below.**

### **Rename 'Maybe?' to 'maybe'**
When we use `cat.category` method, we must need to convert the column into '`category`' from '`object`'. In this method, we need to use a unique name all time. It Cannot collapse two categories into one. As example, If 'maybe' already exist, we can't use it for 2nd time. or we can't set 'maybe' for both 'yes' and 'no'.

In [10]:
my_changes={'Maybe?':'maybe'}
dogs2['likes_children']=dogs2['likes_children'].astype('category')    #converting to category from object
dogs2['likes_children']=dogs2['likes_children'].cat.rename_categories(my_changes)
print(dogs2['likes_children'].value_counts(dropna=False))

no       2361
maybe     357
yes       219
Name: likes_children, dtype: int64


### **Use a lambda function to convert all categories to uppercase using upper()**

In [11]:
dogs2["likes_children"] =  dogs2["likes_children"].cat.rename_categories(lambda c: c.upper())
print(dogs2['likes_children'].value_counts(dropna=False))
print(dogs2['likes_children'].dtype)

NO       2361
MAYBE     357
YES       219
Name: likes_children, dtype: int64
category


### **Print the list of categories**

In [12]:
print(dogs2['likes_children'].cat.categories)

Index(['MAYBE', 'NO', 'YES'], dtype='object')


# **Collapsing categories**


---
In this method, we can set a same name for multiple category at a same time. We don't face the problem of `cat.category` method. 


In [13]:
print(dogs2['coat'].value_counts(dropna=False))

short          1972
medium          562
wirehaired      220
long            180
medium-long       3
Name: coat, dtype: int64


Update 'wirehaired' and 'medium-long' to 'medium'.

In [14]:
update_coats={'wirehaired':'medium','medium-long':'medium'}
dogs2['coat'] = dogs2['coat'].replace(update_coats)
print(dogs2['coat'].value_counts())
print(dogs2['coat'].dtype)

short     1972
medium     785
long       180
Name: coat, dtype: int64
object


# **Reordering categories**


---



In [15]:
# Print out the current categories of the size variable
dogs2['size']=dogs2['size'].astype('category')
print(dogs2['size'].cat.categories)

Index(['large', 'medium', 'small'], dtype='object')


In [16]:
# Reorder the categories, specifying the Series is ordinal, and overwriting the original series
dogs2["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True)

0       medium
1        large
2       medium
3       medium
4       medium
         ...  
2932     small
2933     small
2934    medium
2935    medium
2936    medium
Name: size, Length: 2937, dtype: category
Categories (3, object): ['small' < 'medium' < 'large']

In [17]:
# How many Male/Female dogs are available of each size?
print(dogs2.groupby('size')['sex'].value_counts())

size    sex    
large   male        331
        female      188
medium  male       1083
        female      852
         MALE         6
         FEMALE       2
        Malez         1
small   male        256
        female      211
         MALE         4
         FEMALE       3
Name: sex, dtype: int64


In [18]:
# Do larger dogs need more room to roam?
print(dogs2.groupby('size')['keep_in'].value_counts())

size    keep_in             
large   both flat and garden    191
        garden                  172
        flat                      5
medium  both flat and garden    795
        garden                  317
        flat                     97
small   both flat and garden    238
        flat                     80
        garden                   21
Name: keep_in, dtype: int64


# **Cleaning variables**


---

Users of an online entry system used to have the ability to freely type in responses to questions. This is causing issues when trying to analyze the adoptable dogs dataset, dogs. Here is the current frequency table of the "sex" column:

In [19]:
print(dogs2["sex"].value_counts(dropna=False))

male       1670
female     1251
 MALE        10
 FEMALE       5
Malez         1
Name: sex, dtype: int64


Now that the system only takes responses of "female" and "male", you want this variable to match the updated system.

In [20]:
# Fix the misspelled word
replace_map = {"Malez": "male"}

# Update the sex column using the created map
dogs2["sex"] = dogs2["sex"].replace(replace_map)
print(dogs2["sex"].value_counts())

male       1671
female     1251
 MALE        10
 FEMALE       5
Name: sex, dtype: int64


In [21]:
# Strip away leading whitespace
dogs2["sex"] = dogs2["sex"].str.strip()
print(dogs2["sex"].value_counts())

male      1671
female    1251
MALE        10
FEMALE       5
Name: sex, dtype: int64


In [22]:
# Make all responses lowercase
dogs2["sex"] = dogs2["sex"].str.lower()
print(dogs2["sex"].value_counts())

male      1681
female    1256
Name: sex, dtype: int64


In [23]:
# Convert to a categorical Series
dogs2["sex"] = dogs2['sex'].astype('category')
print(dogs2['sex'].dtype)

category


# **Accessing and filtering data**


---



In [24]:
# Print the category of the coat for ID 23807
print(dogs2.loc[dogs2['ID']==23807, "coat"])

2715    short
Name: coat, dtype: object


In [25]:
# Find the count of male and female dogs who have a "long" coat
print(dogs2.loc[dogs2['coat']=='long', 'sex'].value_counts())

male      124
female     56
Name: sex, dtype: int64


In [26]:
# Print the mean age of dogs with a breed of "English Cocker Spaniel"
print(dogs2.loc[dogs2["breed"] == "English Cocker Spaniel", "age"].mean())

8.186153846153847


In [27]:
# Count the number of dogs that have "English" in their breed name
print(dogs2[dogs2["breed"].str.contains('English', regex=False)].shape[0])

35


# **Create a label encoding and map**


---



In [31]:
print(dogs2['color'].value_counts())

black              620
brown              293
black and brown    287
yellow-brown       212
black and white    190
black and tan      155
apricot            142
white              124
brown and white    124
sable              118
tricolor            99
saddle back         91
striped             89
gray                64
yellow              63
golden              61
red                 53
spotty              47
gray and white      35
gray and black      28
wild boar           18
red and white       14
dotted              10
Name: color, dtype: int64


### **Below code is automatic process to create codes for individuals category.**

In [35]:
dogs2['color']=dogs2['color'].astype('category')

# Create codes and categories objects
codes = dogs2['color'].cat.codes
categories = dogs2['color']
color_map = dict(zip(categories,codes))
sorted_keys = sorted(color_map.keys())

# Print the color_map dictionary with each key-value pair in a separate line
for key in sorted_keys:
    print(f'{key}: {color_map[key]}')

apricot: 0
black: 1
black and brown: 2
black and tan: 3
black and white: 4
brown: 5
brown and white: 6
dotted: 7
golden: 8
gray: 9
gray and black: 10
gray and white: 11
red: 12
red and white: 13
sable: 14
saddle back: 15
spotty: 16
striped: 17
tricolor: 18
white: 19
wild boar: 20
yellow: 21
yellow-brown: 22


In [42]:
#adding codes in column
dogs2['code of color']=codes
print(dogs2['code of color'].value_counts())

1     620
5     293
2     287
22    212
4     190
3     155
0     142
19    124
6     124
14    118
18     99
15     91
17     89
9      64
21     63
8      61
12     53
16     47
11     35
10     28
20     18
13     14
7      10
Name: code of color, dtype: int64


### **Now, How to define code manually for each category?**

In [41]:
# Define a dictionary to map categories to desired codes
colors_map={'apricot':1,'black': 2,'black and brown': 3,'black and tan': 4,
            'black and white': 5,'brown': 6,'brown and white': 7,'dotted': 8,
            'golden': 9,'gray': 10,'gray and black': 11,'gray and white': 12,
            'red': 13,'red and white': 14,'sable': 15,'saddle back': 16,
            'spotty': 17,'striped': 18,'tricolor': 19,'white': 20,
            'wild boar': 21,'yellow': 22,'yellow-brown': 23}

dogs2['color_code']=dogs2['color'].map(colors_map)
print(dogs2['color_code'].value_counts())

2     620
6     293
3     287
23    212
5     190
4     155
1     142
7     124
20    124
15    118
19     99
16     91
18     89
10     64
22     63
9      61
13     53
17     47
12     35
11     28
21     18
14     14
8      10
Name: color_code, dtype: int64


# **Creating a Boolean encoding**


---



In [46]:
# Create a Boolean column for the most common color name
dogs2['is_red']=np.where(dogs2['color'].str.contains('red'),True,False)
print(dogs2['is_red'].value_counts())

False    2870
True       67
Name: is_red, dtype: int64


This will show every red word included. like 'red', 'red and white' etc.

In [51]:
dogs2['is_black']=np.where(dogs2['color'].str.contains('black'),1,0)    #0=False
print(dogs2['is_black'].value_counts())

0    1657
1    1280
Name: is_black, dtype: int64


### **Find the memory useage.**

In [47]:
print(dogs2['is_red'].nbytes)

2937
