# Categorical data in pandas

In [5]:
import pandas as pd 

In [6]:
adult = pd.read_csv('adult.csv')

In [7]:
# Check the dtypes
print(adult.dtypes)

# Create a dictionary with column names as keys and "category" as values
adult_dtypes = {
   "Workclass": "category",
   "Education": "category",
   "Relationship": "category",
   "Above/Below 50k": "category" 
}

# Read in the CSV using the dtypes parameter
adult2 = pd.read_csv(
  "adult.csv",
  dtype=adult_dtypes)

print(adult2.dtypes)

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object
Age                   int64
Workclass          category
fnlgwt                int64
Education          category
Education Num         int64
Marital Status       object
Occupation           object
Relationship       category
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k    category
dtype: object


In [8]:
adult.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
adult.isna().sum()

Age                0
Workclass          0
fnlgwt             0
Education          0
Education Num      0
Marital Status     0
Occupation         0
Relationship       0
Race               0
Sex                0
Capital Gain       0
Capital Loss       0
Hours/Week         0
Country            0
Above/Below 50k    0
dtype: int64

In [10]:
adult.shape

(32561, 15)

In [11]:
adult["Marital Status"].value_counts(dropna=False)

Marital Status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [12]:
print(adult['Above/Below 50k'].describe())

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object


In [13]:
my_data = ["A", "A", "C", "B", "C", "A"]

In [14]:
my_series2 = pd.Categorical(my_data, categories=["C", "B", "A"], ordered=True)

## Grouping data by category in pandas

In [15]:
adult

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [16]:
adult.groupby("Above/Below 50k")["Age"].mean()

Above/Below 50k
<=50K    36.783738
>50K     44.249841
Name: Age, dtype: float64

In [17]:
adult.groupby("Above/Below 50k")[['Age', 'Education Num']].sum()

Unnamed: 0_level_0,Age,Education Num
Above/Below 50k,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,909294,237190
>50K,346963,91047


In [18]:
adult.groupby(by=['Above/Below 50k', 'Education']).size()

Above/Below 50k  Education   
<=50K            10th             871
                 11th            1115
                 12th             400
                 1st-4th          162
                 5th-6th          317
                 7th-8th          606
                 9th              487
                 Assoc-acdm       802
                 Assoc-voc       1021
                 Bachelors       3134
                 Doctorate        107
                 HS-grad         8826
                 Masters          764
                 Preschool         51
                 Prof-school      153
                 Some-college    5904
>50K             10th              62
                 11th              60
                 12th              33
                 1st-4th            6
                 5th-6th           16
                 7th-8th           40
                 9th               27
                 Assoc-acdm       265
                 Assoc-voc        361
                 Bac

In [19]:
adult.nunique()

Age                   73
Workclass              9
fnlgwt             21648
Education             16
Education Num         16
Marital Status         7
Occupation            15
Relationship           6
Race                   5
Sex                    2
Capital Gain         119
Capital Loss          92
Hours/Week            94
Country               42
Above/Below 50k        2
dtype: int64

In [20]:

gb = adult.groupby(by=[ "Workclass",
                        "Above/Below 50k", 
                        "Education"]).count()



In [21]:
gb

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,fnlgwt,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country
Workclass,Above/Below 50k,Education,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
?,<=50K,10th,98,98,98,98,98,98,98,98,98,98,98,98
?,<=50K,11th,118,118,118,118,118,118,118,118,118,118,118,118
?,<=50K,12th,38,38,38,38,38,38,38,38,38,38,38,38
?,<=50K,1st-4th,12,12,12,12,12,12,12,12,12,12,12,12
?,<=50K,5th-6th,28,28,28,28,28,28,28,28,28,28,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
State-gov,>50K,Some-college,31,31,31,31,31,31,31,31,31,31,31,31
Without-pay,<=50K,7th-8th,1,1,1,1,1,1,1,1,1,1,1,1
Without-pay,<=50K,Assoc-acdm,1,1,1,1,1,1,1,1,1,1,1,1
Without-pay,<=50K,HS-grad,9,9,9,9,9,9,9,9,9,9,9,9


In [22]:
# Group the adult dataset by "Sex" and "Above/Below 50k"
gb2= adult.groupby(by=["Sex", "Above/Below 50k"])

# Print out how many rows are in each created group
print(gb2.size())



Sex     Above/Below 50k
Female  <=50K               9592
        >50K                1179
Male    <=50K              15128
        >50K                6662
dtype: int64


In [23]:
# Print out the mean of each group for numeric columns only
print(gb2.mean(numeric_only=True))

                              Age         fnlgwt  Education Num  Capital Gain  \
Sex    Above/Below 50k                                                          
Female <=50K            36.210801  185999.381359       9.820475    121.986134   
       >50K             42.125530  183687.406277      11.787108   4200.389313   
Male   <=50K            37.147012  193093.609268       9.452142    165.723823   
       >50K             44.625788  188769.101321      11.580606   3971.765836   

                        Capital Loss  Hours/Week  
Sex    Above/Below 50k                            
Female <=50K               47.364470   35.916701  
       >50K               173.648855   40.426633  
Male   <=50K               56.806782   40.693879  
       >50K               198.780396   46.366106  


In [24]:
# Create a list of user-selected variables
user_list = ["Education", "Above/Below 50k"]

# Create a GroupBy object using this list
gb = adult.groupby(by=user_list)

# Find the mean for the variable "Hours/Week" for each group - Be efficient!
print(gb["Hours/Week"].mean())

Education     Above/Below 50k
10th          <=50K              36.574053
              >50K               43.774194
11th          <=50K              33.322870
              >50K               45.133333
12th          <=50K              35.035000
              >50K               44.818182
1st-4th       <=50K              37.864198
              >50K               48.833333
5th-6th       <=50K              38.539432
              >50K               46.000000
7th-8th       <=50K              38.830033
              >50K               47.500000
9th           <=50K              37.667351
              >50K               44.851852
Assoc-acdm    <=50K              39.264339
              >50K               44.256604
Assoc-voc     <=50K              40.817826
              >50K               43.853186
Bachelors     <=50K              40.586152
              >50K               45.475462
Doctorate     <=50K              45.429907
              >50K               47.513072
HS-grad       <=50K     

In [25]:
df = pd.DataFrame({
    'Animal': ['Dog', 'Cat', 'Dog', 'Cat'],
    'Speed': [40, 30, 35, 25]
})
print(df)
print()


for animal, group in df.groupby('Animal'):
    print(f"Animal: {animal}")
    print()
    print(group)

  Animal  Speed
0    Dog     40
1    Cat     30
2    Dog     35
3    Cat     25

Animal: Cat

  Animal  Speed
1    Cat     30
3    Cat     25
Animal: Dog

  Animal  Speed
0    Dog     40
2    Dog     35


## Setting category variables

In [26]:
dogs = pd.read_csv('ShelterDogs.csv')

In [27]:
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,


In [28]:
dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2937 non-null   int64  
 1   name               2845 non-null   object 
 2   age                2937 non-null   float64
 3   sex                2937 non-null   object 
 4   breed              2937 non-null   object 
 5   date_found         2937 non-null   object 
 6   adoptable_from     2937 non-null   object 
 7   posted             2937 non-null   object 
 8   color              2937 non-null   object 
 9   coat               2937 non-null   object 
 10  size               2937 non-null   object 
 11  neutered           1852 non-null   object 
 12  housebroken        460 non-null    object 
 13  likes_people       1999 non-null   object 
 14  likes_children     1219 non-null   object 
 15  get_along_males    1633 non-null   object 
 16  get_along_females  1673 

In [29]:
dogs.nunique()

ID                   1694
name                 2016
age                   233
sex                     2
breed                 277
date_found           1583
adoptable_from       1578
posted               1167
color                  23
coat                    4
size                    3
neutered                2
housebroken             2
likes_people            2
likes_children          2
get_along_males         2
get_along_females       2
get_along_cats          2
keep_in                 3
dtype: int64

In [30]:
dogs["coat"] = dogs["coat"].astype("category")
dogs["coat"].value_counts(dropna=False)

coat
short         1972
medium         565
wirehaired     220
long           180
Name: count, dtype: int64

In [31]:
dogs["coat"].dtypes

CategoricalDtype(categories=['long', 'medium', 'short', 'wirehaired'], ordered=False, categories_dtype=object)

In [32]:
dogs["coat"] = dogs["coat"].cat.set_categories(
new_categories=["short", "medium", "long"]
)

In [33]:
dogs["coat"].value_counts(dropna=False)

coat
short     1972
medium     565
NaN        220
long       180
Name: count, dtype: int64

In [34]:
dogs["coat"] = dogs["coat"].cat.set_categories(
new_categories=["short", "medium", "long"],
ordered=True
)
dogs["coat"].head(10)

0     short
1     short
2     short
3    medium
4     short
5     short
6     short
7     short
8    medium
9     short
Name: coat, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

In [35]:
dogs['likes_people'].value_counts(dropna=False)

likes_people
yes    1991
NaN     938
no        8
Name: count, dtype: int64

In [36]:
dogs['likes_people'] = dogs['likes_people'].astype("category")

In [37]:
dogs['likes_people'].dtypes

CategoricalDtype(categories=['no', 'yes'], ordered=False, categories_dtype=object)

In [38]:
dogs['likes_people'] = dogs['likes_people'].cat.add_categories(
   new_categories = ["Present", "Non_Present"])

In [39]:
dogs['likes_people'].cat.categories

Index(['no', 'yes', 'Present', 'Non_Present'], dtype='object')

In [40]:
dogs['likes_people'].value_counts()

likes_people
yes            1991
no                8
Present           0
Non_Present       0
Name: count, dtype: int64

In [41]:
dogs['likes_people'].cat.ordered

False

In [42]:
import pandas as pd

data = {
    'product': ['Laptop', 'Tablet', 'Phone', 'Tablet', 'Laptop', 'Monitor', 'Phone', 'Laptop'],
    'rating': ['High', 'Medium', 'Low', 'High', 'Low', 'Medium', 'Medium', 'High']
}

df = pd.DataFrame(data)

# Convert 'rating' column to ordered categorical
df['rating'] = pd.Categorical(df['rating'], categories=['Low', 'Medium', 'High'], ordered=True)



In [43]:
df

Unnamed: 0,product,rating
0,Laptop,High
1,Tablet,Medium
2,Phone,Low
3,Tablet,High
4,Laptop,Low
5,Monitor,Medium
6,Phone,Medium
7,Laptop,High


In [44]:
dogs

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.00,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,
3,23795,,1.00,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.00,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,Fodri,16.92,male,Unknown Mix,12/25/03,12/25/03,3/22/06,yellow-brown,short,medium,no,no,yes,yes,no,yes,no,garden
2933,262,Csibi,17.33,female,Staffordshire Terrier Mix,8/27/04,8/27/04,7/8/05,striped,short,large,yes,,,,,,,
2934,4,Konrád,18.17,male,Unknown Mix,9/21/05,9/21/05,10/26/05,black,short,medium,no,,,,,,,
2935,141,Kölni,17.17,male,Unknown Mix,11/27/04,11/27/04,5/2/05,black and brown,medium,medium,no,,,,,,,


In [45]:
# Check frequency counts while also printing the NaN count
from encodings.punycode import T


print(dogs["keep_in"].value_counts(dropna=False, normalize=True))

keep_in
both flat and garden    0.416752
NaN                     0.347634
garden                  0.173647
flat                    0.061968
Name: proportion, dtype: float64


## Renaming Categories 

In [46]:
dogs['breed'] = dogs['breed'].astype('category')

dogs['breed'].value_counts()


breed
Unknown Mix                                 1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: count, Length: 277, dtype: int64

In [47]:
dogs['breed'].values

['Unknown Mix', 'Unknown Mix', 'Unknown Mix', 'Unknown Mix', 'French Bulldog Mix', ..., 'Unknown Mix', 'Staffordshire Terrier Mix', 'Unknown Mix', 'Unknown Mix', 'German Shepherd Dog, Hound Mix']
Length: 2937
Categories (277, object): ['Adoptable From:', 'Afghan Hound', 'Akita', 'Akita Mix', ..., 'Welsh Terrier Mix', 'West Highland White Terrier Mix', 'Whippet Mix', 'Yorkshire Terrier']

In [48]:
# make a dictionary 
my_changes = {'Unknown Mix': 'not specify'}

In [49]:
# rename the catego
dogs['breed']= dogs['breed'].cat.rename_categories(my_changes)


In [50]:
dogs['breed']

0                          not specify
1                          not specify
2                          not specify
3                          not specify
4                   French Bulldog Mix
                     ...              
2932                       not specify
2933         Staffordshire Terrier Mix
2934                       not specify
2935                       not specify
2936    German Shepherd Dog, Hound Mix
Name: breed, Length: 2937, dtype: category
Categories (277, object): ['Adoptable From:', 'Afghan Hound', 'Akita', 'Akita Mix', ..., 'Welsh Terrier Mix', 'West Highland White Terrier Mix', 'Whippet Mix', 'Yorkshire Terrier']

In [51]:
from pandas import value_counts


dogs['breed'].value_counts()

breed
not specify                                 1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: count, Length: 277, dtype: int64

## Collapsing Caregories 


In [52]:
dogs['color'] = dogs['color'].astype('category')


In [53]:
dogs['color'].dtype

CategoricalDtype(categories=['apricot', 'black', 'black and brown', 'black and tan',
                  'black and white', 'brown', 'brown and white', 'dotted',
                  'golden', 'gray', 'gray and black', 'gray and white', 'red',
                  'red and white', 'sable', 'saddle back', 'spotty', 'striped',
                  'tricolor', 'white', 'wild boar', 'yellow', 'yellow-brown'],
, ordered=False, categories_dtype=object)

In [54]:
dogs['color'].cat.categories


Index(['apricot', 'black', 'black and brown', 'black and tan',
       'black and white', 'brown', 'brown and white', 'dotted', 'golden',
       'gray', 'gray and black', 'gray and white', 'red', 'red and white',
       'sable', 'saddle back', 'spotty', 'striped', 'tricolor', 'white',
       'wild boar', 'yellow', 'yellow-brown'],
      dtype='object')

In [55]:
update_colors = { "black and brown": "black", "black and tan": "black", "black and white": "black", }

In [56]:
dogs["main_color"] = dogs["color"].replace(update_colors)

  dogs["main_color"] = dogs["color"].replace(update_colors)


In [57]:
dogs.keys()

Index(['ID', 'name', 'age', 'sex', 'breed', 'date_found', 'adoptable_from',
       'posted', 'color', 'coat', 'size', 'neutered', 'housebroken',
       'likes_people', 'likes_children', 'get_along_males',
       'get_along_females', 'get_along_cats', 'keep_in', 'main_color'],
      dtype='object')

In [58]:
dogs['coat'].value_counts()

coat
short     1972
medium     565
long       180
Name: count, dtype: int64

In [59]:
dogs

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in,main_color
0,23807,Gida,0.25,female,not specify,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,,red
1,533,Frida És Ricsi,0.17,female,not specify,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,,black
2,23793,,4.00,male,not specify,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,,saddle back
3,23795,,1.00,male,not specify,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,,yellow-brown
4,23806,Amy,2.00,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,Fodri,16.92,male,not specify,12/25/03,12/25/03,3/22/06,yellow-brown,short,medium,no,no,yes,yes,no,yes,no,garden,yellow-brown
2933,262,Csibi,17.33,female,Staffordshire Terrier Mix,8/27/04,8/27/04,7/8/05,striped,short,large,yes,,,,,,,,striped
2934,4,Konrád,18.17,male,not specify,9/21/05,9/21/05,10/26/05,black,short,medium,no,,,,,,,,black
2935,141,Kölni,17.17,male,not specify,11/27/04,11/27/04,5/2/05,black and brown,medium,medium,no,,,,,,,,black


In [70]:
# Print out the current categories of the size variable

# Convert size to category type first, since it appears to be an object type
dogs['size'] = dogs['size'].astype("category")

# Now print out the categories
print(dogs['size'].cat.categories)


Index(['large', 'medium', 'small'], dtype='object')


In [71]:
print(dogs["size"].cat.categories)

Index(['large', 'medium', 'small'], dtype='object')


In [72]:

# Reorder the categories using the list provided
dogs["size"] = dogs["size"].cat.reorder_categories(

new_categories = ['small', 'medium', 'large'])

In [73]:
dogs['size'].cat.categories

Index(['small', 'medium', 'large'], dtype='object')