# Categorical data in pandas

In [1]:
import pandas as pd 

In [2]:
# Load the dataset from the CSV file; no dtype overrides are given here,
# so pandas infers the type of every column
adult = pd.read_csv('adult.csv')

In [3]:
# Inspect the dtypes of the frame that was loaded without overrides
print(adult.dtypes)

# Columns to be stored as categoricals, mapped to the "category" dtype
categorical_columns = ["Workclass", "Education", "Relationship", "Above/Below 50k"]
adult_dtypes = dict.fromkeys(categorical_columns, "category")

# Re-read the CSV, applying the overrides while the file is parsed
adult2 = pd.read_csv("adult.csv", dtype=adult_dtypes)

# The four listed columns should now report "category"
print(adult2.dtypes)

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object
Age                   int64
Workclass          category
fnlgwt                int64
Education          category
Education Num         int64
Marital Status       object
Occupation           object
Relationship       category
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k    category
dtype: object


In [3]:
# Preview the first five rows (the default for head())
adult.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Count the missing (NaN) values in each column
adult.isna().sum()

Age                0
Workclass          0
fnlgwt             0
Education          0
Education Num      0
Marital Status     0
Occupation         0
Relationship       0
Race               0
Sex                0
Capital Gain       0
Capital Loss       0
Hours/Week         0
Country            0
Above/Below 50k    0
dtype: int64

In [4]:
# Dimensions of the DataFrame as (number of rows, number of columns)
adult.shape

(32561, 15)

In [5]:
# Frequency of each "Marital Status" value; dropna=False would also list
# missing values as their own row if there were any
adult["Marital Status"].value_counts(dropna=False)

Marital Status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [7]:
# Summarise the column: count, number of unique values, most frequent
# value (top) and how often it occurs (freq)
above_below_summary = adult["Above/Below 50k"].describe()
print(above_below_summary)

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object


In [8]:
# Small list of string labels with repeats (three distinct letters)
my_data = ["A", "A", "C", "B", "C", "A"]

In [None]:
# Ordered categorical with the explicit ordering C < B < A.
# NOTE: despite the variable name, pd.Categorical returns a Categorical
# array, not a pandas Series.
letter_order = pd.CategoricalDtype(categories=["C", "B", "A"], ordered=True)
my_series2 = pd.Categorical(my_data, dtype=letter_order)

## Grouping data by category in pandas

In [5]:
# Display the DataFrame; the notebook renders a truncated preview
# (the first and last rows with an ellipsis row in between)
adult

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# Mean of "Age" within each "Above/Below 50k" group
adult.groupby("Above/Below 50k")["Age"].mean()

Above/Below 50k
<=50K    36.783738
>50K     44.249841
Name: Age, dtype: float64

In [10]:
# Sum of "Age" and "Education Num" within each "Above/Below 50k" group;
# the list inside the brackets selects several columns at once
adult.groupby("Above/Below 50k")[['Age', 'Education Num']].sum()

Unnamed: 0_level_0,Age,Education Num
Above/Below 50k,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,909294,237190
>50K,346963,91047


In [13]:
# Number of rows for every ("Above/Below 50k", "Education") combination
adult.groupby(by=['Above/Below 50k', 'Education']).size()

Above/Below 50k  Education   
<=50K            10th             871
                 11th            1115
                 12th             400
                 1st-4th          162
                 5th-6th          317
                 7th-8th          606
                 9th              487
                 Assoc-acdm       802
                 Assoc-voc       1021
                 Bachelors       3134
                 Doctorate        107
                 HS-grad         8826
                 Masters          764
                 Preschool         51
                 Prof-school      153
                 Some-college    5904
>50K             10th              62
                 11th              60
                 12th              33
                 1st-4th            6
                 5th-6th           16
                 7th-8th           40
                 9th               27
                 Assoc-acdm       265
                 Assoc-voc        361
                 Bac

In [14]:
# Number of distinct values in each column
adult.nunique()

Age                   73
Workclass              9
fnlgwt             21648
Education             16
Education Num         16
Marital Status         7
Occupation            15
Relationship           6
Race                   5
Sex                    2
Capital Gain         119
Capital Loss          92
Hours/Week            94
Country               42
Above/Below 50k        2
dtype: int64

In [26]:

# Count the non-null entries of every remaining column for each
# (Workclass, Above/Below 50k, Education) combination
group_keys = ["Workclass", "Above/Below 50k", "Education"]
gb = adult.groupby(by=group_keys).count()



In [33]:
# Show the per-group count table stored in gb
gb

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,fnlgwt,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country
Workclass,Above/Below 50k,Education,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
?,<=50K,10th,98,98,98,98,98,98,98,98,98,98,98,98
?,<=50K,11th,118,118,118,118,118,118,118,118,118,118,118,118
?,<=50K,12th,38,38,38,38,38,38,38,38,38,38,38,38
?,<=50K,1st-4th,12,12,12,12,12,12,12,12,12,12,12,12
?,<=50K,5th-6th,28,28,28,28,28,28,28,28,28,28,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
State-gov,>50K,Some-college,31,31,31,31,31,31,31,31,31,31,31,31
Without-pay,<=50K,7th-8th,1,1,1,1,1,1,1,1,1,1,1,1
Without-pay,<=50K,Assoc-acdm,1,1,1,1,1,1,1,1,1,1,1,1
Without-pay,<=50K,HS-grad,9,9,9,9,9,9,9,9,9,9,9,9


In [35]:
# Build a GroupBy over every ("Sex", "Above/Below 50k") pair
gb2 = adult.groupby(by=["Sex", "Above/Below 50k"])

# Report how many rows fall into each of the created groups
group_sizes = gb2.size()
print(group_sizes)



Sex     Above/Below 50k
Female  <=50K               9592
        >50K                1179
Male    <=50K              15128
        >50K                6662
dtype: int64


In [37]:
# Average each numeric column within every group; numeric_only=True
# leaves the non-numeric columns out of the aggregation
group_means = gb2.mean(numeric_only=True)
print(group_means)

                              Age         fnlgwt  Education Num  Capital Gain  \
Sex    Above/Below 50k                                                          
Female <=50K            36.210801  185999.381359       9.820475    121.986134   
       >50K             42.125530  183687.406277      11.787108   4200.389313   
Male   <=50K            37.147012  193093.609268       9.452142    165.723823   
       >50K             44.625788  188769.101321      11.580606   3971.765836   

                        Capital Loss  Hours/Week  
Sex    Above/Below 50k                            
Female <=50K               47.364470   35.916701  
       >50K               173.648855   40.426633  
Male   <=50K               56.806782   40.693879  
       >50K               198.780396   46.366106  


In [38]:
# Columns selected by the user to group on
user_list = ["Education", "Above/Below 50k"]

# Group the data once on those columns
gb = adult.groupby(user_list)

# Select "Hours/Week" before aggregating so only that column is averaged
hours_by_group = gb["Hours/Week"].mean()
print(hours_by_group)

Education     Above/Below 50k
10th          <=50K              36.574053
              >50K               43.774194
11th          <=50K              33.322870
              >50K               45.133333
12th          <=50K              35.035000
              >50K               44.818182
1st-4th       <=50K              37.864198
              >50K               48.833333
5th-6th       <=50K              38.539432
              >50K               46.000000
7th-8th       <=50K              38.830033
              >50K               47.500000
9th           <=50K              37.667351
              >50K               44.851852
Assoc-acdm    <=50K              39.264339
              >50K               44.256604
Assoc-voc     <=50K              40.817826
              >50K               43.853186
Bachelors     <=50K              40.586152
              >50K               45.475462
Doctorate     <=50K              45.429907
              >50K               47.513072
HS-grad       <=50K     

In [44]:
# Tiny example frame: two animals, two speed observations each
df = pd.DataFrame(
    {"Animal": ["Dog", "Cat", "Dog", "Cat"], "Speed": [40, 30, 35, 25]}
)
# Print the frame followed by one blank line
print(df, end="\n\n")

# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs
for animal, group in df.groupby("Animal"):
    print(f"Animal: {animal}", end="\n\n")
    print(group)

  Animal  Speed
0    Dog     40
1    Cat     30
2    Dog     35
3    Cat     25

Animal: Cat

  Animal  Speed
1    Cat     30
3    Cat     25
Animal: Dog

  Animal  Speed
0    Dog     40
2    Dog     35
