In [1]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Configure how pandas renders objects in this notebook.
_display_options = {
    'display.notebook_repr_html': False,  # plain-text repr instead of HTML tables
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 65,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# Creating Categoricals

In [2]:
"""
A categorical variable is a type of variable in statistics that represents a limited and often
fixed set of values. This is in contrast to continuous variables, which can represent an
infinite number of values. Common types of categorical variables include gender (where
there are two values, male and female) or blood types (which can be one of the small sets of
types of blood, such as A, B, and O).
"""
# Create a Categorical directly from a list.
# No categories are passed, so pandas infers them from the distinct labels.
lmh_values = [
    "low",
    "high",
    "medium",
    "medium",
    "high",
]
lmh_cat = pd.Categorical(values=lmh_values)
lmh_cat

[low, high, medium, medium, high]
Categories (3, object): [high, low, medium]

In [3]:
# Inspect the distinct categories of the categorical (returned as an Index).
lmh_cat.categories

Index(['high', 'low', 'medium'], dtype='object')

In [5]:
# Retrieve the values: displaying the categorical shows each value
# followed by the list of categories.
lmh_cat

[low, high, medium, medium, high]
Categories (3, object): [high, low, medium]

In [7]:
# .codes shows the integer code of each value, i.e. its position in
# .categories, for [low, high, medium, medium, high].
lmh_cat.codes # in this case high = 0, low = 1, medium = 2

array([1, 0, 2, 2, 0], dtype=int8)

In [10]:
# The order can be controlled by passing the categories explicitly via the
# `categories` parameter; the codes then follow that order
# (low = 0, medium = 1, high = 2).
lmh_cat = pd.Categorical(lmh_values, categories=['low','medium','high'])
lmh_cat.codes

array([0, 2, 1, 1, 2], dtype=int8)

In [11]:
# Sorting a categorical follows the order of its categories
# (low, medium, high here), not alphabetical order.
lmh_cat.sort_values()

[low, medium, medium, high, high]
Categories (3, object): [low, medium, high]

In [16]:
# A categorical variable can also be held in a Series by passing
# dtype="category" when the Series is created.
cat_series = pd.Series(lmh_values, dtype="category")
# Show the Series that was just built (not the earlier lmh_cat object).
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [15]:
# Alternatively, create an ordinary Series first and then convert it to a
# categorical with .astype("category").
# NOTE(review): this rebinds lmh_cat from a Categorical to a Series.
s = pd.Series(lmh_values)
lmh_cat = s.astype("category") 
lmh_cat

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): [high, low, medium]

In [18]:
# A categorical Series has a .cat accessor that gives access to
# category-specific information.
cat_series.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x0000023C76FF72C8>

In [17]:
# Read category information from the Series through the .cat accessor.
cat_series.cat.categories # returns the categories

Index(['high', 'low', 'medium'], dtype='object')

In [19]:
# Build a small DataFrame holding 5 pseudo-random integers.
np.random.seed(12345)  # fixed seed so the sample is reproducible
# randint's upper bound is exclusive, so the 5 draws fall in 0..99
values = np.random.randint(low=0, high=100, size=5)
bins = pd.DataFrame(data={"Values": values})
bins

   Values
0      98
1      29
2       1
3      36
4      41

In [22]:
# Cut the 0-100 range into 10 categorical bins, each 10 wide, and record
# which bin each of the 5 values falls into.
# NOTE: with pd.cut's defaults the bins are right-closed, e.g. (0, 10], so a
# value of exactly 0 would fall outside every bin and become NaN; pass
# include_lowest=True if 0 must be covered.
bins['Group'] = pd.cut(values, range(0,101,10))
bins

   Values      Group
0      98  (90, 100]
1      29   (20, 30]
2       1    (0, 10]
3      36   (30, 40]
4      41   (40, 50]

In [23]:
# The Group column is a categorical variable, as created by pd.cut.
bins.Group

0    (90, 100]
1     (20, 30]
2      (0, 10]
3     (30, 40]
4     (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [25]:
# Passing ordered=True gives the categories an explicit order, which the
# repr shows with "<" between them (bronze < silver < gold).
metal_val = ["bronze", "gold", "silver", "bronze"]
metal_categories = ["bronze", "silver", "gold"]
metal = pd.Categorical(values=metal_val, categories=metal_categories, ordered=True)
metal

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

In [28]:
# Build a second ordered Categorical from the reversed values of `metal`,
# reusing its categories, so that the two can be compared.
metals_reversed_values = pd.Categorical(
    metal[::-1],
    categories = metal.categories, 
    ordered=True)
metals_reversed_values

[bronze, silver, gold, bronze]
Categories (3, object): [bronze < silver < gold]

In [30]:
# Compare the two ordered categoricals element by element, using the
# category order (bronze < silver < gold).
metal <= metals_reversed_values

array([ True, False,  True,  True])

In [32]:
# Values that are not among the given categories become NaN.
new_metal = pd.Categorical(['silver', 'cooper'],       # 'cooper' is not in metal_categories => NaN
                           categories=metal_categories)
new_metal

[silver, NaN]
Categories (3, object): [bronze, silver, gold]

# Renaming Categories

In [33]:
# A small categorical over single-letter labels, with explicit categories.
cat = pd.Categorical(list("abca"), categories=list("abc"))
cat

[a, b, c, a]
Categories (3, object): [a, b, c]

In [35]:
# Rename the categories; every value follows its category, so the displayed
# values change too (a => bronze, b => silver, c => gold).
# Assigning to `.categories` directly is not supported by current pandas, so
# rebind `cat` to the result of rename_categories() instead.
cat = cat.rename_categories(["bronze", "silver", "gold"])
cat

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

In [38]:
# rename_categories() also renames, returning the result as a new categorical
new_cat = cat.rename_categories(["x", "y", "z"])
new_cat

[x, y, z, x]
Categories (3, object): [x, y, z]

In [37]:
# rename_categories() does not work in place: cat keeps its previous categories
cat

[bronze, silver, gold, bronze]
Categories (3, object): [bronze, silver, gold]

# Appending new categories

In [40]:
# add a new "platinum" category; it is appended after the existing ones and
# no value uses it yet
with_platinum = metal.add_categories(["platinum"])
with_platinum

[bronze, gold, silver, bronze]
Categories (4, object): [bronze < silver < gold < platinum]

# Removing Categories

In [42]:
# remove the bronze category; values that were bronze become NaN
no_bronze = metal.remove_categories(["bronze"])
no_bronze

[NaN, gold, silver, NaN]
Categories (2, object): [silver < gold]

# Removing unused categories

In [43]:
# remove any unused categories (here platinum, which none of the values use)
with_platinum.remove_unused_categories()

[bronze, gold, silver, bronze]
Categories (3, object): [bronze < silver < gold]

# Setting categories

In [45]:
# Setting categories: .set_categories() adds and removes categories in one
# step. Start from a sample categorical Series.
s = pd.Series(["one", "two", "four", "five", "two"]).astype("category")
s

0     one
1     two
2    four
3    five
4     two
dtype: category
Categories (4, object): [five, four, one, two]

In [46]:
# Keep only the "one" and "four" categories; values whose category is
# dropped ("two" and "five") are replaced with NaN.
s = s.cat.set_categories(["one","four"]) # categories not listed here no longer exist
s

0     one
1     NaN
2    four
3     NaN
4     NaN
dtype: category
Categories (2, object): [one, four]

# Describe

In [48]:
# get descriptive info on the metals categorical
metal.describe() # bronze appears 2 times out of 4 values => freq 0.5

            counts  freqs
categories               
bronze           2   0.50
silver           1   0.25
gold             1   0.25

# Value counts

In [50]:
# count how many times each category occurs in the categorical
metal.value_counts()

bronze    2
silver    1
gold      1
dtype: int64

# Minimum, maximum and mode

In [52]:
# Find the min, max and mode of the metals categorical. min/max use the
# category order (bronze < silver < gold).
# Categorical.mode() is not available in current pandas, so the mode is
# computed through a Series; .values hands back the result as a Categorical.
(metal.min(), metal.max(), pd.Series(metal).mode().values)

('bronze',
 'gold',
 [bronze]
 Categories (3, object): [bronze < silver < gold])

# EXAMPLE

In [54]:
# Example: organize information by category instead of by number, by
# assigning each student a letter grade based on a numeric grade.

# the 10 students
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', "Marc",
]

# one random integer grade per student, from 50 to 100 inclusive
np.random.seed(123456)  # fixed seed so the grades are reproducible
grades = np.random.randint(low=50, high=101, size=len(names))

# the raw numeric score for each of the students
score = pd.DataFrame(data={'Name': names, "Grade": grades})
score

     Name  Grade
0   Ivana     51
1  Norris     92
2    Ruth    100
3    Lane     99
4    Skye     93
5     Sol     97
6   Dylan     93
7  Katina     77
8  Alissa     82
9    Marc     73

In [55]:
# Letter grades paired with the highest numeric score that earns each of them.
_grade_upper_bounds = [
    ('F', 59), ('D-', 62), ('D', 66), ('D+', 69),
    ('C-', 72), ('C', 76), ('C+', 79),
    ('B-', 82), ('B', 86), ('B+', 89),
    ('A-', 92), ('A', 99), ('A+', 100),
]
# Bin edges start at 0, so there is one more edge than there are letters.
score_bins = [0] + [upper for _, upper in _grade_upper_bounds]
letter_grades = [letter for letter, _ in _grade_upper_bounds]

In [58]:
# Cut the numeric grades into the bins and label each bin with its letter,
# producing a categorical Series of letter grades.
#   score.Grade   - the numeric grades to convert
#   score_bins    - the bin edges, aligned with letter_grades
#   letter_grades - the label for each bin (one fewer than the edges)
# include_lowest=True makes the first bin include its lower edge, so a grade
# of exactly 0 is labelled 'F' instead of falling outside every bin (NaN).
letter_cats = pd.cut(score.Grade, score_bins, labels=letter_grades,
                     include_lowest=True)
letter_cats

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Grade, dtype: category
Categories (13, object): [F < D- < D < D+ ... B+ < A- < A < A+]

In [59]:
# attach the letter grades to the score table as a new 'Letter' column
score['Letter'] = letter_cats
score

     Name  Grade Letter
0   Ivana     51      F
1  Norris     92     A-
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
7  Katina     77     C+
8  Alissa     82     B-
9    Marc     73      C

In [60]:
 # Examine the underlying categorical Series of letter grades.
# NOTE(review): this displays the values again; the integer codes would be
# letter_cats.cat.codes - confirm which one was intended here.
letter_cats

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Grade, dtype: category
Categories (13, object): [F < D- < D < D+ ... B+ < A- < A < A+]

In [61]:
# how many times did each letter grade occur? (unused grades are listed with 0)
score.Letter.value_counts()

A     4
A+    1
A-    1
B-    1
C+    1
     ..
B     0
C-    0
D+    0
D     0
D-    0
Name: Letter, Length: 13, dtype: int64

In [62]:
# sort by letter grade, highest first (following the category order),
# instead of by numeric grade
score.sort_values(by=['Letter'], ascending=False)

     Name  Grade Letter
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
1  Norris     92     A-
8  Alissa     82     B-
7  Katina     77     C+
9    Marc     73      C
0   Ivana     51      F