Mapping

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham', 'nova lox'],
                     'ounces':[4,3,12,6,7.5,8,3,5,6]})

meat_to_animal = {'bacon':'pig',
                  'pulled pork':'pig',
                  'pastrami':'cow',
                  'corned beef':'cow',
                  'honey ham':'pig',
                  'nova lox':'salmon'}

lowercased_df = df['food'].str.lower()

#map function, Mapping two column in this case using lowercased_df to map by meat_to_animal
df['animal'] = lowercased_df.map (meat_to_animal)

#lambda, a function, maths calculation
df['animal_alt'] = df['food'].map(lambda x: meat_to_animal[x.lower()])

print(df,'\n')
print(lowercased_df,'\n')
print(df['animal'],'\n')
print(df['animal_alt'])



          food  ounces  animal animal_alt
0        bacon     4.0     pig        pig
1  pulled pork     3.0     pig        pig
2        bacon    12.0     pig        pig
3     Pastrami     6.0     cow        cow
4  corned beef     7.5     cow        cow
5        Bacon     8.0     pig        pig
6     pastrami     3.0     cow        cow
7    honey ham     5.0     pig        pig
8     nova lox     6.0  salmon     salmon 

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object 

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: animal, dtype: object 

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: animal_alt, dtype: object


Replacing Values

In [2]:
dfnum = pd.Series([1.,-999,2.,-999,-1000.,3.])

print('Original Data')
print(dfnum,'\n')

print('Replace one variable')
df_replaced_1 = dfnum.replace(-999,np.nan)
print(df_replaced_1,'\n')

print('Replace Multiplace values')
df_replaced_2 = dfnum.replace([-999,-1000],np.nan)
print(df_replaced_2,'\n')

print('Different replacement for each value using list ')
df_replaced_3 = dfnum.replace([-999,-100],[np.nan,0])
print(df_replaced_3,'\n')

print('Arguments passed using dict')
df_replaced_4 = dfnum.replace ({-999:np.nan,-1000:0})
print(df_replaced_4,'\n')


Original Data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64 

Replace one variable
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64 

Replace Multiplace values
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64 

Different replacement for each value using list 
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64 

Arguments passed using dict
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64 



Renaming Axis Index

In [3]:
dfcity= pd.DataFrame(np.arange(12).reshape((3,4)),
index =['Ohio','Colorado','New York'],
columns=['one','two','three','four'])

transform = lambda x:x[:4].upper()
print (dfcity.index.map(transform))

dfcity.index = dfcity.index.map(transform)
print(dfcity.index)

print ('\n Original DataFrame')
print(dfcity)

print ('\n UpperCase')
dfcity_upper = dfcity.rename(index=str.title,columns=str.upper)
print(dfcity_upper)

print('\n Rename')
dfcity_rename = dfcity.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})
print(dfcity_rename.rename(index=str.title))


print('\n Rename + inplace=true')
print(dfcity)
dfcity_inplace = dfcity.rename(index={'OHIO':'INDIANA'},inplace=False)
print(dfcity_inplace)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')
Index(['OHIO', 'COLO', 'NEW '], dtype='object')

 Original DataFrame
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11

 UpperCase
      ONE  TWO  THREE  FOUR
Ohio    0    1      2     3
Colo    4    5      6     7
New     8    9     10    11

 Rename
         one  two  peekaboo  four
Indiana    0    1         2     3
Colo       4    5         6     7
New        8    9        10    11

 Rename + inplace=true
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11
         one  two  three  four
INDIANA    0    1      2     3
COLO       4    5      6     7
NEW        8    9     10    11


Discretization & Binning

In [4]:
ages = [20, 22, 25, 27, 21, 23, 37, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)

print(cats)
print("\n Codes:\n", cats.codes)
print("\n Categoreis:\n", cats.categories)
print("\n", pd.value_counts(cats)) 
print("\n")

right_closed_cats = pd.cut(ages, [18, 26, 36, 61, 100], right=False)

# Define custom bin names using the labels option
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
print (group_names)

# Apply pd.cut with custom bin names
custom_labeled_cats = pd.cut(ages, [18, 26, 36, 61, 100], labels=group_names)

print("\nBins with right-closed intervals (left side open):\n", right_closed_cats)
print("\n\nBins with custom labels:\n", custom_labeled_cats)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 11
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

 Codes:
 [0 0 0 1 0 0 2 3 2 2 1]

 Categoreis:
 IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

 (18, 25]     5
(35, 60]     3
(25, 35]     2
(60, 100]    1
dtype: int64


['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

Bins with right-closed intervals (left side open):
 [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [36, 61), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 11
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]


Bins with custom labels:
 ['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'MiddleAged', 'MiddleAged', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 11
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']


In [5]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2) 

[(0.24, 0.49], (0.73, 0.97], (0.73, 0.97], (0.49, 0.73], (0.24, 0.49], ..., (0.49, 0.73], (0.49, 0.73], (0.73, 0.97], (0.73, 0.97], (0.49, 0.73]]
Length: 20
Categories (4, interval[float64, right]): [(0.0016, 0.24] < (0.24, 0.49] < (0.49, 0.73] < (0.73, 0.97]]

In [6]:
data = np.random.randn(10) # Normally distributed
print ("Data",data,"\n")
cats = pd.qcut(data, 4) # Cut into quartiles
print("Cat",cats)

Data [ 1.13382594 -0.081399   -1.69773669 -0.24104064 -0.01481209 -0.03716666
  1.15712933 -0.14676044 -0.08822829 -0.17384159] 

Cat [(-0.0204, 1.157], (-0.0848, -0.0204], (-1.6989999999999998, -0.167], (-1.6989999999999998, -0.167], (-0.0204, 1.157], (-0.0848, -0.0204], (-0.0204, 1.157], (-0.167, -0.0848], (-0.167, -0.0848], (-1.6989999999999998, -0.167]]
Categories (4, interval[float64, right]): [(-1.6989999999999998, -0.167] < (-0.167, -0.0848] < (-0.0848, -0.0204] < (-0.0204, 1.157]]


In [7]:
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
print ("Cat:",cats)

Cat: [(-0.0848, 1.136], (-0.0848, 1.136], (-1.6989999999999998, -0.387], (-0.387, -0.0848], (-0.0848, 1.136], (-0.0848, 1.136], (1.136, 1.157], (-0.387, -0.0848], (-0.387, -0.0848], (-0.387, -0.0848]]
Categories (4, interval[float64, right]): [(-1.6989999999999998, -0.387] < (-0.387, -0.0848] < (-0.0848, 1.136] < (1.136, 1.157]]
