In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fix the global NumPy seed so the sample below is reproducible,
# then draw 25 integers from the half-open range [0, 100).
np.random.seed(8)
arr = np.random.randint(low=0, high=100, size=25)

In [4]:
# Display the 25 generated integers.
arr

array([67, 84,  5, 90,  8, 83, 63, 48, 85, 60, 49, 74, 27, 13,  9, 61, 15,
       93, 98, 59, 18, 14, 93, 56,  9])

In [5]:
# A pandas Series is the labelled counterpart of a 1D numpy array
# (a vector / a single column); here it gets the default 0..24 index.
pd.Series(data=arr)

0     67
1     84
2      5
3     90
4      8
5     83
6     63
7     48
8     85
9     60
10    49
11    74
12    27
13    13
14     9
15    61
16    15
17    93
18    98
19    59
20    18
21    14
22    93
23    56
24     9
dtype: int64

In [3]:
# A DataFrame is a collection of Series that share one index (one Series per column).
# Reshape the 25 values of arr into a 5 x 5 table; rows and columns get default 0..4 labels.
df = pd.DataFrame(data=arr.reshape((5, 5)))
df

Unnamed: 0,0,1,2,3,4
0,67,84,5,90,8
1,83,63,48,85,60
2,49,74,27,13,9
3,61,15,93,98,59
4,18,14,93,56,9


In [4]:
# Same 5 x 5 table, now with explicit row labels 1..5 and column labels A..E.
df = pd.DataFrame(
    data=arr.reshape((5, 5)),
    index=np.arange(1, 6),
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
1,67,84,5,90,8
2,83,63,48,85,60
3,49,74,27,13,9
4,61,15,93,98,59
5,18,14,93,56,9


In [11]:
# Structural summary: index type, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       5 non-null      int64
 1   B       5 non-null      int64
 2   C       5 non-null      int64
 3   D       5 non-null      int64
 4   E       5 non-null      int64
dtypes: int64(5)
memory usage: 240.0 bytes


In [12]:
df.describe() # descriptive statistics per column: count, mean, std, min, quartiles, max

Unnamed: 0,A,B,C,D,E
count,5.0,5.0,5.0,5.0,5.0
mean,55.6,50.0,53.2,68.4,29.0
std,24.32694,33.24906,39.385276,34.789366,27.847801
min,18.0,14.0,5.0,13.0,8.0
25%,49.0,15.0,27.0,56.0,9.0
50%,61.0,63.0,48.0,85.0,9.0
75%,67.0,74.0,93.0,90.0,59.0
max,83.0,84.0,93.0,98.0,60.0


In [14]:
# DataFrame indexing is similar to numpy indexing;
# selecting a single column by label returns a pandas Series.
type(df['A'])

pandas.core.series.Series

In [15]:
df['A'] # select one column by label; the key must be an existing column name

1    67
2    83
3    49
4    61
5    18
Name: A, dtype: int64

In [17]:
# Select column 'B' by label.
df['B']

1    84
2    63
3    74
4    15
5    14
Name: B, dtype: int64

In [20]:
# Columns can also be read with attribute (dot) access, equivalent to df['A'].
# NOTE: this only works when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method; prefer df['A'] in general.
df.A

1    67
2    83
3    49
4    61
5    18
Name: A, dtype: int64

In [21]:
# Selecting several columns: pass a list of labels (hence the double brackets).
# The result is a DataFrame, not a Series.
df[['A','C']]

Unnamed: 0,A,C
1,67,5
2,83,48
3,49,27
4,61,93
5,18,93


In [22]:
# Row labels of the frame.
df.index

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [23]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [27]:
# To select rows instead of columns by integer position, use iloc.
# Positions start at 0, so iloc[1] is the second row (the one labelled 2).
df.iloc[1]

A    83
B    63
C    48
D    85
E    60
Name: 2, dtype: int64

In [26]:
# iloc uses Python positional indexing, i.e. it starts from 0.
# To select by row label instead, use loc: label 1 is the first row of df.
df.loc[1]

A    67
B    84
C     5
D    90
E     8
Name: 1, dtype: int64

In [5]:
# Same 5 x 5 values, this time with string row labels to contrast loc and iloc.
df2 = pd.DataFrame(
    data=arr.reshape((5, 5)),
    index='one two three four five'.split(),
    columns=list('ABCDE'),
)
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [29]:
# First row by integer position.
df2.iloc[0]

A    67
B    84
C     5
D    90
E     8
Name: one, dtype: int64

In [30]:
# The same row, selected by its label.
df2.loc['one']

A    67
B    84
C     5
D    90
E     8
Name: one, dtype: int64

In [6]:
# Assigning to a label that does not exist yet creates a new column;
# the values are the element-wise sum of columns A and B.
df['A+B'] = df['A'].add(df['B'])
df

Unnamed: 0,A,B,C,D,E,A+B
1,67,84,5,90,8,151
2,83,63,48,85,60,146
3,49,74,27,13,9,123
4,61,15,93,98,59,76
5,18,14,93,56,9,32


In [32]:
# df2 was built separately from df, so it has no 'A+B' column.
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [8]:
# Work on an independent (deep) copy so that df2 keeps its original content.
df3 = df2.copy(deep=True)

In [9]:
# Add the element-wise sum of A and B as a new column on the copy only.
df3['A+B'] = df3['A'].add(df3['B'])

In [10]:
# The copy now carries the extra 'A+B' column.
df3

Unnamed: 0,A,B,C,D,E,A+B
one,67,84,5,90,8,151
two,83,63,48,85,60,146
three,49,74,27,13,9,123
four,61,15,93,98,59,76
five,18,14,93,56,9,32


In [36]:
# The original df2 is untouched: the new column only exists on the copy df3.
df2

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [11]:
# Removing a column: drop() returns a NEW frame without 'A+B';
# the result is only displayed here, df3 itself is not modified.
df3.drop(columns='A+B')

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [12]:
# drop() returned a new DataFrame; because that result was not assigned back,
# df3 itself still contains the 'A+B' column.
df3

Unnamed: 0,A,B,C,D,E,A+B
one,67,84,5,90,8,151
two,83,63,48,85,60,146
three,49,74,27,13,9,123
four,61,15,93,98,59,76
five,18,14,93,56,9,32


In [40]:
# To make the drop permanent, use inplace=True: df3 itself is modified.
# NOTE(review): once this has run, 'A+B' is gone, so dropping the same column
# again raises KeyError unless errors='ignore' is passed.
df3.drop('A+B', axis=1, inplace=True)

In [13]:
# Another way of dropping the column permanently: re-assign the result of drop()
# to the original variable.
# errors='ignore' keeps this cell runnable when 'A+B' has already been removed
# (e.g. by an in-place drop earlier in the notebook); without it a top-to-bottom
# run fails here with KeyError.
df3 = df3.drop(columns='A+B', errors='ignore')

In [14]:
# df3 is back to the original five columns A..E.
df3

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
three,49,74,27,13,9
four,61,15,93,98,59
five,18,14,93,56,9


In [16]:
# Element-wise filtering, analogous to numpy boolean masking:
# keep values greater than 50, every other cell becomes NaN.
df3.where(df3 > 50)

Unnamed: 0,A,B,C,D,E
one,67.0,84.0,,90.0,
two,83.0,63.0,,85.0,60.0
three,,74.0,,,
four,61.0,,93.0,98.0,59.0
five,,,93.0,56.0,


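NaN (Not a Number) marks the cells where the condition `df3 > 50` was False: pandas keeps the frame's shape and masks the non-matching values with NaN, which is also why the remaining values are displayed as floats.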

In [18]:
# Row filtering on one column: keep only the rows whose value in column A exceeds 50.
df3.loc[df3['A'].gt(50)]

Unnamed: 0,A,B,C,D,E
one,67,84,5,90,8
two,83,63,48,85,60
four,61,15,93,98,59


In [19]:
# Combine conditions element-wise with & (not `and`): rows where both A and C exceed 50.
df3.loc[df3['A'].gt(50) & df3['C'].gt(50)]

Unnamed: 0,A,B,C,D,E
four,61,15,93,98,59


## GroupBy

In [20]:
# Toy sales records: one row per sales person, two people per company.
data = dict(
    Company=["GOOG", "GOOG", "MSFT", "FB", "MSFT", "FB"],
    Person=['Sam', 'Xian Wei', 'Charlie', 'Teddy', 'Sarah', 'Jaslyne'],
    Sales=[200, 100, 120, 340, 881, 350],
)

df_groups = pd.DataFrame(data=data)
df_groups

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Xian Wei,100
2,MSFT,Charlie,120
3,FB,Teddy,340
4,MSFT,Sarah,881
5,FB,Jaslyne,350


In [21]:
# groupby only defines the groups; an aggregation (min, max, mean, ...) must be
# applied to get a result.
# mean is only defined for numbers, so restrict it to numeric columns explicitly:
# without numeric_only=True, pandas >= 2.0 raises TypeError on the 'Person' strings
# instead of silently dropping that column.
df_groups.groupby('Company').mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,345.0
GOOG,150.0
MSFT,500.5


In [22]:
# Descriptive statistics of the numeric column (Sales) for each company.
df_groups.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,345.0,7.071068,340.0,342.5,345.0,347.5,350.0
GOOG,2.0,150.0,70.710678,100.0,125.0,150.0,175.0,200.0
MSFT,2.0,500.5,538.10826,120.0,310.25,500.5,690.75,881.0


In [23]:
# count() returns the number of non-null values per column within each group.
df_groups.groupby('Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


## Missing / Null values

In [32]:
# Same records as before, except that Charlie's sales figure is missing (None);
# pandas stores the gap as NaN, which turns the Sales column into floats.
data2 = dict(
    Company=["GOOG", "GOOG", "MSFT", "FB", "MSFT", "FB"],
    Person=['Sam', 'Xian Wei', 'Charlie', 'Teddy', 'Sarah', 'Jaslyne'],
    Sales=[200, 100, None, 340, 881, 350],
)

df_groups2 = pd.DataFrame(data=data2)
df_groups2

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200.0
1,GOOG,Xian Wei,100.0
2,MSFT,Charlie,
3,FB,Teddy,340.0
4,MSFT,Sarah,881.0
5,FB,Jaslyne,350.0


In [25]:
# count() ignores missing values: MSFT has 2 persons but only 1 recorded sale.
df_groups2.groupby('Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,1


In [26]:
# Swap rows and columns (.T is the shorthand for transpose()).
df_groups2.T

Unnamed: 0,0,1,2,3,4,5
Company,GOOG,GOOG,MSFT,FB,MSFT,FB
Person,Sam,Xian Wei,Charlie,Teddy,Sarah,Jaslyne
Sales,200.0,100.0,,340.0,881.0,350.0


In [28]:
# Grouping by Sales: the row whose Sales value is NaN (Charlie) does not appear in
# the result. Every sales value is unique here, so each group holds a single row and
# the "sum" of the string columns is just that row's text.
df_groups2.groupby('Sales').sum()

Unnamed: 0_level_0,Company,Person
Sales,Unnamed: 1_level_1,Unnamed: 2_level_1
100.0,GOOG,Xian Wei
200.0,GOOG,Sam
340.0,FB,Teddy
350.0,FB,Jaslyne
881.0,MSFT,Sarah


In [30]:
# Total sales; the missing (NaN) entry is skipped by sum().
df_groups2['Sales'].sum()

1871.0

In [33]:
# groupby.agg: apply a different aggregation to each column.
# Built-in aggregation names replace the lambdas: the result is the same, but
# pandas can use its optimised group-wise implementations instead of calling a
# Python function once per group.
df_groups2.groupby('Company').agg({
    'Person': 'count',  # number of non-null persons per company
    'Sales': 'sum',     # total sales per company; NaN values are skipped
})

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,690.0
GOOG,2,300.0
MSFT,2,881.0


## Pandas I/O

In [34]:
# Load example1.csv (path is relative to the notebook's working directory)
# into a DataFrame and display it.
df_csv = pd.read_csv('example1.csv')
df_csv

Unnamed: 0,customer_id,order_id,amount,discount
0,1,699,144.856817,0.2
1,2,983,41.862403,0.1
2,3,598,683.858244,0.3
3,4,872,22.142205,0.0
4,5,549,145.634276,0.05
5,6,461,527.632459,0.1
6,7,560,71.270597,0.1
7,8,318,21.1466,0.1
8,9,752,881.255706,0.1
9,10,536,641.685019,0.1


In [36]:
# Check row count, dtypes and missing values of the loaded file.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   customer_id  15 non-null     int64  
 1   order_id     15 non-null     int64  
 2   amount       15 non-null     float64
 3   discount     15 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 608.0 bytes


In [37]:
# Descriptive statistics of the numeric columns.
df_csv.describe()

Unnamed: 0,customer_id,order_id,amount,discount
count,15.0,15.0,15.0,15.0
mean,8.0,658.866667,388.561335,0.123333
std,4.472136,181.416201,310.609179,0.082086
min,1.0,318.0,21.1466,0.0
25%,4.5,554.5,108.063707,0.1
50%,8.0,622.0,515.894967,0.1
75%,11.5,750.0,627.658608,0.1
max,15.0,983.0,881.255706,0.3


## Pandas exercise - Salaries data -> Exploratory Data Analysis (EDA)

In [49]:
# Read the salaries dataset (Salaries.csv, relative to the working directory).
# Displaying the frame shows a truncated preview (first and last rows) for a table this large.
df_data = pd.read_csv('Salaries.csv')
df_data

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.00,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.60,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.00,56120.71,198306.90,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.60,9737.00,182234.59,,326373.19,326373.19,2011,,San Francisco,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148649,148650,Roy I Tillery,Custodian,0.00,0.00,0.00,0.0,0.00,0.00,2014,,San Francisco,
148650,148651,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148651,148652,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148652,148653,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,


In [39]:
# Column dtypes and non-null counts: a quick way to spot columns with missing data
# (Notes and Status have no non-null values at all).
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148654 non-null  int64  
 1   EmployeeName      148654 non-null  object 
 2   JobTitle          148654 non-null  object 
 3   BasePay           148045 non-null  float64
 4   OvertimePay       148650 non-null  float64
 5   OtherPay          148650 non-null  float64
 6   Benefits          112491 non-null  float64
 7   TotalPay          148654 non-null  float64
 8   TotalPayBenefits  148654 non-null  float64
 9   Year              148654 non-null  int64  
 10  Notes             0 non-null       float64
 11  Agency            148654 non-null  object 
 12  Status            0 non-null       float64
dtypes: float64(8), int64(2), object(3)
memory usage: 14.7+ MB


In [41]:
# Descriptive statistics of the numeric columns (Id is still a regular column here).
df_data.describe()

Unnamed: 0,Id,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Status
count,148654.0,148045.0,148650.0,148650.0,112491.0,148654.0,148654.0,148654.0,0.0,0.0
mean,74327.5,66325.448841,5066.059886,3648.767297,25007.893151,74768.321972,93692.554811,2012.522643,,
std,42912.857795,42764.635495,11454.380559,8056.601866,15402.215858,50517.005274,62793.533483,1.117538,,
min,1.0,-166.01,-0.01,-7058.59,-33.89,-618.13,-618.13,2011.0,,
25%,37164.25,33588.2,0.0,0.0,11535.395,36168.995,44065.65,2012.0,,
50%,74327.5,65007.45,0.0,811.27,28628.62,71426.61,92404.09,2013.0,,
75%,111490.75,94691.05,4658.175,4236.065,35566.855,105839.135,132876.45,2014.0,,
max,148654.0,319275.01,245131.88,400184.25,96570.66,567595.43,567595.43,2014.0,,


In [44]:
# Inspect the current index before making Id the index:
# the frame still has the default RangeIndex.
df_data.index

RangeIndex(start=0, stop=148654, step=1)

In [50]:
# Promote the 'Id' column to the row index.
# The reassignment overwrites df_data, so guard on the column still being present:
# re-running this cell would otherwise raise KeyError, because 'Id' has already
# moved from the columns into the index.
if 'Id' in df_data.columns:
    df_data = df_data.set_index('Id')
df_data

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.00,400184.25,,567595.43,567595.43,2011,,San Francisco,
2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.60,,335279.91,335279.91,2011,,San Francisco,
4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.00,56120.71,198306.90,,332343.61,332343.61,2011,,San Francisco,
5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.60,9737.00,182234.59,,326373.19,326373.19,2011,,San Francisco,
...,...,...,...,...,...,...,...,...,...,...,...,...
148650,Roy I Tillery,Custodian,0.00,0.00,0.00,0.0,0.00,0.00,2014,,San Francisco,
148651,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148652,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148653,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,


In [48]:
# Same statistics as before; Id is now the index, so it no longer appears as a column.
df_data.describe()

Unnamed: 0,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Status
count,148045.0,148650.0,148650.0,112491.0,148654.0,148654.0,148654.0,0.0,0.0
mean,66325.448841,5066.059886,3648.767297,25007.893151,74768.321972,93692.554811,2012.522643,,
std,42764.635495,11454.380559,8056.601866,15402.215858,50517.005274,62793.533483,1.117538,,
min,-166.01,-0.01,-7058.59,-33.89,-618.13,-618.13,2011.0,,
25%,33588.2,0.0,0.0,11535.395,36168.995,44065.65,2012.0,,
50%,65007.45,0.0,811.27,28628.62,71426.61,92404.09,2013.0,,
75%,94691.05,4658.175,4236.065,35566.855,105839.135,132876.45,2014.0,,
max,319275.01,245131.88,400184.25,96570.66,567595.43,567595.43,2014.0,,


In [51]:
# Analyse an independent (deep) copy so the loaded frame df_data stays untouched.
salaries_df = df_data.copy(deep=True)
salaries_df

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.00,400184.25,,567595.43,567595.43,2011,,San Francisco,
2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.60,,335279.91,335279.91,2011,,San Francisco,
4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.00,56120.71,198306.90,,332343.61,332343.61,2011,,San Francisco,
5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.60,9737.00,182234.59,,326373.19,326373.19,2011,,San Francisco,
...,...,...,...,...,...,...,...,...,...,...,...,...
148650,Roy I Tillery,Custodian,0.00,0.00,0.00,0.0,0.00,0.00,2014,,San Francisco,
148651,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148652,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,
148653,Not provided,Not provided,,,,,0.00,0.00,2014,,San Francisco,


In [52]:
# Average base pay over all rows (missing values are skipped by mean()).
salaries_df.loc[:, 'BasePay'].mean()

66325.44884050643

In [55]:
# Highest overtime pay in the dataset.
salaries_df.loc[:, 'OvertimePay'].max()

245131.88

In [57]:
# What is the job title of JOSEPH DRISCOLL?
# A single .loc call selects the matching rows and the column in one step.
salaries_df.loc[salaries_df['EmployeeName'].eq('JOSEPH DRISCOLL'), 'JobTitle']

Id
25    CAPTAIN, FIRE SUPPRESSION
Name: JobTitle, dtype: object

In [58]:
# How much does JOSEPH DRISCOLL make, benefits included?
salaries_df.loc[salaries_df['EmployeeName'].eq('JOSEPH DRISCOLL'), 'TotalPayBenefits']

Id
25    270324.91
Name: TotalPayBenefits, dtype: float64

In [61]:
# Who is the lowest-paid person once benefits are included?
# Rows whose TotalPayBenefits equals the column minimum are kept (ties would all be listed).
salaries_df.loc[
    salaries_df['TotalPayBenefits'].eq(salaries_df['TotalPayBenefits'].min()),
    'EmployeeName',
]

Id
148654    Joe Lopez
Name: EmployeeName, dtype: object

In [62]:
# Name and job title of the highest-paid person, benefits included.
# Rows whose TotalPayBenefits equals the column maximum are kept (ties would all be listed).
salaries_df.loc[
    salaries_df['TotalPayBenefits'].eq(salaries_df['TotalPayBenefits'].max()),
    ['EmployeeName', 'JobTitle'],
]

Unnamed: 0_level_0,EmployeeName,JobTitle
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY


In [77]:
# How many unique job titles are there?
# nunique() counts distinct values; the comparison is case-sensitive, so e.g.
# 'CUSTODIAN' and 'Custodian' would count as two different titles.
salaries_df['JobTitle'].nunique()

2159

In [74]:
# What was the average (mean) BasePay of all employees per year (2011-2014)?
# Select the column BEFORE aggregating: averaging the whole frame first wastes work
# on every other column and, on pandas >= 2.0, raises TypeError because of the
# string columns (EmployeeName, JobTitle, Agency).
salaries_df.groupby('Year')['BasePay'].mean()

Year
2011    63595.956517
2012    65436.406857
2013    69630.030216
2014    66564.421924
Name: BasePay, dtype: float64

In [70]:
# First five rows.
salaries_df.head()

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [69]:
# Last five rows; the final record (Id 148654) has a negative total pay.
salaries_df.tail()

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
148650,Roy I Tillery,Custodian,0.0,0.0,0.0,0.0,0.0,0.0,2014,,San Francisco,
148651,Not provided,Not provided,,,,,0.0,0.0,2014,,San Francisco,
148652,Not provided,Not provided,,,,,0.0,0.0,2014,,San Francisco,
148653,Not provided,Not provided,,,,,0.0,0.0,2014,,San Francisco,
148654,Joe Lopez,"Counselor, Log Cabin Ranch",0.0,0.0,-618.13,0.0,-618.13,-618.13,2014,,San Francisco,


In [79]:
# What are the top 5 most common jobs?
# value_counts() already sorts by frequency in descending order,
# so the first five entries are the five most common titles.
salaries_df['JobTitle'].value_counts().iloc[:5]

Transit Operator                7036
Special Nurse                   4389
Registered Nurse                3736
Public Svc Aide-Public Works    2518
Police Officer 3                2421
Name: JobTitle, dtype: int64

In [82]:
# How many job titles were represented by only one person in 2013
# (i.e. job titles with exactly one occurrence in 2013)?
# Count occurrences per title among the 2013 rows, flag the titles seen once, and
# add up the flags with the vectorised Series.sum() rather than Python's builtin
# sum(), which would iterate over the Series element by element.
(
    salaries_df.loc[salaries_df['Year'] == 2013, 'JobTitle']
    .value_counts()
    .eq(1)
    .sum()
)

202

In [84]:
# How many people have the word "chief" in their job title (case-insensitive)?
# The vectorised string accessor replaces apply + lambda:
#   case=False  -> matches 'Chief', 'CHIEF', 'chief', ...
#   regex=False -> plain substring match, as in the original `'chief' in x.lower()`
#   na=False    -> a missing title counts as "no match" instead of raising
salaries_df['JobTitle'].str.contains('chief', case=False, regex=False, na=False).sum()

627