In [1]:
import pandas as pd

In [2]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


# Common string methods `.lower()`, `.upper()`, `.title()` and `.len()`

In [4]:
chicago["Name"] = chicago["Name"].str.title()
chicago["Position Title"] = chicago["Position Title"].str.title()

In [5]:
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"Zygadlo, Michael J",Frm Of Machinists - Automotive,GENERAL SERVICES,$99528.00
32058,"Zygowicz, Peter J",Police Officer,POLICE,$87384.00
32059,"Zymantas, Mark E",Police Officer,POLICE,$84450.00
32060,"Zyrkowski, Carlo E",Police Officer,POLICE,$87384.00
32061,"Zyskowski, Dariusz",Chief Data Base Analyst,DoIT,$113664.00


In [6]:
chicago["Department"] = (
    chicago["Department"].str.replace("MGMNT", "MANAGEMENT").str.title()
)

In [7]:
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)

In [8]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
1,"Aaron, Jeffery M",Police Officer,Police,84450.0
2,"Aaron, Karina",Police Officer,Police,84450.0
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0


In [9]:
mask = chicago["Position Title"].str.lower().str.contains("water")
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
554,"Aluise, Vincent G",Foreman Of Water Pipe Construction,Water Management,102440.0
671,"Ander, Perry A",Water Chemist Ii,Water Management,82044.0
685,"Anderson, Andrew J",District Superintendent Of Water Distribution,Water Management,109272.0
702,"Anderson, Donald",Foreman Of Water Pipe Construction,Water Management,102440.0
...,...,...,...,...
29669,"Verma, Anupam",Managing Engineer - Water Management,Water Management,111192.0
30239,"Washington, Joseph",Water Chemist Iii,Water Management,89676.0
30544,"West, Thomas R",Gen Supt Of Water Management,Water Management,115704.0
30991,"Williams, Matthew",Foreman Of Water Pipe Construction,Water Management,102440.0


In [10]:
mask2 = chicago["Position Title"].str.lower().str.startswith("water")
chicago[mask2]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
671,"Ander, Perry A",Water Chemist Ii,Water Management,82044.0
1054,"Ashley, Karma T",Water Chemist Ii,Water Management,82044.0
1079,"Atkins, Joanna M",Water Chemist Ii,Water Management,82044.0
1181,"Azeem, Mohammed A",Water Chemist Ii,Water Management,53172.0
...,...,...,...,...
28574,"Threatt, Denise R",Water Quality Inspector,Water Management,62004.0
28602,"Tignor, Darryl B",Water Rate Taker,Water Management,78948.0
28955,"Travis Cook, Leslie R",Water Rate Taker,Water Management,78948.0
29584,"Velazquez, John",Water Rate Taker,Water Management,78948.0


In [11]:
mask3 = chicago["Position Title"].str.lower().str.endswith("ist")
chicago[mask3]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"Afroz, Nayyar",Psychiatrist,Health,99840.0
308,"Alarcon, Luis J",Loan Processing Specialist,Community Development,81948.0
422,"Allain, Carolyn",Senior Telecommunications Specialist,Doit,89880.0
472,"Allen, Robert",Machinist,Water Management,94328.0
705,"Anderson, Edward M",Sr Procurement Specialist,Procurement,91476.0
...,...,...,...,...
31667,"Yoder, Teresa G",Archival Specialist,Public Library,74304.0
31688,"Youngbloom, Laurence G",Crimes Surveillance Specialist,Oemc,19676.8
31717,"Young, Kimberly M",Sr Procurement Specialist,Procurement,68556.0
31837,"Zapata, Hugo",Sr Procurement Specialist,Procurement,87324.0


# More DataFrame String Methods - `strip`, `lstrip`, and `rstrip`

In [12]:
chicago = pd.read_csv("chicago.csv", index_col="Name").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago["Employee Annual Salary"].mean()
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [13]:
chicago.index = (
    chicago.index.str.strip().str.replace(r"\s+", " ", regex=True).str.title()
)
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago["Department"] = (
    chicago["Department"]
    .str.lower()
    .str.replace(r"mgmnt", "management", regex=True)
    .str.title()
)

In [14]:
chicago.columns = chicago.columns.str.upper()

In [15]:
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
"Aaron, Jeffery M",Police Officer,Police,84450.0
"Aaron, Karina",Police Officer,Police,84450.0
"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0
"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0


In [16]:
"Hello my name is Javier".split()

['Hello', 'my', 'name', 'is', 'Javier']

# Split Strings by Characters with the `.str.split()` Method

In [17]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [18]:
chicago["Name"].str.split(",").str.get(0).str.title().value_counts()

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
            ... 
Horkavy        1
Horn           1
Horne Jr       1
Horner         1
Zyskowski      1
Name: Name, Length: 13829, dtype: int64

In [19]:
chicago["Position Title"].str.split().str.get(0).value_counts()

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
DENTIST                1
ASSOC                  1
TELEPHONE              1
MAYOR                  1
PREPRESS               1
Name: Position Title, Length: 320, dtype: int64

# More Practice with the `split()` method

In [20]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [21]:
# get the most common first name
chicago["Name"].str.split(",").str.get(1).str.split().str.get(0).value_counts().head(5)

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

# Exploring the `expand` and `n` Parameters of the `str.split()` Method

In [22]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [23]:
chicago[["First Name", "Last Name"]] = chicago["Name"].str.split(",", expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0,ABAD JR,VICENTE M


In [24]:
chicago[["First Title Word", "Remaining Title Words"]] = chicago[
    "Position Title"
].str.split(expand=True, n=1)

In [25]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Title Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0,ABAD JR,VICENTE M,CIVIL,ENGINEER IV


In [26]:
chicago["Position Title"].str.split().str.get(0).value_counts()

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
DENTIST                1
ASSOC                  1
TELEPHONE              1
MAYOR                  1
PREPRESS               1
Name: Position Title, Length: 320, dtype: int64

# More Practice with the `split()` method

In [27]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [28]:
# get the most common first name
chicago["Name"].str.split(",").str.get(1).str.split().str.get(0).value_counts().head(5)

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

# Exploring the `expand` and `n` Parameters of the `str.split()` Method

In [29]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"].str.replace(r"\$", "", regex=True).astype("float")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0


In [30]:
chicago[["First Name", "Last Name"]] = chicago["Name"].str.split(",", expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0,ABAD JR,VICENTE M


In [31]:
chicago[["First Title Word", "Remaining Title Words"]] = chicago[
    "Position Title"
].str.split(expand=True, n=1)

In [32]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Title Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,90744.0,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,106836.0,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
