In [1]:
import pandas as pd

# Usando métodos de string dentro de um DF
- É possível utilizar métodos de string dentro de uma coluna de um DF
- Para isso, temos que sempre chamar `.str` e o método/função de manipulação de string que desejamos
- Para mostrarmos alguns exemplos, vamos carregar nosso dataset

In [2]:
chicago = pd.read_csv("../data/chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/chicago.csv'

- Convertendo todos os nomes para minusculo

In [12]:
chicago["Name"] = chicago["Name"].str.lower()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"aaron, jeffery m",POLICE OFFICER,POLICE,$84450.00
2,"aaron, karina",POLICE OFFICER,POLICE,$84450.00
3,"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"abad jr, vicente m",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


- Trocando todas as ocorrências de `MGMNT` por `MANAGEMENT`

In [13]:
chicago["Department"] = chicago["Department"].str.replace("MGMNT", "MANAGEMENT")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"aaron, jeffery m",POLICE OFFICER,POLICE,$84450.00
2,"aaron, karina",POLICE OFFICER,POLICE,$84450.00
3,"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"abad jr, vicente m",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


- A coluna do salario anual está como string porque tem um `$`
- Vamos remover ele e transformar ela em float

In [14]:
chicago["Employee Annual Salary"] = chicago["Employee Annual Salary"].str.replace("$", "").astype(float)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"aaron, jeffery m",POLICE OFFICER,POLICE,84450.0
2,"aaron, karina",POLICE OFFICER,POLICE,84450.0
3,"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"abad jr, vicente m",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


- Criando um filtro pra procurar posições que contem a palavra `WATER`

In [16]:
mask_water = chicago["Position Title"].str.contains("WATER")
chicago[mask_water].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MANAGEMENT,90744.0
554,"aluise, vincent g",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0
671,"ander, perry a",WATER CHEMIST II,WATER MANAGEMENT,82044.0
685,"anderson, andrew j",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MANAGEMENT,109272.0
702,"anderson, donald",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0


- Criando um filtro que procura posições que **começam** com `WATER`

In [17]:
mask_water = chicago["Position Title"].str.startswith("WATER")
chicago[mask_water].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"aaron, elvia j",WATER RATE TAKER,WATER MANAGEMENT,90744.0
671,"ander, perry a",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1054,"ashley, karma t",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1079,"atkins, joanna m",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1181,"azeem, mohammed a",WATER CHEMIST II,WATER MANAGEMENT,53172.0


- Criando um filtro para posições que terminam com `SPECIALIST`

In [18]:
mask_spe = chicago["Position Title"].str.endswith("SPECIALIST")
chicago[mask_spe].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
308,"alarcon, luis j",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,81948.0
422,"allain, carolyn",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,89880.0
705,"anderson, edward m",SR PROCUREMENT SPECIALIST,PROCUREMENT,91476.0
1163,"ayala jr, juan",FIELD SANITATION SPECIALIST,STREETS & SAN,78948.0
1558,"barrett, barbara j",TECHNICAL TRAINING SPECIALIST,POLICE,94200.0


- Também podemos aplicar o método para os index

In [21]:
chicago = chicago.set_index("Name")
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"aaron, elvia j",WATER RATE TAKER,WATER MANAGEMENT,90744.0
"aaron, jeffery m",POLICE OFFICER,POLICE,84450.0
"aaron, karina",POLICE OFFICER,POLICE,84450.0
"aaron, kimberlei r",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"abad jr, vicente m",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [23]:
chicago.index = chicago.index.str.upper()
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


- O mesmo pode ser feito para as colunas

In [25]:
chicago.columns = chicago.columns.str.upper()
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


- Obtendo os 3 nomes mais comuns usando os métodos `split` e `value_counts`:

In [30]:
chicago.index.str.split(",").str.get(0).value_counts()

WILLIAMS        293
JOHNSON         244
SMITH           241
BROWN           185
JONES           183
               ... 
VASILOPOULOS      1
SAMARRIPA         1
STRAINIS          1
BOUCK             1
JUDON             1
Name: Name, Length: 13829, dtype: int64