# Manipulating text in DataFrames
Pandas calls it _manipulating textual data_, and text is one of the most predominant data types you will encounter in datasets besides integers, floats, and booleans.
Being able to process and manipulate text is useful when you need to normalize data in cells

In [None]:
# Load your dataframe
import pandas as pd
csv_url = "https://raw.githubusercontent.com/paiml/wine-ratings/main/wine-ratings.csv"
df = pd.read_csv(csv_url, index_col=0)
df.head()

In [32]:
ndf = pd.read_csv("us_births_2016_2021.csv", index_col=0)
ndf.head(10)

Unnamed: 0_level_0,State Abbreviation,Year,Gender,Education Level of Mother,Education Level Code,Number of Births,Average Age of Mother (years),Average Birth Weight (g)
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,AL,2016,F,8th grade or less,1,1052,27.8,3116.9
Alabama,AL,2016,F,9th through 12th grade with no diploma,2,3436,24.1,3040.0
Alabama,AL,2016,F,High school graduate or GED completed,3,8777,25.4,3080.0
Alabama,AL,2016,F,"Some college credit, but not a degree",4,6453,26.7,3121.9
Alabama,AL,2016,F,"Associate degree (AA, AS)",5,2227,28.9,3174.3
Alabama,AL,2016,F,"Bachelor's degree (BA, AB, BS)",6,4453,30.3,3239.0
Alabama,AL,2016,F,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",7,1910,32.0,3263.5
Alabama,AL,2016,F,"Doctorate (PhD, EdD) or Professional Degree (M...",8,487,33.1,3196.7
Alabama,AL,2016,F,Unknown or Not Stated,-9,65,27.7,3083.9
Alabama,AL,2016,M,8th grade or less,1,1188,27.6,3232.9


In [33]:
# manipulate the variety to be R for red or W for white
df["variety_short"] = df["variety"].replace({"Red Wine": "R", "White Wine": "W"})
df.head()

Unnamed: 0,name,grape,region,variety,rating,notes,variety_short,region_short
0,1000 Stories Bourbon Barrel Aged Batch Blue Ca...,,"Mendocino, California",Red Wine,91.0,"This is a very special, limited release of 100...",R,California
1,1000 Stories Bourbon Barrel Aged Gold Rush Red...,,California,Red Wine,89.0,The California Gold Rush was a period of coura...,R,California
2,1000 Stories Bourbon Barrel Aged Gold Rush Red...,,California,Red Wine,90.0,The California Gold Rush was a period of coura...,R,California
3,1000 Stories Bourbon Barrel Aged Zinfandel 2013,,"North Coast, California",Red Wine,91.0,"The wine has a deep, rich purple color. An int...",R,California
4,1000 Stories Bourbon Barrel Aged Zinfandel 2014,,California,Red Wine,90.0,Batch #004 is the first release of the 2014 vi...,R,California


In [34]:
# list of all the unique values in the Education Level of Mother column
ndf["Education Level of Mother"].value_counts()

8th grade or less                                                      612
9th through 12th grade with no diploma                                 612
High school graduate or GED completed                                  612
Some college credit, but not a degree                                  612
Associate degree (AA, AS)                                              612
Bachelor's degree (BA, AB, BS)                                         612
Master's degree (MA, MS, MEng, MEd, MSW, MBA)                          612
Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)    612
Unknown or Not Stated                                                  600
Name: Education Level of Mother, dtype: int64

In [35]:
# recoding the values in education level of mother
ndf["Education Level"] = ndf["Education Level of Mother"].replace({"8th grade or less": "less than high school", "9th through 12th grade with no diploma": "less than high school", "High school graduate or GED completed": "High school graduate or GED completed", "Some college credit, but not a degree": "Some college", "Associate degree": "Associate degree", "Bachelor's degree": "Bachelor's degree", "Master's degree (MA, MS, MEng, MEd, MSW, MBA)": "Advanced degree", "Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)": "Advanced degree", "Unknown or Not Stated": "Unknown or Not Stated"})
ndf.head(15)

Unnamed: 0_level_0,State Abbreviation,Year,Gender,Education Level of Mother,Education Level Code,Number of Births,Average Age of Mother (years),Average Birth Weight (g),Education Level
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,AL,2016,F,8th grade or less,1,1052,27.8,3116.9,less than high school
Alabama,AL,2016,F,9th through 12th grade with no diploma,2,3436,24.1,3040.0,less than high school
Alabama,AL,2016,F,High school graduate or GED completed,3,8777,25.4,3080.0,High school graduate or GED completed
Alabama,AL,2016,F,"Some college credit, but not a degree",4,6453,26.7,3121.9,Some college
Alabama,AL,2016,F,"Associate degree (AA, AS)",5,2227,28.9,3174.3,"Associate degree (AA, AS)"
Alabama,AL,2016,F,"Bachelor's degree (BA, AB, BS)",6,4453,30.3,3239.0,"Bachelor's degree (BA, AB, BS)"
Alabama,AL,2016,F,"Master's degree (MA, MS, MEng, MEd, MSW, MBA)",7,1910,32.0,3263.5,Advanced degree
Alabama,AL,2016,F,"Doctorate (PhD, EdD) or Professional Degree (M...",8,487,33.1,3196.7,Advanced degree
Alabama,AL,2016,F,Unknown or Not Stated,-9,65,27.7,3083.9,Unknown or Not Stated
Alabama,AL,2016,M,8th grade or less,1,1188,27.6,3232.9,less than high school


In [36]:
# with high confidence, split the region and keep only the last part
# warning! you could operate on the same column, or create a new one!
df["region_short"] = df["region"].str.split().str.get(-1)
df.query("region_short != 'California'").head()

Unnamed: 0,name,grape,region,variety,rating,notes,variety_short,region_short
7,12 Linajes Crianza 2014,,"Ribera del Duero, Spain",Red Wine,92.0,Red with violet hues. The aromas are very inte...,R,Spain
8,12 Linajes Reserva 2012,,"Ribera del Duero, Spain",Red Wine,94.0,"On the nose, a complex predominance of mineral...",R,Spain
9,14 Hands Cabernet Sauvignon 2010,,"Columbia Valley, Washington",Red Wine,87.0,Concentrated aromas of dark stone fruits and t...,R,Washington
10,14 Hands Cabernet Sauvignon 2011,,"Columbia Valley, Washington",Red Wine,89.0,Concentrated aromas of dark stone fruits and t...,R,Washington
11,14 Hands Cabernet Sauvignon 2015,,"Columbia Valley, Washington",Red Wine,89.0,"The 14 Hands Cabernet Sauvignon is a rich, jui...",R,Washington
