# 🔤 Standardizing Text and Categorical Data

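This notebook walks you through identifying and fixing messy text and categorical values — stray whitespace, inconsistent capitalization, abbreviations, and misspellings — so that each category ends up with a single canonical spelling.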

In [1]:
# Step 1: Load the messy dataset
import pandas as pd

# Small hand-made frame in which every column holds the same few categories
# written inconsistently, so each cleaning step has something to fix.
df = pd.DataFrame(dict(
    # One state as abbreviations, full names, mixed case, and padded strings
    state=['CA', 'ca', 'California', 'Calif.', 'Cali', 'california ', '  ca'],
    # Single letters, full words, mixed case, and a misspelling ('Femail')
    gender=['M', 'F', 'Femail', 'male', 'F', 'f', 'FEMALE'],
    # Two cities with varying case, leading spaces, and abbreviations
    city=[' New York', 'new york', 'NEW YORK', 'Los Angeles', 'los angeles', 'L.A.', 'la'],
    # Yes/no answers with inconsistent case and surrounding whitespace
    response=[' Yes', 'no', 'yes ', 'No ', 'YES', 'nO', ' yes'],
))
df

Unnamed: 0,state,gender,city,response
0,CA,M,New York,Yes
1,ca,F,new york,no
2,California,Femail,NEW YORK,yes
3,Calif.,male,Los Angeles,No
4,Cali,F,los angeles,YES
5,california,f,L.A.,nO
6,ca,FEMALE,la,yes


## Step 2: Clean and standardize 'state'

In [2]:
# Trim padding and lowercase so spelling variants differ only by their text,
# then fold every remaining abbreviation into the full name 'california'.
df['state'] = (
    df['state']
    .str.strip()
    .str.lower()
    .replace(dict.fromkeys(['ca', 'calif.', 'cali'], 'california'))
)

# Tally the distinct values left after cleaning
df['state'].value_counts()

california    7
Name: state, dtype: int64

## Step 3: Clean and map 'gender' values

In [3]:
# Strip stray whitespace and lowercase first (the same normalization applied
# to the other text columns). Without the strip, a padded entry such as ' F'
# or 'male ' would not match any key below and would survive as its own
# category.
df['gender'] = df['gender'].str.strip().str.lower()

# Collapse single-letter codes and the 'femail' misspelling onto the two
# canonical labels; values already equal to 'male'/'female' are left as-is.
df['gender'] = df['gender'].replace({
    'm': 'male',
    'f': 'female',
    'femail': 'female'
})

# Tally the distinct values left after cleaning
df['gender'].value_counts()

female    5
male      2
Name: gender, dtype: int64

## Step 4: Standardize 'city' values

In [4]:
# Trim padding and lowercase the city names, then expand the two
# abbreviations of Los Angeles to the full lowercase name.
df['city'] = (
    df['city']
    .str.strip()
    .str.lower()
    .replace({'l.a.': 'los angeles', 'la': 'los angeles'})
)

# Tally the distinct values left after cleaning
df['city'].value_counts()

los angeles    4
new york       3
Name: city, dtype: int64

## Step 5: Clean 'response' values and convert to boolean

In [5]:
# Reduce every answer to a bare lowercase 'yes' or 'no'
df['response'] = (
    df['response']
    .str.strip()
    .str.lower()
)

# Derive a boolean column from the cleaned text; any value that is not
# exactly 'yes' or 'no' has no entry in the mapping and becomes NaN.
df['response_bool'] = df['response'].map({
    'yes': True,
    'no': False,
})

# Show the cleaned text next to its boolean translation
df.loc[:, ['response', 'response_bool']]

Unnamed: 0,response,response_bool
0,yes,True
1,no,False
2,yes,True
3,no,False
4,yes,True
5,no,False
6,yes,True


## Final Cleaned Dataset

In [6]:
# Display the full frame: the four cleaned text columns plus 'response_bool'
df

Unnamed: 0,state,gender,city,response,response_bool
0,california,male,new york,yes,True
1,california,female,new york,no,False
2,california,female,new york,yes,True
3,california,male,los angeles,no,False
4,california,female,los angeles,yes,True
5,california,female,los angeles,no,False
6,california,female,los angeles,yes,True
