In [2]:
import polars as pl
from warnings import filterwarnings

filterwarnings('ignore')

df = pl.read_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\business.parquet")

In [3]:
"""
now we study about the joining and merging so 

df.join (outter, inner, left, right) all joining 
df.join_asof
df.join_where
df.insert_column
df.merge_sorted

and any kind of joining is avaivale on the polars 
"""

'\nnow we study about the joining and merging so \n\ndf.join (outter, inner, left, right) all joining \ndf.join_asof\ndf.join_where\ndf.insert_column\ndf.merge_sorted\n\nand any kind of joining is avaivale on the polars \n'

In [3]:
business_df = pl.read_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\business.parquet")
checkin_df = pl.scan_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\checkin.parquet")
review_df = pl.scan_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\review.parquet")
tip_df = pl.scan_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\tip.parquet")
user_df = pl.scan_parquet(r"C:\Users\Rudra\Desktop\yelp\parquet-data\yelp_user.parquet")

In [5]:
business_df.collect_schema()

Schema([('business_id', String),
        ('name', String),
        ('address', String),
        ('city', String),
        ('state', String),
        ('postal_code', String),
        ('latitude', Float64),
        ('longitude', Float64),
        ('stars', Float64),
        ('review_count', Int64),
        ('is_open', Int64),
        ('attributes', String),
        ('categories', String),
        ('hours', String)])

In [6]:
checkin_df.collect_schema()

Schema([('business_id', String), ('date', String)])

In [7]:
review_df.collect_schema()

Schema([('review_id', String),
        ('user_id', String),
        ('business_id', String),
        ('stars', Int64),
        ('useful', Int64),
        ('funny', Int64),
        ('cool', Int64),
        ('text', String),
        ('date', String)])

In [8]:
tip_df.collect_schema()

Schema([('user_id', String),
        ('business_id', String),
        ('text', String),
        ('date', String),
        ('compliment_count', Int64)])

In [9]:
user_df.collect_schema()

Schema([('user_id', String),
        ('name', String),
        ('review_count', Int64),
        ('yelping_since', String),
        ('useful', Int64),
        ('funny', Int64),
        ('cool', Int64),
        ('elite', String),
        ('friends', String),
        ('fans', Int64),
        ('average_stars', Float64),
        ('compliment_hot', Int64),
        ('compliment_more', Int64),
        ('compliment_profile', Int64),
        ('compliment_cute', Int64),
        ('compliment_list', Int64),
        ('compliment_note', Int64),
        ('compliment_plain', Int64),
        ('compliment_cool', Int64),
        ('compliment_funny', Int64),
        ('compliment_writer', Int64),
        ('compliment_photos', Int64)])

# <strong style="color:#5e17eb">  1. Check & Match</strong>


**⚡ Notes for Big Data Efficiency**
- Use `literal=True` for speed when not using regex.

- Use `.contains_any()` for multiple search terms.

- Chain filters in lazy mode with `.scan_parquet()` or `.scan_csv()` to save RAM.

- Avoid applying regex on *entire dataset* unless really needed.



## <strong style="color:#5e17eb"> contains(pattern, literal=false) </strong>
*Purpose:* Check if a string contains a substring or matches a regex pattern.

- `literal=True` → match exact substring

- `literal=False` (default) → pattern is regex
- https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.contains.html

In [None]:
# Businesses in the "categories" column that contain "Doctors"
business_df.filter(
    pl.col("categories").str.contains("Doctors", literal=True)
).select(["name", "categories"]).sample(3)

name,categories
str,str
"""St Alphonsus Urgent Care""","""Hospitals, Doctors, Family Pra…"
"""Aesthetic Dermatology Associat…","""Medical Spas, Doctors, Laser H…"
"""HonorHealth | FastMed Urgent C…","""Urgent Care, Health & Medical,…"


## <strong style="color:#5e17eb">  contains_any([patterns])</strong>


In [30]:
patterns = ["Doctors", "Acupuncture", "Nutritionists"]

business_df.filter(
    pl.col("categories").str.contains_any(patterns)
).select(["name", "categories"]).sample(3)


name,categories
str,str
"""ATI Physical Therapy""","""Occupational Therapy, Rehabili…"
"""Awaken The Glow with Jourdan R…","""Supernatural Readings, Arts & …"
"""Forefront Dermatology""","""Doctors, Dermatologists, Healt…"


## <strong style="color:#5e17eb">starts_with(prefix) &  ends_with(suffix)</strong>

- **Starts with**: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.bin.starts_with.html#polars.Expr.bin.starts_with
- **Ends with** : https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.bin.ends_with.html#polars.Expr.bin.ends_with

In [32]:
business_df.filter(
    pl.col("city").str.starts_with("san")
)

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str
"""Yoy4h6G8S2F0_3-WvS8y1A""","""Arrow Electric""","""1020 State St""","""santa Barbara""","""CA""","""93101""",34.422213,-119.702331,4.5,15,1,"""{'BusinessAcceptsCreditCards':…","""Electricians, Home Services, U…","""{'Saturday': '8:0-8:15'}"""


In [38]:
business_df.filter(
    pl.col("city").str.ends_with("c")
).limit(1)

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str
"""KArezUwWY4Tlr2UCrAyREA""","""Stonewater Spa Salon & Boutiqu…","""3 Plaza Frontenac""","""Frontenac""","""MO""","""63131""",38.629942,-90.407369,3.5,27,1,"""{'RestaurantsPriceRange2': '3'…","""Hair Salons, Nail Salons, Mass…","""{'Monday': '9:0-18:0', 'Tuesda…"


## <strong style="color:#5e17eb"> find(pattern, literal=False) </strong>


In [40]:
# Example: Position of word "Doctors" in categories
business_df.select([
    pl.col("categories"),
    pl.col("categories").str.find("Doctors", literal=True).alias("position")
])


categories,position
str,u32
"""Doctors, Traditional Chinese M…",0
"""Shipping Centers, Local Servic…",
"""Department Stores, Shopping, F…",
"""Restaurants, Food, Bubble Tea,…",
"""Brewpubs, Breweries, Food""",
…,…
"""Nail Salons, Beauty & Spas""",
"""Pets, Nurseries & Gardening, P…",
"""Shopping, Jewelry, Piercing, T…",
"""Fitness/Exercise Equipment, Ey…",


In [52]:
business_df.select([
    pl.col("categories"),
    pl.col("categories").str.find("Doctors", literal=True).alias("position")
]).filter(pl.col("position") > 3).sample(3)

categories,position
str,u32
"""Optometrists, Ophthalmologists…",53
"""Gastroenterologist, Health & M…",38
"""Dermatologists, Doctors, Healt…",16


## <strong style="color:#5e17eb"> Summary </strong>


Apply patterns, detect substrings, or filter rows.

- `contains(pattern, literal=False)` – regex or literal match

- `contains_any([patterns])` – match any from list; efficient scanning 
- `starts_with(prefix)` / `ends_with(suffix`) – quick prefix/suffix tests 
- `find(pattern, literal=False)` – position of first match (useful instead of boolean)

# <strong style="color:#5e17eb"> 2. Extract & Count </strong>


## <strong style="color:#5e17eb"> extract(, group_index) </strong>


In [58]:
business_df.select([
    pl.col("name"),
    pl.col("name").str.extract(r"(\w+)", group_index=1).alias("first_name")
])

name,first_name
str,str
"""Abby Rappoport, LAC, CMQ""","""Abby"""
"""The UPS Store""","""The"""
"""Target""","""Target"""
"""St Honore Pastries""","""St"""
"""Perkiomen Valley Brewery""","""Perkiomen"""
…,…
"""Binh's Nails""","""Binh"""
"""Wild Birds Unlimited""","""Wild"""
"""Claire's Boutique""","""Claire"""
"""Cyclery & Fitness Center""","""Cyclery"""


## <strong style="color:#5e17eb"> extract_all(pattern) </strong>
-  Extract all matches of a regex into a list.

In [None]:
# # Extract all words starting with a capital letter in categories
patterns = r"\b[A-Z][a-zA-Z]+\b"

business_df.select([
    pl.col("categories"),
    pl.col("categories").str.extract_all(patterns).alias("capital_words")
]).sample(3)

categories,capital_word
str,list[str]
"""Doctors, Traditional Chinese M…","[""Doctors"", ""Traditional"", … ""Nutritionists""]"
"""Shipping Centers, Local Servic…","[""Shipping"", ""Centers"", … ""Services""]"
"""Department Stores, Shopping, F…","[""Department"", ""Stores"", … ""Stores""]"
"""Restaurants, Food, Bubble Tea,…","[""Restaurants"", ""Food"", … ""Bakeries""]"
"""Brewpubs, Breweries, Food""","[""Brewpubs"", ""Breweries"", ""Food""]"
…,…
"""Nail Salons, Beauty & Spas""","[""Nail"", ""Salons"", … ""Spas""]"
"""Pets, Nurseries & Gardening, P…","[""Pets"", ""Nurseries"", … ""Shopping""]"
"""Shopping, Jewelry, Piercing, T…","[""Shopping"", ""Jewelry"", … ""Fashion""]"
"""Fitness/Exercise Equipment, Ey…","[""Fitness"", ""Exercise"", … ""Bikes""]"


## <strong style="color:#5e17eb"> extract_groups(pattern) </strong>
-  Extract all capture groups from the first match as separate columns.
-  https://regex101.com/

In [61]:
# Extract latitude and longitude from a string (if combined in one field)
sample_df = pl.DataFrame({"coords": ["34.4266787,-119.7111968"]})

sample_df.select(
    pl.col("coords").str.extract_groups(r"([-]?\d+\.\d+),([-]?\d+\.\d+)")
)


coords
struct[2]
"{""34.4266787"",""-119.7111968""}"


## <strong style="color:#5e17eb"> count_matches(pattern) </strong>
- As the name suggest how many time matches occur

In [65]:
business_df.select([
    (pl.col("categories")),
    (pl.col("categories").str.count_matches(r",")).alias("comma_count"),
    (pl.col("categories").str.count_matches(r",") + 1).alias("num_categories")
])

categories,comma_count,num_categories
str,u32,u32
"""Doctors, Traditional Chinese M…",5,6
"""Shipping Centers, Local Servic…",4,5
"""Department Stores, Shopping, F…",5,6
"""Restaurants, Food, Bubble Tea,…",4,5
"""Brewpubs, Breweries, Food""",2,3
…,…,…
"""Nail Salons, Beauty & Spas""",1,2
"""Pets, Nurseries & Gardening, P…",6,7
"""Shopping, Jewelry, Piercing, T…",6,7
"""Fitness/Exercise Equipment, Ey…",4,5


## <strong style="color:#5e17eb"> len_chars() / len_bytes() </strong>

- Get the length of strings in bytes (important for non-ASCII text).
- Alias for .len_chars() — counts number of characters.

In [72]:
business_df.select([
    (pl.col("name")),
    (pl.col("name").str.len_bytes()).alias("byte_length"),
    (pl.col("name").str.len_chars()).alias("length_char"),
])

name,byte_length,length_char
str,u32,u32
"""Abby Rappoport, LAC, CMQ""",24,24
"""The UPS Store""",13,13
"""Target""",6,6
"""St Honore Pastries""",18,18
"""Perkiomen Valley Brewery""",24,24
…,…,…
"""Binh's Nails""",12,12
"""Wild Birds Unlimited""",20,20
"""Claire's Boutique""",17,17
"""Cyclery & Fitness Center""",24,24


## <strong style="color:#5e17eb"> Summary </strong>

Pull out substrings or count occurrences.

- `extract(pattern[, group_index])` – first regex group

- `extract_all(pattern)` / `extract_groups(pattern)` – all matches or all capture groups

- `count_match(pattern)` / `count_matches(pattern)` – count regex matches

- `len_chars()` / `len_bytes() `/ `lengths()` – string lengths in chars or bytes

# <strong style="color:#5e17eb"> 3. Modify & Clean </strong>


## <strong style="color:#5e17eb"> to_lowercase() / to_uppercase() / to_titlecase() </strong>


In [73]:
business_df.select([
    (pl.col("city")),
    (pl.col("city").str.to_lowercase()).alias("city_lower"),
    (pl.col("city").str.to_uppercase()).alias("city_upper"),
    (pl.col("city").str.to_titlecase()).alias("city_title"),

])

city,city_lower,city_upper,city_title
str,str,str,str
"""Santa Barbara""","""santa barbara""","""SANTA BARBARA""","""Santa Barbara"""
"""Affton""","""affton""","""AFFTON""","""Affton"""
"""Tucson""","""tucson""","""TUCSON""","""Tucson"""
"""Philadelphia""","""philadelphia""","""PHILADELPHIA""","""Philadelphia"""
"""Green Lane""","""green lane""","""GREEN LANE""","""Green Lane"""
…,…,…,…
"""Edmonton""","""edmonton""","""EDMONTON""","""Edmonton"""
"""Nashville""","""nashville""","""NASHVILLE""","""Nashville"""
"""Indianapolis""","""indianapolis""","""INDIANAPOLIS""","""Indianapolis"""
"""Edwardsville""","""edwardsville""","""EDWARDSVILLE""","""Edwardsville"""


## <strong style="color:#5e17eb">strip() / strip_chars(chars="xyz) / strip_prefix() / strip_suffix() </strong>


In [75]:
business_df.select([
    (pl.col("address")),
    (pl.col("address").str.strip_chars()).alias("address_trimmed")
])

address,address_trimmed
str,str
"""1616 Chapala St, Ste 2""","""1616 Chapala St, Ste 2"""
"""87 Grasso Plaza Shopping Cente…","""87 Grasso Plaza Shopping Cente…"
"""5255 E Broadway Blvd""","""5255 E Broadway Blvd"""
"""935 Race St""","""935 Race St"""
"""101 Walnut St""","""101 Walnut St"""
…,…
"""3388 Gateway Blvd""","""3388 Gateway Blvd"""
"""2813 Bransford Ave""","""2813 Bransford Ave"""
"""6020 E 82nd St, Ste 46""","""6020 E 82nd St, Ste 46"""
"""2472 Troy Rd""","""2472 Troy Rd"""


Remove specific characters from start and end.

In [None]:
business_df.select([
    (pl.col("postal_code")),
    (pl.col("postal_code").str.strip_chars(characters="-")).alias("postal_trimmed")
])

postal_code,d
str,str
"""93101""","""93101"""
"""63123""","""63123"""
"""85711""","""85711"""
"""19107""","""19107"""
"""18054""","""18054"""
…,…
"""T6J 5H2""","""T6J 5H2"""
"""37204""","""37204"""
"""46250""","""4625"""
"""62025""","""62025"""


In [None]:
business_df.select([
    pl.col("name"),
    pl.col("name").str.strip_prefix("st").alias("name_no_prefix")
]).sample(3)

name,address_no_prefix
str,str
"""The Tennessean""","""The Tennessean"""
"""Medora's Mecca""","""Medora's Mecca"""
"""Salinas 2""","""Salinas 2"""


In [83]:
business_df.select([
    pl.col("address"),
    pl.col("address").str.strip_suffix(" St").alias("address_no_suffix")
]).sample(3)


address,address_no_suffix
str,str
"""1991 Sproul Rd""","""1991 Sproul Rd"""
"""435 W Chew Ave""","""435 W Chew Ave"""
"""500 Broad St, Ste 4""","""500 Broad St, Ste 4"""


## <strong style="color:#5e17eb">  strip_chars_start() / strip_chars_end() / zfill(width) </strong>


In [None]:
business_df.select([
    pl.col("address"),
    pl.col("address").str.strip_chars_start().alias("address_left_trim"),
    pl.col("address").str.strip_chars_end().alias("address_right_trim")
])

address,address_left_trim,address_right_trim
str,str,str
"""1616 Chapala St, Ste 2""","""1616 Chapala St, Ste 2""","""1616 Chapala St, Ste 2"""
"""87 Grasso Plaza Shopping Cente…","""87 Grasso Plaza Shopping Cente…","""87 Grasso Plaza Shopping Cente…"
"""5255 E Broadway Blvd""","""5255 E Broadway Blvd""","""5255 E Broadway Blvd"""
"""935 Race St""","""935 Race St""","""935 Race St"""
"""101 Walnut St""","""101 Walnut St""","""101 Walnut St"""
…,…,…
"""3388 Gateway Blvd""","""3388 Gateway Blvd""","""3388 Gateway Blvd"""
"""2813 Bransford Ave""","""2813 Bransford Ave""","""2813 Bransford Ave"""
"""6020 E 82nd St, Ste 46""","""6020 E 82nd St, Ste 46""","""6020 E 82nd St, Ste 46"""
"""2472 Troy Rd""","""2472 Troy Rd""","""2472 Troy Rd"""


- Pad the string with leading zeros to a fixed width.
- Ensuring fixed-length IDs, zip codes, or numeric codes.

In [105]:
business_df.select([
    pl.col("postal_code"),
    pl.col("postal_code").str.zfill(10).alias("postal_zfill")
])


postal_code,postal_zfill
str,str
"""93101""","""0000093101"""
"""63123""","""0000063123"""
"""85711""","""0000085711"""
"""19107""","""0000019107"""
"""18054""","""0000018054"""
…,…
"""T6J 5H2""","""000T6J 5H2"""
"""37204""","""0000037204"""
"""46250""","""0000046250"""
"""62025""","""0000062025"""


## <strong style="color:#5e17eb"> Summary </strong>

Transform, replace or trim text quickly.

- `to_lowercase()`, `to_uppercase()`, `to_titlecase()` – convert case 


- `strip()` / `strip_chars()` / `strip_prefix()` / `strip_suffix()` – trimming whitespace or specific chars 


- `lstrip()`, `rstrip()` – left- or right-only trims

- `zfill(width)` – pad with zeros at start (e.g. for fixed-length codes)


# <strong style="color:#5e17eb">4. Replace & Transform  </strong>


## <strong style="color:#5e17eb">  replace(pattern, value, literal=False, n=1)</strong>

- https://docs.rs/regex/latest/regex/
- `n` is controls how many matches Polars will replace.

In [109]:
# Replace first comma in categories with a slash
business_df.select([
    (pl.col("categories")),
    (pl.col("categories").str.replace(",", "/",n=2,  literal=True)).alias("categories_first_replace")
])

categories,categories_first_replace
str,str
"""Doctors, Traditional Chinese M…","""Doctors/ Traditional Chinese M…"
"""Shipping Centers, Local Servic…","""Shipping Centers/ Local Servic…"
"""Department Stores, Shopping, F…","""Department Stores/ Shopping/ F…"
"""Restaurants, Food, Bubble Tea,…","""Restaurants/ Food/ Bubble Tea,…"
"""Brewpubs, Breweries, Food""","""Brewpubs/ Breweries/ Food"""
…,…
"""Nail Salons, Beauty & Spas""","""Nail Salons/ Beauty & Spas"""
"""Pets, Nurseries & Gardening, P…","""Pets/ Nurseries & Gardening/ P…"
"""Shopping, Jewelry, Piercing, T…","""Shopping/ Jewelry/ Piercing, T…"
"""Fitness/Exercise Equipment, Ey…","""Fitness/Exercise Equipment/ Ey…"


## <strong style="color:#5e17eb"> replace_all(pattern, value, literal=False) </strong>
- replace all matches at a time
- Global string replacement, e.g., formatting separators, cleaning up symbols.




In [111]:
# Replace all commas with " | " in categories
business_df.select([
    (pl.col("categories")),
    (pl.col("categories").str.replace_all(",", " | ", literal=True)).alias("categories_all_replace")
])

categories,categories_all_replace
str,str
"""Doctors, Traditional Chinese M…","""Doctors | Traditional Chinese…"
"""Shipping Centers, Local Servic…","""Shipping Centers | Local Serv…"
"""Department Stores, Shopping, F…","""Department Stores | Shopping …"
"""Restaurants, Food, Bubble Tea,…","""Restaurants | Food | Bubble …"
"""Brewpubs, Breweries, Food""","""Brewpubs | Breweries | Food"""
…,…
"""Nail Salons, Beauty & Spas""","""Nail Salons | Beauty & Spas"""
"""Pets, Nurseries & Gardening, P…","""Pets | Nurseries & Gardening …"
"""Shopping, Jewelry, Piercing, T…","""Shopping | Jewelry | Piercin…"
"""Fitness/Exercise Equipment, Ey…","""Fitness/Exercise Equipment | …"


## <strong style="color:#5e17eb">replace_many(patterns, replace_with_list)  </strong>


In [117]:
# Replace certain words in categories
patterns = ["Doctors", "Health & Medical", "Acupuncture"]
replacements = ["Physicians", "Healthcare", "Needle Therapy"]

business_df.select([
    pl.col("categories"),
    pl.col("categories").str.replace_many(patterns, replacements).alias("categories_bulk_replace")
]).sample(3)


categories,categories_bulk_replace
str,str
"""Home & Garden, Furniture Store…","""Home & Garden, Furniture Store…"
"""Vietnamese, Restaurants""","""Vietnamese, Restaurants"""
"""Medical Spas, Aestheticians, B…","""Medical Spas, Aestheticians, B…"


## <strong style="color:#5e17eb"> reverse() </strong>


In [118]:
# Example: Reverse business names
business_df.select([
    pl.col("name"),
    pl.col("name").str.reverse().alias("name_reversed")
])


name,name_reversed
str,str
"""Abby Rappoport, LAC, CMQ""","""QMC ,CAL ,tropoppaR ybbA"""
"""The UPS Store""","""erotS SPU ehT"""
"""Target""","""tegraT"""
"""St Honore Pastries""","""seirtsaP eronoH tS"""
"""Perkiomen Valley Brewery""","""yrewerB yellaV nemoikreP"""
…,…
"""Binh's Nails""","""sliaN s'hniB"""
"""Wild Birds Unlimited""","""detimilnU sdriB dliW"""
"""Claire's Boutique""","""euqituoB s'erialC"""
"""Cyclery & Fitness Center""","""retneC ssentiF & yrelcyC"""


## <strong style="color:#5e17eb"> concat(delimiter, ignore_nulls=True) </strong>


In [None]:
# Merge city and state into "City, State"
business_df.select([
    pl.concat_str(["city", "state"], separator=", ").alias("city_state")
])


city_state
str
"""Santa Barbara, CA"""
"""Affton, MO"""
"""Tucson, AZ"""
"""Philadelphia, PA"""
"""Green Lane, PA"""
…
"""Edmonton, AB"""
"""Nashville, TN"""
"""Indianapolis, IN"""
"""Edwardsville, IL"""


## <strong style="color:#5e17eb"> Summary </strong>


Modify text with regex or literal patterns.

- `replace(pattern, value, literal=False, n=1)` – first match

- `replace_all(pattern, value, literal=False)` – globally replace

- `replace_many(patterns, replace_with_list`)` – multiple replacements in one go 


- `reverse()` / `concat(delimiter, ignore_nulls=True)` – reverse string or merge across rows

# <strong style="color:#5e17eb"> 5. Split & Slice </strong>


## <strong style="color:#5e17eb">  slice(offset[, length])</strong>


In [122]:
# Last 3 characters of postal code
business_df.select([
    pl.col("postal_code"),
    pl.col("postal_code").str.slice(-3).alias("zip_last3")
]).sample(3)

postal_code,zip_last3
str,str
"""19382""","""382"""
"""63026""","""026"""
"""83642""","""642"""


## <strong style="color:#5e17eb">head(n) / tail(n) /  </strong>


In [123]:
business_df.head(2)

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str
"""Pns2l4eNsfO8kk83dixA6A""","""Abby Rappoport, LAC, CMQ""","""1616 Chapala St, Ste 2""","""Santa Barbara""","""CA""","""93101""",34.426679,-119.711197,5.0,7,0,"""{'ByAppointmentOnly': 'True'}""","""Doctors, Traditional Chinese M…",
"""mpf3x-BjTdTEA3yCZrAYPw""","""The UPS Store""","""87 Grasso Plaza Shopping Cente…","""Affton""","""MO""","""63123""",38.551126,-90.335695,3.0,15,1,"""{'BusinessAcceptsCreditCards':…","""Shipping Centers, Local Servic…","""{'Monday': '0:0-0:0', 'Tuesday…"


In [124]:
business_df.tail(2)

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str
"""mtGm22y5c2UHNXDFAjaPNw""","""Cyclery & Fitness Center""","""2472 Troy Rd""","""Edwardsville""","""IL""","""62025""",38.782351,-89.950558,4.0,24,1,"""{'BusinessParking': ""{'garage'…","""Fitness/Exercise Equipment, Ey…","""{'Monday': '9:0-20:0', 'Tuesda…"
"""jV_XOycEzSlTx-65W906pg""","""Sic Ink""","""238 Apollo Beach Blvd""","""Apollo beach""","""FL""","""33572""",27.771002,-82.39491,4.5,9,1,"""{'WheelchairAccessible': 'True…","""Beauty & Spas, Permanent Makeu…","""{'Tuesday': '12:0-19:0', 'Wedn…"


## <strong style="color:#5e17eb">  split(by, inclusive=False) / split_exact(by, n) / splitn(by, n) </strong>


In [125]:
# Split categories into list
business_df.select([
    pl.col("categories"),
    pl.col("categories").str.split(", ").alias("categories_list")
])


categories,categories_list
str,list[str]
"""Doctors, Traditional Chinese M…","[""Doctors"", ""Traditional Chinese Medicine"", … ""Nutritionists""]"
"""Shipping Centers, Local Servic…","[""Shipping Centers"", ""Local Services"", … ""Printing Services""]"
"""Department Stores, Shopping, F…","[""Department Stores"", ""Shopping"", … ""Furniture Stores""]"
"""Restaurants, Food, Bubble Tea,…","[""Restaurants"", ""Food"", … ""Bakeries""]"
"""Brewpubs, Breweries, Food""","[""Brewpubs"", ""Breweries"", ""Food""]"
…,…
"""Nail Salons, Beauty & Spas""","[""Nail Salons"", ""Beauty & Spas""]"
"""Pets, Nurseries & Gardening, P…","[""Pets"", ""Nurseries & Gardening"", … ""Shopping""]"
"""Shopping, Jewelry, Piercing, T…","[""Shopping"", ""Jewelry"", … ""Fashion""]"
"""Fitness/Exercise Equipment, Ey…","[""Fitness/Exercise Equipment"", ""Eyewear & Opticians"", … ""Bikes""]"


In [None]:
# Example: Split address into first and second part 
business_df.select(
    pl.col("address").str.split_exact(",", 1).alias("address_parts")
)


address_parts
struct[2]
"{""1616 Chapala St"","" Ste 2""}"
"{""87 Grasso Plaza Shopping Center"",null}"
"{""5255 E Broadway Blvd"",null}"
"{""935 Race St"",null}"
"{""101 Walnut St"",null}"
…
"{""3388 Gateway Blvd"",null}"
"{""2813 Bransford Ave"",null}"
"{""6020 E 82nd St"","" Ste 46""}"
"{""2472 Troy Rd"",null}"


In [131]:
# Split categories into max 3 parts
business_df.select([
    pl.col("categories"),
    pl.col("categories").str.splitn(", ", 2).alias("categories_top3")
])


categories,categories_top3
str,struct[2]
"""Doctors, Traditional Chinese M…","{""Doctors"",""Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists""}"
"""Shipping Centers, Local Servic…","{""Shipping Centers"",""Local Services, Notaries, Mailbox Centers, Printing Services""}"
"""Department Stores, Shopping, F…","{""Department Stores"",""Shopping, Fashion, Home & Garden, Electronics, Furniture Stores""}"
"""Restaurants, Food, Bubble Tea,…","{""Restaurants"",""Food, Bubble Tea, Coffee & Tea, Bakeries""}"
"""Brewpubs, Breweries, Food""","{""Brewpubs"",""Breweries, Food""}"
…,…
"""Nail Salons, Beauty & Spas""","{""Nail Salons"",""Beauty & Spas""}"
"""Pets, Nurseries & Gardening, P…","{""Pets"",""Nurseries & Gardening, Pet Stores, Hobby Shops, Bird Shops, Home & Garden, Shopping""}"
"""Shopping, Jewelry, Piercing, T…","{""Shopping"",""Jewelry, Piercing, Toy Stores, Beauty & Spas, Accessories, Fashion""}"
"""Fitness/Exercise Equipment, Ey…","{""Fitness/Exercise Equipment"",""Eyewear & Opticians, Shopping, Sporting Goods, Bikes""}"


## <strong style="color:#5e17eb"> Summary </strong>


Break strings into parts.

- `slice(offset[, length])` – substring by position (supports negatives) 


- `head(n)`, `tail(n)` – first or last n characters

- `split(by, inclusive=False)`, `split_exact(by, n)`, `splitn(by, n)` – divide string into list of parts

# <strong style="color:#5e17eb"> 6. Parsing & Conversion </strong>


## <strong style="color:#5e17eb"> strptime(dtype, format, strict=True) </strong>

In [132]:
# Example: Suppose we had a string date column
df_dates = pl.DataFrame({
    "open_date_str": ["2023-05-21", "2022-12-01", "2024-01-15"]
})

df_dates.select([
    pl.col("open_date_str").str.strptime(pl.Date, "%Y-%m-%d").alias("open_date")
])


open_date
date
2023-05-21
2022-12-01
2024-01-15


## <strong style="color:#5e17eb">  to_date() / to_datetime() / to_time()</strong>

In [133]:
df_dates.select([
    pl.col("open_date_str").str.to_date("%Y-%m-%d").alias("open_date")
])


open_date
date
2023-05-21
2022-12-01
2024-01-15


## <strong style="color:#5e17eb"> to_integer(base=10, strict=True) / parse_int() </strong>

In [134]:
# Convert postal code string to integer
business_df.select([
    pl.col("postal_code"),
    pl.col("postal_code").str.to_integer(strict=False).alias("postal_code_int")
])


postal_code,postal_code_int
str,i64
"""93101""",93101
"""63123""",63123
"""85711""",85711
"""19107""",19107
"""18054""",18054
…,…
"""T6J 5H2""",
"""37204""",37204
"""46250""",46250
"""62025""",62025


## <strong style="color:#5e17eb"> to_decimal() </strong>

In [6]:
df_price = pl.DataFrame({"price_str": ["12.50", "100.05", "7.00"]})

print(df_price.dtypes)

df_price.select([
    pl.col("price_str").str.to_decimal().alias("price_decimal")
])


[String]


price_decimal
"decimal[*,2]"
12.5
100.05
7.0


In [9]:
business_df['attributes']

attributes
str
"""{'ByAppointmentOnly': 'True'}"""
"""{'BusinessAcceptsCreditCards':…"
"""{'BikeParking': 'True', 'Busin…"
"""{'RestaurantsDelivery': 'False…"
"""{'BusinessAcceptsCreditCards':…"
…
"""{'ByAppointmentOnly': 'False',…"
"""{'BusinessAcceptsCreditCards':…"
"""{'RestaurantsPriceRange2': '1'…"
"""{'BusinessParking': ""{'garage'…"


In [15]:
business_df.with_columns(
    pl.col("attributes")
      .str.replace_all("'", '"')  # make it valid JSON
      .alias("attributes_json")
)['attributes_json'].to_list()

['{"ByAppointmentOnly": "True"}',
 '{"BusinessAcceptsCreditCards": "True"}',
 '{"BikeParking": "True", "BusinessAcceptsCreditCards": "True", "RestaurantsPriceRange2": "2", "CoatCheck": "False", "RestaurantsTakeOut": "False", "RestaurantsDelivery": "False", "Caters": "False", "WiFi": "u"no"", "BusinessParking": "{"garage": False, "street": False, "validated": False, "lot": True, "valet": False}", "WheelchairAccessible": "True", "HappyHour": "False", "OutdoorSeating": "False", "HasTV": "False", "RestaurantsReservations": "False", "DogsAllowed": "False", "ByAppointmentOnly": "False"}',
 '{"RestaurantsDelivery": "False", "OutdoorSeating": "False", "BusinessAcceptsCreditCards": "False", "BusinessParking": "{"garage": False, "street": True, "validated": False, "lot": False, "valet": False}", "BikeParking": "True", "RestaurantsPriceRange2": "1", "RestaurantsTakeOut": "True", "ByAppointmentOnly": "False", "WiFi": "u"free"", "Alcohol": "u"none"", "Caters": "True"}',
 '{"BusinessAcceptsCreditCar

In [17]:
import json

def safe_json_parse(s: str):
    if s is None:
        return None
    try:
        # Replace single quotes with double quotes
        s = s.replace("'", '"')
        return json.loads(s)
    except Exception:
        return None  # if still fails, return None

# Apply in Polars using map_elements
business_df_decoded = business_df.with_columns(
    pl.col("attributes").map_elements(safe_json_parse, return_dtype=pl.Object).alias("attributes_dict")
)

print(business_df_decoded.select(["attributes", "attributes_dict"]).head(5))


shape: (5, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ attributes                      ┆ attributes_dict                 │
│ ---                             ┆ ---                             │
│ str                             ┆ object                          │
╞═════════════════════════════════╪═════════════════════════════════╡
│ {'ByAppointmentOnly': 'True'}   ┆ {'ByAppointmentOnly': 'True'}   │
│ {'BusinessAcceptsCreditCards':… ┆ {'BusinessAcceptsCreditCards':… │
│ {'BikeParking': 'True', 'Busin… ┆ null                            │
│ {'RestaurantsDelivery': 'False… ┆ null                            │
│ {'BusinessAcceptsCreditCards':… ┆ null                            │
└─────────────────────────────────┴─────────────────────────────────┘


In [24]:
# Convert single quotes to double quotes
business_df_clean = business_df.with_columns(
    pl.col("attributes")
      .str.replace_all("'", '"') 
      .alias("attributes_json")
)

business_df_clean['attributes_json']

attributes_json
str
"""{""ByAppointmentOnly"": ""True""}"""
"""{""BusinessAcceptsCreditCards"":…"
"""{""BikeParking"": ""True"", ""Busin…"
"""{""RestaurantsDelivery"": ""False…"
"""{""BusinessAcceptsCreditCards"":…"
…
"""{""ByAppointmentOnly"": ""False"",…"
"""{""BusinessAcceptsCreditCards"":…"
"""{""RestaurantsPriceRange2"": ""1""…"
"""{""BusinessParking"": ""{""garage""…"


json_path_match

In [26]:
# $.ByAppointmentOnly means: in the JSON, get the value for "ByAppointmentOnly".
business_df_with_field = business_df_clean.with_columns(
    pl.col("attributes_json")
      .str.json_path_match("$.ByAppointmentOnly")
      .alias("ByAppointmentOnly")
)
business_df_with_field['ByAppointmentOnly']

ByAppointmentOnly
str
"""True"""
""
""
""
""
…
"""False"""
""
""
""


## <strong style="color:#5e17eb"> Summary </strong>


Convert string to numeric, date/time, JSON.

- `strptime(dtype, format, strict=True)` – parse date/datetime/time

- `to_date()`, `to_datetime()`, `to_time()` – shorthand type conversion

- `to_integer(base=10, strict=True)`, `parse_int(base, strict)` – convert to integer

- `to_decimal()` – string to Decimal type

- `json_decode()`, `json_extract()`, `json_path_match()` – parse JSON strings within cells 
- json_decode : https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.Expr.str.json_decode.html
- json_extract : https://docs.pola.rs/docs/python/version/0.19/reference/expressions/api/polars.Expr.str.json_extract.html
- json_path_match : https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.Expr.str.json_path_match.html
- Data frame : https://docs.pola.rs/api/python/stable/reference/dataframe/index.html

# <strong style="color:#5e17eb"> 7. Summary </strong>


| Category             | Key Methods                                                    | Typical Use Case                            |
| -------------------- | -------------------------------------------------------------- | ------------------------------------------- |
| Matching & Filtering | `contains()`, `contains_any()`, `starts_with()`, `ends_with()` | Filter or flag rows by substring            |
| Extract & Count      | `extract()`, `count_matches()`, `len_chars()`                  | Pull out data and compute string metrics    |
| Cleaning Text        | `to_lowercase()`, `strip()`, `zfill()`                         | Standardize and clean raw string columns    |
| Replace Text         | `replace_all()`, `replace_many()`                              | Bulk clean or censor text data              |
| Substring / Parts    | `slice()`, `split()`, `head()`, `tail()`                       | Extract codes, IDs, tokens, etc.            |
| Parse Types          | `to_datetime()`, `to_integer()`, `json_decode()`               | Convert raw text into typed, usable formats |


<strong style="color:#5e17eb">Resources  </strong>

- https://docs.pola.rs/api/python/stable/reference/series/string.html
- https://regex101.com/

<div style="text-align: center;">
  <h4 style="
    display: inline-block;
    color: #5e17eb;
    font-family: 'Segoe UI';
    border-left: 5px solid #5e17eb;
    background-color: #F8F9F9;
    padding: 10px 20px;
    border-radius: 5px;
    text-align: left;
  ">
  <b>
    Thank You 💜
    </b>
  </h4>
</div>