# Chapter 7: Working with Nested Data Structures

## Technical Requirements

In [167]:
import polars as pl

In [168]:
# Load the trending-videos CSV and let Polars try to parse date-like columns.
df = pl.read_csv(
    "../data/us_videos.csv",
    try_parse_dates=True,
)
# Compact column-by-column preview, limited to two sample values per column.
df.glimpse(max_items_per_column=2)

Rows: 40949
Columns: 16
$ video_id                             <str> '2kyS6SvSYSE', '1ZAPwfrtAFY'
$ trending_date                        <str> '17.14.11', '17.14.11'
$ title                                <str> 'WE WANT TO TALK ABOUT OUR MARRIAGE', 'The Trump Presidency: Last Week Tonight with John Oliver (HBO)'
$ channel_title                        <str> 'CaseyNeistat', 'LastWeekTonight'
$ category_id                          <i64> 22, 24
$ publish_time           <datetime[μs, UTC]> 2017-11-13 17:13:01+00:00, 2017-11-13 07:30:00+00:00
$ tags                                 <str> 'SHANtell martin', 'last week tonight trump presidency|last week tonight donald trump|john oliver trump|donald trump'
$ views                                <i64> 748374, 2418783
$ likes                                <i64> 57527, 97185
$ dislikes                             <i64> 2966, 6146
$ comment_count                        <i64> 15954, 12703
$ thumbnail_link                       <str> 'https://i.ytimg

In [169]:
# Show the first rows of the freshly loaded frame.
df.head()

video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
str,str,str,str,i64,"datetime[μs, UTC]",str,i64,i64,i64,i64,str,bool,bool,bool,str
"""2kyS6SvSYSE""","""17.14.11""","""WE WANT TO TAL…","""CaseyNeistat""",22,2017-11-13 17:13:01 UTC,"""SHANtell marti…",748374,57527,2966,15954,"""https://i.ytim…",False,False,False,"""SHANTELL'S CHA…"
"""1ZAPwfrtAFY""","""17.14.11""","""The Trump Pres…","""LastWeekTonigh…",24,2017-11-13 07:30:00 UTC,"""last week toni…",2418783,97185,6146,12703,"""https://i.ytim…",False,False,False,"""One year after…"
"""5qpjK5DgCt4""","""17.14.11""","""Racist Superma…","""Rudy Mancuso""",23,2017-11-12 19:05:24 UTC,"""racist superma…",3191434,146033,5339,8181,"""https://i.ytim…",False,False,False,"""WATCH MY PREVI…"
"""puqaWrEC7tY""","""17.14.11""","""Nickelback Lyr…","""Good Mythical …",24,2017-11-13 11:00:04 UTC,"""rhett and link…",343168,10172,666,2146,"""https://i.ytim…",False,False,False,"""Today we find …"
"""d380meD0W0M""","""17.14.11""","""I Dare You: GO…","""nigahiga""",24,2017-11-12 18:01:41 UTC,"""ryan|higa|higa…",2095731,132235,1989,17518,"""https://i.ytim…",False,False,False,"""I know it's be…"


In [170]:
# Convert the `trending_date` strings (format "%y.%d.%m") into a Date column.
df = df.with_columns(
    pl.col("trending_date").str.to_date(format="%y.%d.%m")
)

In [171]:
# Look up the dtype of `trending_date` to confirm the conversion.
df.schema["trending_date"]

Date

## Creating lists

### How to do it...

In [63]:
# Split the pipe-delimited `tags` string into a list column, shown next to the original.
df.select(
    pl.col("tags"),
    pl.col("tags").str.split("|").alias("tags in list"),
).head(5)

tags,tags in list
str,list[str]
"""SHANtell marti…","[""SHANtell martin""]"
"""last week toni…","[""last week tonight trump presidency"", ""last week tonight donald trump"", … ""donald trump""]"
"""racist superma…","[""racist superman"", ""rudy"", … "" Lele Pons""]"
"""rhett and link…","[""rhett and link"", ""gmm"", … ""challenge""]"
"""ryan|higa|higa…","[""ryan"", ""higa"", … ""fail""]"


In [67]:
# Aggregating a column without a reducer collects each group's values into a list.
df.group_by("trending_date").agg("video_id").head(5)

trending_date,video_id
date,list[str]
2018-03-03,"[""HgknAaKNaMM"", ""tugFFhML7VY"", … ""rZQepOFnYi8""]"
2018-02-28,"[""I8Umj580ls0"", ""YskVs5VyqHk"", … ""cy9W-ZywVPc""]"
2018-02-13,"[""uzK1OmxS4CE"", ""Nsm8l89x2H4"", … ""9reizHjwuNY""]"
2018-02-07,"[""wbSwFU6tY1c"", ""JQbjS0_ZfJ0"", … ""LtlkeMfbdpM""]"
2018-02-10,"[""7kLO2AB5SPM"", ""m4faDISwSVo"", … ""Bhplg8YCu-M""]"


In [76]:
# Combine the four numeric engagement columns into one list column per row.
df.select(
    engagement=pl.concat_list(
        ["views", "likes", "dislikes", "comment_count"]
    )
).head(5)

engagement
list[i64]
"[748374, 57527, … 15954]"
"[2418783, 97185, … 12703]"
"[3191434, 146033, … 8181]"
"[343168, 10172, … 2146]"
"[2095731, 132235, … 17518]"


### There is more...

In [124]:
# Two rows, each holding a list of four integer lists (a list nested in a list).
df = pl.DataFrame(
    {
        "nested_list": [
            [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
            [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
        ]
    }
)


In [125]:
# Display the frame; the dtype row under the header shows the nested list type.
df

nested_list
list[list[i64]]
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"


In [128]:
# Same shape as the previous example, except the very first inner value is the
# string 'a', so the innermost lists mix str and int values.
df = pl.DataFrame(
    {
        "nested_list": [
            [["a", 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
            [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
        ]
    }
)

In [129]:
# Display the frame; in the recorded run the column dtype is `object` rather than a list type.
df

nested_list
object
"[['a', 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]"
"[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]"


## Aggregating elements in lists

### How to do it...

In [1]:
import polars as pl

In [8]:
# Reload the videos data and parse `trending_date` ("%y.%d.%m") into a Date in one chain.
df = pl.read_csv("../data/us_videos.csv", try_parse_dates=True).with_columns(
    pl.col("trending_date").str.to_date(format="%y.%d.%m")
)
df.head(5)

video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
str,date,str,str,i64,"datetime[μs, UTC]",str,i64,i64,i64,i64,str,bool,bool,bool,str
"""2kyS6SvSYSE""",2017-11-14,"""WE WANT TO TAL…","""CaseyNeistat""",22,2017-11-13 17:13:01 UTC,"""SHANtell marti…",748374,57527,2966,15954,"""https://i.ytim…",False,False,False,"""SHANTELL'S CHA…"
"""1ZAPwfrtAFY""",2017-11-14,"""The Trump Pres…","""LastWeekTonigh…",24,2017-11-13 07:30:00 UTC,"""last week toni…",2418783,97185,6146,12703,"""https://i.ytim…",False,False,False,"""One year after…"
"""5qpjK5DgCt4""",2017-11-14,"""Racist Superma…","""Rudy Mancuso""",23,2017-11-12 19:05:24 UTC,"""racist superma…",3191434,146033,5339,8181,"""https://i.ytim…",False,False,False,"""WATCH MY PREVI…"
"""puqaWrEC7tY""",2017-11-14,"""Nickelback Lyr…","""Good Mythical …",24,2017-11-13 11:00:04 UTC,"""rhett and link…",343168,10172,666,2146,"""https://i.ytim…",False,False,False,"""Today we find …"
"""d380meD0W0M""",2017-11-14,"""I Dare You: GO…","""nigahiga""",24,2017-11-12 18:01:41 UTC,"""ryan|higa|higa…",2095731,132235,1989,17518,"""https://i.ytim…",False,False,False,"""I know it's be…"


In [9]:
# One row per trending date; each metric column holds the list of that day's values.
agg_df = df.group_by("trending_date").agg(
    pl.col("views", "likes", "dislikes", "comment_count")
)
agg_df.head(5)

trending_date,views,likes,dislikes,comment_count
date,list[i64],list[i64],list[i64],list[i64]
2018-03-09,"[904177, 3008740, … 1159497]","[38381, 402827, … 54838]","[820, 9020, … 2742]","[14888, 35006, … 3687]"
2018-02-22,"[2841864, 1234544, … 111619]","[55049, 170540, … 4993]","[2432, 4176, … 291]","[4420, 26091, … 941]"
2018-02-25,"[237081, 1747075, … 295145]","[19147, 28340, … 4891]","[791, 8928, … 212]","[3869, 44864, … 340]"
2018-02-28,"[2311760, 1397595, … 2926283]","[34425, 64068, … 189088]","[6094, 685, … 1214]","[9453, 3001, … 30892]"
2018-03-03,"[716096, 1108125, … 1327815]","[23734, 146294, … 32895]","[1159, 831, … 1240]","[2057, 17196, … 2020]"


In [20]:
# Reduce each list column to a single value per row using the `list` namespace.
agg_df.select(
    "trending_date",
    views_min=pl.col("views").list.min(),
    likes_max=pl.col("likes").list.max(),
    dislikes_mean=pl.col("dislikes").list.mean(),
    comment_sum=pl.col("comment_count").list.sum(),
).head(5)

trending_date,views_min,likes_max,dislikes_mean,comment_sum
date,i64,i64,f64,i64
2018-03-09,25280,1621717,1756.455,1147010
2018-02-22,1464,1121018,2218.824121,905231
2018-02-25,9197,787827,2340.44,1094805
2018-02-28,7613,1042192,2192.477387,1290226
2018-03-03,7332,2392582,1388.407035,1608245


In [37]:
# Collect channel titles per date, then join each list into one ':'-separated string.
(
    df.group_by("trending_date")
    .agg("channel_title")
    .with_columns(channel_title=pl.col("channel_title").list.join(":"))
    .head(5)
)

trending_date,channel_title
date,str
2018-02-28,"""The Late Show …"
2018-03-06,"""Dude Perfect:D…"
2018-03-09,"""Nintendo:Camil…"
2018-02-16,"""Disney•Pixar:M…"
2018-02-19,"""What's Inside?…"


### There is more...

In [44]:
# Count how many items each `views` list holds.
agg_df.select(
    "trending_date",
    item_cnt=pl.col("views").list.len(),
).head(5)

trending_date,item_cnt
date,u32
2018-03-09,200
2018-02-22,199
2018-02-25,200
2018-02-28,199
2018-03-03,199


## Accessing and selecting elements in lists

### Getting ready

In [40]:
import polars as pl

In [48]:
# For every channel, gather its trending dates into a list and sort that list ascending.
trending_dates_by_channel = (
    df.group_by("channel_title")
    .agg(pl.col("trending_date"))
    .with_columns(trending_date=pl.col("trending_date").list.sort(descending=False))
)

In [50]:
# Preview the per-channel lists of trending dates.
trending_dates_by_channel.head()

channel_title,trending_date
str,list[date]
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]"
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]"
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]"
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]"
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]"


### How to do it...

In [52]:
# Pull the first and last entry of each date list into their own columns.
trending_dates_by_channel.with_columns(
    first_trending_date=pl.col("trending_date").list.first(),
    last_trending_date=pl.col("trending_date").list.last(),
).head(5)

channel_title,trending_date,first_trending_date,last_trending_date
str,list[date],date,date
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]",2018-01-24,2018-02-12
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]",2017-12-02,2017-12-07
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]",2017-12-29,2018-03-03
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]",2018-04-19,2018-05-14
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]",2017-11-28,2018-01-05


In [73]:
# `list.get` takes a zero-based index, so index 1 returns the second date of
# each list. The column is named accordingly (it was previously mislabelled as
# the 8th element, which did not match the index being requested).
trending_dates_by_channel.with_columns(
    pl.col('trending_date').list.get(1).alias('2nd_element')
).head()

channel_title,trending_date,8th_element
str,list[date],date
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]",2018-01-25
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]",2017-12-03
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]",2017-12-30
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]",2018-04-20
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]",2017-11-29


In [74]:
# Keep the first 5 and the last 10 entries of each date list.
trending_dates_by_channel.with_columns(
    first_5=pl.col("trending_date").list.head(5),
    last_10=pl.col("trending_date").list.tail(10),
).head(5)

channel_title,trending_date,first_5,last_10
str,list[date],list[date],list[date]
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]","[2018-01-24, 2018-01-25, … 2018-01-28]","[2018-01-25, 2018-01-26, … 2018-02-12]"
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]","[2017-12-02, 2017-12-03, … 2017-12-06]","[2017-12-02, 2017-12-03, … 2017-12-07]"
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]","[2017-12-29, 2017-12-30, … 2018-01-02]","[2018-02-06, 2018-02-07, … 2018-03-03]"
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]","[2018-04-19, 2018-04-20, … 2018-04-23]","[2018-04-26, 2018-04-27, … 2018-05-14]"
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]","[2017-11-28, 2017-11-29, … 2017-12-07]","[2017-12-27, 2017-12-28, … 2018-01-05]"


In [76]:
# Sort each list newest-first, then keep its first three entries.
trending_dates_by_channel.with_columns(
    pl.col("trending_date").list.sort(descending=True).list.head(n=3).alias("3_most_recent_dates")
).head(5)

channel_title,trending_date,3_most_recent_dates
str,list[date],list[date]
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]","[2018-02-12, 2018-02-11, 2018-02-10]"
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]","[2017-12-07, 2017-12-06, 2017-12-05]"
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]","[2018-03-03, 2018-03-02, 2018-02-13]"
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]","[2018-05-14, 2018-05-04, 2018-05-03]"
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]","[2018-01-05, 2018-01-04, 2018-01-03]"


In [77]:
# NOTE(review): this cell is a verbatim repeat of the preceding cell (In [76])
# and produces the same output; consider removing one of the two.
# Sort each list newest-first, then keep its first three entries.
trending_dates_by_channel.with_columns(
    pl.col('trending_date')
    .list.sort(descending=True)
    .list.head(3)
    .alias('3_most_recent_dates')
).head()

channel_title,trending_date,3_most_recent_dates
str,list[date],list[date]
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]","[2018-02-12, 2018-02-11, 2018-02-10]"
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]","[2017-12-07, 2017-12-06, 2017-12-05]"
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]","[2018-03-03, 2018-03-02, 2018-02-13]"
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]","[2018-05-14, 2018-05-04, 2018-05-03]"
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]","[2018-01-05, 2018-01-04, 2018-01-03]"


In [91]:
# `list.slice(offset, length)` examples:
#   * (0, 2)  -> the first two dates
#   * (-3, 1) -> one date, starting three positions from the end
#   * (7)     -> everything from zero-based offset 7 onward, i.e. from the 8th
#                date to the end; lists too short for that offset yield an
#                empty list (see the recorded output).
# The last column is named for the 8th date so the label matches the offset
# actually used (it was previously mislabelled as starting at the 6th date).
trending_dates_by_channel.select(
    'trending_date',
    pl.col('trending_date').list.slice(0, 2).alias('first_2_dates'),
    pl.col('trending_date').list.slice(-3, 1).alias('3rd_date_to_last'),
    pl.col('trending_date').list.slice(7).alias('from_8th_date_to_end')
).head()

trending_date,first_2_dates,3rd_date_to_last,from_6th_date_to_end
list[date],list[date],list[date],list[date]
"[2018-01-24, 2018-01-25, … 2018-02-12]","[2018-01-24, 2018-01-25]",[2018-02-10],"[2018-02-09, 2018-02-10, … 2018-02-12]"
"[2017-12-02, 2017-12-03, … 2017-12-07]","[2017-12-02, 2017-12-03]",[2017-12-05],[]
"[2017-12-29, 2017-12-30, … 2018-03-03]","[2017-12-29, 2017-12-30]",[2018-02-13],"[2018-02-06, 2018-02-07, … 2018-03-03]"
"[2018-04-19, 2018-04-20, … 2018-05-14]","[2018-04-19, 2018-04-20]",[2018-05-03],"[2018-04-26, 2018-04-27, … 2018-05-14]"
"[2017-11-28, 2017-11-29, … 2018-01-05]","[2017-11-28, 2017-11-29]",[2018-01-03],"[2017-12-10, 2017-12-11, … 2018-01-05]"


In [115]:
# `list.gather` picks items by zero-based index; indices may be negative or repeated.
# Index 10 is the 11th item, so the last column is named accordingly (it was
# previously mislabelled as the 10th). With `null_on_oob=True`, a list that is
# too short yields null for that position instead of failing.
trending_dates_by_channel.select(
    'trending_date',
    pl.col('trending_date').list.gather([0, -1]).alias('first_and_last'),
    pl.col('trending_date').list.gather([0, 0, 0, 0]).alias('first_repeated'),
    pl.col('trending_date').list.gather([0, 10], null_on_oob=True).alias('first_and_11th_or_null'),
).head()

trending_date,first_and_last,first_repeated,first_and_10th_or_null
list[date],list[date],list[date],list[date]
"[2018-01-24, 2018-01-25, … 2018-02-12]","[2018-01-24, 2018-02-12]","[2018-01-24, 2018-01-24, … 2018-01-24]","[2018-01-24, 2018-02-12]"
"[2017-12-02, 2017-12-03, … 2017-12-07]","[2017-12-02, 2017-12-07]","[2017-12-02, 2017-12-02, … 2017-12-02]","[2017-12-02, null]"
"[2017-12-29, 2017-12-30, … 2018-03-03]","[2017-12-29, 2018-03-03]","[2017-12-29, 2017-12-29, … 2017-12-29]","[2017-12-29, 2018-02-09]"
"[2018-04-19, 2018-04-20, … 2018-05-14]","[2018-04-19, 2018-05-14]","[2018-04-19, 2018-04-19, … 2018-04-19]","[2018-04-19, 2018-04-29]"
"[2017-11-28, 2017-11-29, … 2018-01-05]","[2017-11-28, 2018-01-05]","[2017-11-28, 2017-11-28, … 2017-11-28]","[2017-11-28, 2017-12-24]"


### There is more...

In [168]:
# Per trending date: the sorted list of category ids, its length, its distinct
# values, and the number of distinct values.
(
    df.group_by("trending_date")
    .agg("category_id")
    .with_columns(pl.col("category_id").list.sort())
    .with_columns(
        category_id_cnt=pl.col("category_id").list.len(),
        category_id_unique=pl.col("category_id").list.unique(),
        category_id_unique_cnt=pl.col("category_id").list.unique().list.len(),
    )
    .head(5)
)

trending_date,category_id,category_id_cnt,category_id_unique,category_id_unique_cnt
date,list[i64],u32,list[i64],u32
2018-02-13,"[1, 1, … 28]",199,"[1, 2, … 28]",14
2018-03-09,"[1, 1, … 28]",200,"[1, 10, … 28]",13
2018-02-28,"[1, 1, … 28]",199,"[1, 2, … 28]",14
2018-02-19,"[1, 1, … 29]",200,"[1, 2, … 29]",15
2018-03-06,"[1, 1, … 28]",200,"[1, 2, … 28]",14


In [9]:
# Draw 3 dates from each list, sampling with replacement and a fixed seed.
# NOTE(review): this cell needs `trending_dates_by_channel` from the recipe's
# "Getting ready" step. The recorded NameError shows it was last executed in a
# kernel where that frame had not been defined; re-run the notebook top to
# bottom to refresh the output.
trending_dates_by_channel.with_columns(
    pl.col("trending_date")
    .list.sample(n=3, with_replacement=True, seed=0)
    .alias("samples")
).head()

NameError: name 'trending_dates_by_channel' is not defined

## Applying logic to each element in lists

### Getting ready

In [172]:
# Rebuild `agg_df` for this recipe: per trending date, the lists of views and channel titles.
agg_df = df.group_by("trending_date").agg(pl.col("views", "channel_title"))

In [173]:
# Preview the per-date lists of views and channel titles.
agg_df.head()

trending_date,views,channel_title
date,list[i64],list[str]
2018-02-13,"[266874, 953801, … 736748]","[""NBC Sports"", ""The King of Random"", … ""SmarterEveryDay""]"
2018-03-03,"[716096, 1108125, … 1327815]","[""carrieunderwoodVEVO"", ""nigahiga"", … ""Brian Hull""]"
2018-02-25,"[237081, 1747075, … 295145]","[""Thomas Sanders"", ""Breakfast Club Power 105.1 FM"", … ""Crusoe the Celebrity Dachshund""]"
2018-03-12,"[1611772, 1498364, … 422979]","[""Saturday Night Live"", ""Liza Koshy Too"", … ""BBC News""]"
2018-03-09,"[904177, 3008740, … 1159497]","[""Nintendo"", ""CamilaCabelloVEVO"", … ""Hannah Stocking""]"


### How to do it...

In [174]:
# Four ways of referring to the list items inside `list.eval`; in the recorded
# output every resulting column is identical to `views`.
agg_df.select(
    pl.col("views"),
    pl.col("views").list.eval(pl.element()).alias("pl.element"),
    pl.col("views").list.eval(pl.first()).alias("pl.first"),
    pl.col("views").list.eval(pl.last()).alias("pl.last"),
    pl.col("views").list.eval(pl.col("")).alias("pl.col"),
).head(5)

views,pl.element,pl.first,pl.last,pl.col
list[i64],list[i64],list[i64],list[i64],list[i64]
"[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]"
"[716096, 1108125, … 1327815]","[716096, 1108125, … 1327815]","[716096, 1108125, … 1327815]","[716096, 1108125, … 1327815]","[716096, 1108125, … 1327815]"
"[237081, 1747075, … 295145]","[237081, 1747075, … 295145]","[237081, 1747075, … 295145]","[237081, 1747075, … 295145]","[237081, 1747075, … 295145]"
"[1611772, 1498364, … 422979]","[1611772, 1498364, … 422979]","[1611772, 1498364, … 422979]","[1611772, 1498364, … 422979]","[1611772, 1498364, … 422979]"
"[904177, 3008740, … 1159497]","[904177, 3008740, … 1159497]","[904177, 3008740, … 1159497]","[904177, 3008740, … 1159497]","[904177, 3008740, … 1159497]"


In [175]:
# Keep the first two channel titles per date, plus an upper-cased copy produced
# by applying a string expression to every list item.
channel_titles_df = agg_df.select(
    pl.col("channel_title").list.head(2),
    channel_title_upper=(
        pl.col("channel_title")
        .list.eval(pl.element().str.to_uppercase())
        .list.head(2)
    ),
)
channel_titles_df.head(5)

channel_title,channel_title_upper
list[str],list[str]
"[""NBC Sports"", ""The King of Random""]","[""NBC SPORTS"", ""THE KING OF RANDOM""]"
"[""carrieunderwoodVEVO"", ""nigahiga""]","[""CARRIEUNDERWOODVEVO"", ""NIGAHIGA""]"
"[""Thomas Sanders"", ""Breakfast Club Power 105.1 FM""]","[""THOMAS SANDERS"", ""BREAKFAST CLUB POWER 105.1 FM""]"
"[""Saturday Night Live"", ""Liza Koshy Too""]","[""SATURDAY NIGHT LIVE"", ""LIZA KOSHY TOO""]"
"[""Nintendo"", ""CamilaCabelloVEVO""]","[""NINTENDO"", ""CAMILACABELLOVEVO""]"


In [176]:
# Within each list, keep only the upper-cased titles that contain the literal letter 'A'.
channel_titles_df.with_columns(
    channel_title_upper=pl.col("channel_title_upper").list.eval(
        pl.element().filter(pl.element().str.contains("A", literal=True))
    )
).head(5)

channel_title,channel_title_upper
list[str],list[str]
"[""NBC Sports"", ""The King of Random""]","[""THE KING OF RANDOM""]"
"[""carrieunderwoodVEVO"", ""nigahiga""]","[""CARRIEUNDERWOODVEVO"", ""NIGAHIGA""]"
"[""Thomas Sanders"", ""Breakfast Club Power 105.1 FM""]","[""THOMAS SANDERS"", ""BREAKFAST CLUB POWER 105.1 FM""]"
"[""Saturday Night Live"", ""Liza Koshy Too""]","[""SATURDAY NIGHT LIVE"", ""LIZA KOSHY TOO""]"
"[""Nintendo"", ""CamilaCabelloVEVO""]","[""CAMILACABELLOVEVO""]"


In [177]:
# Dense-rank the views inside each list, highest view count first.
agg_df.select(
    "trending_date",
    "views",
    views_rank=pl.col("views").list.eval(
        pl.element().rank(method="dense", descending=True)
    ),
).head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2018-02-13,"[266874, 953801, … 736748]","[133, 66, … 79]"
2018-03-03,"[716096, 1108125, … 1327815]","[80, 55, … 47]"
2018-02-25,"[237081, 1747075, … 295145]","[115, 35, … 109]"
2018-03-12,"[1611772, 1498364, … 422979]","[54, 61, … 132]"
2018-03-09,"[904177, 3008740, … 1159497]","[75, 18, … 65]"


In [178]:
# Same ranking as above, stored as `views_rank_df` for the following steps.
views_rank_df = agg_df.select(
    "trending_date",
    "views",
    views_rank=pl.col("views").list.eval(
        pl.element().rank(method="dense", descending=True)
    ),
)
views_rank_df.head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2018-02-13,"[266874, 953801, … 736748]","[133, 66, … 79]"
2018-03-03,"[716096, 1108125, … 1327815]","[80, 55, … 47]"
2018-02-25,"[237081, 1747075, … 295145]","[115, 35, … 109]"
2018-03-12,"[1611772, 1498364, … 422979]","[54, 61, … 132]"
2018-03-09,"[904177, 3008740, … 1159497]","[75, 18, … 65]"


In [179]:
# Flatten both list columns to one row per item, keep items ranked 3 or better,
# then regroup by date so the surviving values are lists again.
top3_views_df = (
    views_rank_df.explode(["views", "views_rank"])
    .filter(pl.col("views_rank").le(3))
    .group_by("trending_date")
    .agg(pl.all())
)
top3_views_df.head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2017-11-29,"[30583293, 6969128, 19670964]","[1, 3, 2]"
2018-06-02,"[32523416, 42700273, 225211923]","[3, 2, 1]"
2018-04-19,"[52556278, 36908726, 39814522]","[1, 3, 2]"
2018-05-21,"[65396157, 162556776, 60207478]","[2, 1, 3]"
2018-02-03,"[15643726, 19392316, 10878906]","[2, 1, 3]"


In [180]:
# Show what the exploded (one row per list item) intermediate looks like.
views_rank_df.explode(["views", "views_rank"]).head(5)

trending_date,views,views_rank
date,i64,u32
2018-02-13,266874,133
2018-02-13,953801,66
2018-02-13,638618,85
2018-02-13,319444,122
2018-02-13,3780988,23


In [181]:
# For each list, subtract every item from the list's maximum view count.
top3_views_df.select(
    pl.col("views_rank"),
    pl.col("views"),
    pl.col("views")
    .list.eval(pl.element().max() - pl.element())
    .alias("diff from the most views"),
).head(5)

views_rank,views,diff from the most views
list[u32],list[i64],list[i64]
"[1, 3, 2]","[30583293, 6969128, 19670964]","[0, 23614165, 10912329]"
"[3, 2, 1]","[32523416, 42700273, 225211923]","[192688507, 182511650, 0]"
"[1, 3, 2]","[52556278, 36908726, 39814522]","[0, 15647552, 12741756]"
"[2, 1, 3]","[65396157, 162556776, 60207478]","[97160619, 0, 102349298]"
"[2, 1, 3]","[15643726, 19392316, 10878906]","[3748590, 0, 8513410]"


### There is more...

In [184]:
# Re-display the top-ranked views per date as the starting point for the set operations.
top3_views_df.head()

trending_date,views,views_rank
date,list[i64],list[u32]
2017-11-29,"[30583293, 6969128, 19670964]","[1, 3, 2]"
2018-06-02,"[32523416, 42700273, 225211923]","[3, 2, 1]"
2018-04-19,"[52556278, 36908726, 39814522]","[1, 3, 2]"
2018-05-21,"[65396157, 162556776, 60207478]","[2, 1, 3]"
2018-02-03,"[15643726, 19392316, 10878906]","[2, 1, 3]"


In [232]:
# Build two overlapping two-item slices of `views_rank` (first two / last two),
# then compare them with the list set operations.
(
    top3_views_df.with_columns(
        views_rank_1=pl.col("views_rank").list.slice(0, 2),
        views_rank_2=pl.col("views_rank").list.slice(-2, 2),
    )
    .select(
        "views_rank_1",
        "views_rank_2",
        intersection=pl.col("views_rank_1").list.set_intersection("views_rank_2"),
        union=pl.col("views_rank_1").list.set_union("views_rank_2"),
        difference=pl.col("views_rank_1").list.set_difference("views_rank_2"),
        symmetric_difference=pl.col("views_rank_1").list.set_symmetric_difference("views_rank_2"),
    )
    .head(5)
)

views_rank_1,views_rank_2,intersection,union,difference,symmetric_difference
list[u32],list[u32],list[u32],list[u32],list[u32],list[u32]
"[1, 3]","[3, 2]",[3],"[1, 3, 2]",[1],"[1, 2]"
"[3, 2]","[2, 1]",[2],"[3, 2, 1]",[3],"[3, 1]"
"[1, 3]","[3, 2]",[3],"[1, 3, 2]",[1],"[1, 2]"
"[2, 1]","[1, 3]",[1],"[2, 1, 3]",[2],"[2, 3]"
"[2, 1]","[1, 3]",[1],"[2, 1, 3]",[2],"[2, 3]"


## Working with structs

### Getting ready

In [367]:
import polars as pl

In [368]:
# Load the JSON sample. Per the rendered output, nested objects and arrays
# are read as struct and list columns.
df = pl.read_json(source='../data/ga_20170801.json')
df.head(5)

visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,clientId,channelGrouping,socialEngagementType
null,str,str,str,str,struct[13],struct[9],struct[17],struct[11],list[struct[2]],list[struct[33]],str,null,null,str,str
,"""1""","""1501591568""","""1501591568""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Europe"",""Southern Europe"",""Greece"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""tellas.gr"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}",[],"[{{""0"",""1"",null},null,{""(not set)"",""Bags"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},""1"",null,{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""0""},[],""PAGE"",[],""true"",""https://www.google.gr/"",null,""web"",""46"",""true"",null,null,{null,""true"",null,null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],[],[],null,""5"",null,{""/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com"",""Page 
Unavailable"",null,null,""/google+redesign/"",""/bags/"",""/google+zipper+front+sports+bag.axd"",""""},[],[],""true"",null,null,""0""}]","""34183340117798…",,,"""Organic Search…","""Not Socially E…"
,"""2""","""1501589647""","""1501589647""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Asia"",""Southern Asia"",""India"",""Maharashtra"",""(not set)"",""Mumbai"",""not available in demo dataset"",""unknown.unknown"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""APAC""}]","[{{""0"",""1"",null},null,{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},""1"",null,{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},[],""PAGE"",[],""true"",""https://analytics.google.com/analytics/web/"",null,""web"",""14"",""true"",null,null,{null,""true"",null,null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],[],[],null,""5"",null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],[],""true"",null,null,""0""}]","""24743978550413…",,,"""Referral""","""Not Socially 
E…"
,"""1""","""1501616621""","""1501616621""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Europe"",""Northern Europe"",""United Kingdom"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""as9105.com"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""EMEA""}]","[{{""0"",""1"",null},null,{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},""1"",null,{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},[],""PAGE"",[],""true"",""https://analytics.google.com/analytics/web/?utm_source=demoaccount&utm_medium=demoaccount&utm_campaign=demoaccount"",null,""web"",""43"",""true"",null,null,{null,""true"",null,null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],[],[],null,""12"",null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page 
Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],[],""true"",null,null,""0""}]","""58704628207131…",,,"""Referral""","""Not Socially E…"
,"""1""","""1501601200""","""1501601200""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Firefox"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Americas"",""Northern America"",""United States"",""Texas"",""Dallas-Ft. Worth TX"",""Dallas"",""not available in demo dataset"",""h5colo.com"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""North America""}]","[{{""0"",""1"",null},null,{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},""1"",null,{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},[],""PAGE"",[],""true"",""https://analytics.google.com/analytics/web/"",null,""web"",""26"",""true"",null,null,{null,""true"",null,null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],[],[],null,""8"",null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page 
Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],[],""true"",null,null,""0""}]","""93978091713494…",,,"""Referral""","""Not Socially E…"
,"""1""","""1501615525""","""1501615525""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Americas"",""Northern America"",""United States"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""(not set)"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""North America""}]","[{{""0"",""1"",null},null,{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},""1"",null,{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},[],""PAGE"",[],""true"",""https://adwords.google.com/analytics/web/?hl=en_US&__o=cues&authuser=0"",null,""web"",""25"",""true"",null,null,{null,""true"",null,null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],[],[],null,""12"",null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page 
Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],[],""true"",null,null,""0""}]","""60899029431845…",,,"""Referral""","""Not Socially E…"


In [369]:
# Narrow the frame to the string, struct, and list-of-struct columns that
# the rest of this recipe works with.
cols = [
    'visitId',
    'date',
    'totals',
    'trafficSource',
    'customDimensions',
    'channelGrouping',
]
df = df.select(cols)
df.head(5)

visitId,date,totals,trafficSource,customDimensions,channelGrouping
str,str,struct[13],struct[9],list[struct[2]],str
"""1501591568""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}",[],"""Organic Search…"
"""1501589647""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""APAC""}]","""Referral"""
"""1501616621""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""EMEA""}]","""Referral"""
"""1501601200""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]","""Referral"""
"""1501615525""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]","""Referral"""


In [370]:
# Look at the nested (struct and list-of-struct) columns on their own.
df.select(['totals', 'trafficSource', 'customDimensions']).head(5)

totals,trafficSource,customDimensions
struct[13],struct[9],list[struct[2]]
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}",[]
"{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""APAC""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""EMEA""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]"


### How to do it...

In [372]:
# Pack three existing columns into a single struct column; the struct's
# fields take the names of the source columns.
df = df.with_columns(
    structFromCols=pl.struct(['visitId', 'date', 'channelGrouping'])
)

In [373]:
# Show the source columns side by side with the struct built from them.
(
    df
    .select(
        pl.col('visitId', 'date', 'channelGrouping'),
        structFromCols=pl.struct(['visitId', 'date', 'channelGrouping']),
    )
).head()

visitId,date,channelGrouping,structFromCols
str,str,str,struct[3]
"""1501591568""","""20170801""","""Organic Search…","{""1501591568"",""20170801"",""Organic Search""}"
"""1501589647""","""20170801""","""Referral""","{""1501589647"",""20170801"",""Referral""}"
"""1501616621""","""20170801""","""Referral""","{""1501616621"",""20170801"",""Referral""}"
"""1501601200""","""20170801""","""Referral""","{""1501601200"",""20170801"",""Referral""}"
"""1501615525""","""20170801""","""Referral""","{""1501615525"",""20170801"",""Referral""}"


In [450]:
# Collect the visit ids of each channel into a list, count them, and then
# turn the list column into a struct column.
# NOTE(review): in the rendered output every row is struct[12] even though
# numVisits goes up to 534, so longer lists appear to be cut to the width
# of the first row -- confirm against the `list.to_struct` documentation.
(
    df
    .group_by('channelGrouping')
    .agg(
        pl.col('visitId'),
        numVisits=pl.col('visitId').len(),
    )
    .sort('numVisits')
    .with_columns(
        struct_from_list=pl.col('visitId').list.to_struct()
    )
)

channelGrouping,visitId,numVisits,struct_from_list
str,list[str],u32,struct[12]
"""Display""","[""1501651856"", ""1501625928"", … ""1501638116""]",12,"{""1501651856"",""1501625928"",""1501611633"",""1501625068"",""1501612878"",""1501616158"",""1501607332"",""1501622703"",""1501621231"",""1501649570"",""1501647417"",""1501638116""}"
"""Paid Search""","[""1501610896"", ""1501644116"", … ""1501613648""]",20,"{""1501610896"",""1501644116"",""1501574187"",""1501614708"",""1501625398"",""1501636143"",""1501617844"",""1501628199"",""1501618328"",""1501624893"",""1501650118"",""1501580936""}"
"""Affiliates""","[""1501604627"", ""1501572101"", … ""1501635918""]",29,"{""1501604627"",""1501572101"",""1501638418"",""1501589595"",""1501635523"",""1501588687"",""1501602677"",""1501656633"",""1501589444"",""1501654572"",""1501588961"",""1501588902""}"
"""Referral""","[""1501589647"", ""1501616621"", … ""1501607798""]",106,"{""1501589647"",""1501616621"",""1501601200"",""1501615525"",""1501589650"",""1501573710"",""1501613382"",""1501630140"",""1501656976"",""1501602227"",""1501620300"",""1501611288""}"
"""Social""","[""1501590147"", ""1501655923"", … ""1501652602""]",136,"{""1501590147"",""1501655923"",""1501640054"",""1501596419"",""1501591307"",""1501616949"",""1501649584"",""1501579329"",""1501585058"",""1501618027"",""1501653304"",""1501614595""}"
"""Direct""","[""1501586309"", ""1501587435"", … ""1501610792""]",163,"{""1501586309"",""1501587435"",""1501653660"",""1501608816"",""1501611913"",""1501584277"",""1501578373"",""1501587465"",""1501621325"",""1501655032"",""1501622827"",""1501575169""}"
"""Organic Search…","[""1501591568"", ""1501583103"", … ""1501625964""]",534,"{""1501591568"",""1501583103"",""1501631547"",""1501599064"",""1501585229"",""1501639903"",""1501576309"",""1501573981"",""1501618526"",""1501578968"",""1501599268"",""1501596177""}"


In [441]:
# Duplicate the struct under a second name and unnest only the copy, so the
# original struct stays visible next to its expanded fields.
(
    df
    .select(
        pl.col('structFromCols'),
        structFromColsToBeUnpacked=pl.col('structFromCols'),
    )
    .unnest('structFromColsToBeUnpacked')
).head()

structFromCols,visitId,date,channelGrouping
struct[3],str,str,str
"{""1501591568"",""20170801"",""Organic Search""}","""1501591568""","""20170801""","""Organic Search…"
"{""1501589647"",""20170801"",""Referral""}","""1501589647""","""20170801""","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""1501616621""","""20170801""","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""1501601200""","""20170801""","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""1501615525""","""20170801""","""Referral"""


In [509]:
# Expand the fields of `trafficSource` into top-level columns. Only one
# level is unpacked: the inner `adwordsClickInfo` field is still a struct
# in the rendered output.
df.select('trafficSource').unnest('trafficSource').head()

referralPath,campaign,source,medium,keyword,adContent,adwordsClickInfo,isTrueDirect,campaignCode
str,str,str,str,str,null,struct[12],str,null
,"""(not set)""","""(direct)""","""(none)""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web…","""(not set)""","""analytics.goog…","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web…","""(not set)""","""analytics.goog…","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web…","""(not set)""","""analytics.goog…","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web…","""(not set)""","""adwords.google…","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,


In [444]:
# Rename the struct's fields positionally before unnesting, so the expanded
# columns come out as `a`, `b`, `c` instead of the original field names.
(
    df
    .select(
        pl.col('structFromCols'),
        renamedStructToBeUnpacked=(
            pl.col('structFromCols').struct.rename_fields(['a', 'b', 'c'])
        ),
    )
    .unnest('renamedStructToBeUnpacked')
).head()

structFromCols,a,b,c
struct[3],str,str,str
"{""1501591568"",""20170801"",""Organic Search""}","""1501591568""","""20170801""","""Organic Search…"
"{""1501589647"",""20170801"",""Referral""}","""1501589647""","""20170801""","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""1501616621""","""20170801""","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""1501601200""","""20170801""","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""1501615525""","""20170801""","""Referral"""


In [533]:
# Pull a single field out of the struct; the resulting column is named
# after the field.
df.select(
    pl.col('structFromCols'),
    pl.col('structFromCols').struct.field('channelGrouping'),
).head()

structFromCols,channelGrouping
struct[3],str
"{""1501591568"",""20170801"",""Organic Search""}","""Organic Search…"
"{""1501589647"",""20170801"",""Referral""}","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""Referral"""


In [529]:
# List the distinct (channelGrouping, source) pairs: combine a plain column
# with a field taken from `trafficSource` into one struct, de-duplicate the
# struct values, then unnest back into two columns and sort them.
(
    df
    .select(
        channelAndSource=pl.struct(
            'channelGrouping',
            pl.col('trafficSource').struct.field('source'),
        ).unique()
    )
    .unnest('channelAndSource')
    .sort(['channelGrouping', 'source'])
)

channelGrouping,source
str,str
"""Affiliates""","""Partners"""
"""Direct""","""(direct)"""
"""Display""","""(direct)"""
"""Display""","""dfa"""
"""Organic Search…","""(direct)"""
"""Organic Search…","""ask"""
"""Organic Search…","""baidu"""
"""Paid Search""","""(direct)"""
"""Referral""","""(direct)"""
"""Referral""","""adwords.google…"


### There is more...

In [545]:
# Serialize the `totals` struct to JSON text, then decode that text again
# to show the round trip back to a struct column.
total_struct_to_str_expr = pl.col('totals').struct.json_encode()
df.select(
    total_str=total_struct_to_str_expr,
    total_struct=total_struct_to_str_expr.str.json_decode(),
).head()

total_str,total_struct
str,struct[13]
"""{""visits"":""1"",…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",…","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}"
"""{""visits"":""1"",…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
