# Chapter 7: Working with Nested Data Structures

## Technical Requirements

In [1]:
import polars as pl

In [2]:
# Load the trending-videos CSV (date-like columns are parsed where Polars can
# infer them) and preview two sample values per column.
df = pl.read_csv(
    "../data/us_videos.csv",
    try_parse_dates=True,
)
df.glimpse(max_items_per_column=2)

Rows: 40949
Columns: 16
$ video_id                             <str> '2kyS6SvSYSE', '1ZAPwfrtAFY'
$ trending_date                        <str> '17.14.11', '17.14.11'
$ title                                <str> 'WE WANT TO TALK ABOUT OUR MARRIAGE', 'The Trump Presidency: Last Week Tonight with John Oliver (HBO)'
$ channel_title                        <str> 'CaseyNeistat', 'LastWeekTonight'
$ category_id                          <i64> 22, 24
$ publish_time           <datetime[μs, UTC]> 2017-11-13 17:13:01+00:00, 2017-11-13 07:30:00+00:00
$ tags                                 <str> 'SHANtell martin', 'last week tonight trump presidency|last week tonight donald trump|john oliver trump|donald trump'
$ views                                <i64> 748374, 2418783
$ likes                                <i64> 57527, 97185
$ dislikes                             <i64> 2966, 6146
$ comment_count                        <i64> 15954, 12703
$ thumbnail_link                       <str> 'https://i.ytimg

In [3]:
# Show the first five rows of the raw frame.
df.head(5)

video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
str,str,str,str,i64,"datetime[μs, UTC]",str,i64,i64,i64,i64,str,bool,bool,bool,str
"""2kyS6SvSYSE""","""17.14.11""","""WE WANT TO TALK ABOUT OUR MARR…","""CaseyNeistat""",22,2017-11-13 17:13:01 UTC,"""SHANtell martin""",748374,57527,2966,15954,"""https://i.ytimg.com/vi/2kyS6Sv…",False,False,False,"""SHANTELL'S CHANNEL - https://w…"
"""1ZAPwfrtAFY""","""17.14.11""","""The Trump Presidency: Last Wee…","""LastWeekTonight""",24,2017-11-13 07:30:00 UTC,"""last week tonight trump presid…",2418783,97185,6146,12703,"""https://i.ytimg.com/vi/1ZAPwfr…",False,False,False,"""One year after the presidentia…"
"""5qpjK5DgCt4""","""17.14.11""","""Racist Superman | Rudy Mancuso…","""Rudy Mancuso""",23,2017-11-12 19:05:24 UTC,"""racist superman|rudy|mancuso|k…",3191434,146033,5339,8181,"""https://i.ytimg.com/vi/5qpjK5D…",False,False,False,"""WATCH MY PREVIOUS VIDEO ▶ \n\n…"
"""puqaWrEC7tY""","""17.14.11""","""Nickelback Lyrics: Real or Fak…","""Good Mythical Morning""",24,2017-11-13 11:00:04 UTC,"""rhett and link|gmm|good mythic…",343168,10172,666,2146,"""https://i.ytimg.com/vi/puqaWrE…",False,False,False,"""Today we find out if Link is a…"
"""d380meD0W0M""","""17.14.11""","""I Dare You: GOING BALD!?""","""nigahiga""",24,2017-11-12 18:01:41 UTC,"""ryan|higa|higatv|nigahiga|i da…",2095731,132235,1989,17518,"""https://i.ytimg.com/vi/d380meD…",False,False,False,"""I know it's been a while since…"


In [4]:
# trending_date is read as a string in 'yy.dd.mm' form (e.g. '17.14.11');
# convert it to a proper Date column in place.
df = df.with_columns(
    pl.col("trending_date").str.to_date(format="%y.%d.%m")
)

In [5]:
# Confirm the conversion: the column's dtype should now be Date.
df.schema['trending_date']

Date

## Creating lists

### How to do it...

In [6]:
# Put the raw pipe-delimited tag string next to its list[str] equivalent.
df.select(
    pl.col('tags'),
    pl.col('tags').str.split(by='|').alias('tags in list'),
).head(5)

tags,tags in list
str,list[str]
"""SHANtell martin""","[""SHANtell martin""]"
"""last week tonight trump presid…","[""last week tonight trump presidency"", ""last week tonight donald trump"", … ""donald trump""]"
"""racist superman|rudy|mancuso|k…","[""racist superman"", ""rudy"", … "" Lele Pons""]"
"""rhett and link|gmm|good mythic…","[""rhett and link"", ""gmm"", … ""challenge""]"
"""ryan|higa|higatv|nigahiga|i da…","[""ryan"", ""higa"", … ""fail""]"


In [7]:
# Aggregating a column without a reducing function collects each group's
# values into a list: here, all video ids that trended on each date.
(
    df
    .group_by('trending_date')
    .agg('video_id')
    .sort(by='trending_date', descending=True)
    .head(5)
)

trending_date,video_id
date,list[str]
2018-06-14,"[""-QPdRfqTnt4"", ""gPHVLxm8U-0"", … ""ooyjaVdt-jA""]"
2018-06-13,"[""FchkqXEg0qs"", ""uHRwMmwbFnA"", … ""Q5KmA3Xbmqo""]"
2018-06-12,"[""PPWDwBrUNyY"", ""rAH8qm5oQHg"", … ""6S9c5nnDd_s""]"
2018-06-11,"[""0bXCbVGb04A"", ""L4pkD78oKSo"", … ""r-3iathMo7o""]"
2018-06-10,"[""L4pkD78oKSo"", ""ZFwylDNpgFc"", … ""r-3iathMo7o""]"


In [8]:
# Combine the four numeric engagement columns row-wise into one list column.
df.select(
    engagement=pl.concat_list('views', 'likes', 'dislikes', 'comment_count')
).head(5)

engagement
list[i64]
"[748374, 57527, … 15954]"
"[2418783, 97185, … 12703]"
"[3191434, 146033, … 8181]"
"[343168, 10172, … 2146]"
"[2095731, 132235, … 17518]"


### There is more...

In [11]:
# Two rows, each holding a list of four integer lists -> list[list[i64]].
df = pl.DataFrame({
    'nested_list': [
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12],
        ],
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12],
        ],
    ]
})

In [12]:
# Display the frame; the column dtype is shown as list[list[i64]].
df

nested_list
list[list[i64]]
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"


In [13]:
# Same shape as before, but the very first inner value is the string 'a'.
# strict=False lets Polars settle on a common type instead of raising.
df = pl.DataFrame(
    {
        'nested_list': [
            [
                ['a', 2, 3],
                [4, 5, 6],
                [7, 8, 9],
                [10, 11, 12],
            ],
            [
                [1, 2, 3],
                [4, 5, 6],
                [7, 8, 9],
                [10, 11, 12],
            ],
        ]
    },
    strict=False,
)

In [14]:
# Display the frame; every inner value has been cast to str (list[list[str]]).
df

nested_list
list[list[str]]
"[[""a"", ""2"", ""3""], [""4"", ""5"", ""6""], … [""10"", ""11"", ""12""]]"
"[[""1"", ""2"", ""3""], [""4"", ""5"", ""6""], … [""10"", ""11"", ""12""]]"


## Aggregating elements in lists

### How to do it...

In [15]:
import polars as pl

In [16]:
# Re-load the videos data for this recipe and parse trending_date
# ('yy.dd.mm' strings) into a Date column in one go.
df = pl.read_csv('../data/us_videos.csv', try_parse_dates=True).with_columns(
    pl.col('trending_date').str.to_date(format='%y.%d.%m')
)
df.head(5)

video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
str,date,str,str,i64,"datetime[μs, UTC]",str,i64,i64,i64,i64,str,bool,bool,bool,str
"""2kyS6SvSYSE""",2017-11-14,"""WE WANT TO TALK ABOUT OUR MARR…","""CaseyNeistat""",22,2017-11-13 17:13:01 UTC,"""SHANtell martin""",748374,57527,2966,15954,"""https://i.ytimg.com/vi/2kyS6Sv…",False,False,False,"""SHANTELL'S CHANNEL - https://w…"
"""1ZAPwfrtAFY""",2017-11-14,"""The Trump Presidency: Last Wee…","""LastWeekTonight""",24,2017-11-13 07:30:00 UTC,"""last week tonight trump presid…",2418783,97185,6146,12703,"""https://i.ytimg.com/vi/1ZAPwfr…",False,False,False,"""One year after the presidentia…"
"""5qpjK5DgCt4""",2017-11-14,"""Racist Superman | Rudy Mancuso…","""Rudy Mancuso""",23,2017-11-12 19:05:24 UTC,"""racist superman|rudy|mancuso|k…",3191434,146033,5339,8181,"""https://i.ytimg.com/vi/5qpjK5D…",False,False,False,"""WATCH MY PREVIOUS VIDEO ▶ \n\n…"
"""puqaWrEC7tY""",2017-11-14,"""Nickelback Lyrics: Real or Fak…","""Good Mythical Morning""",24,2017-11-13 11:00:04 UTC,"""rhett and link|gmm|good mythic…",343168,10172,666,2146,"""https://i.ytimg.com/vi/puqaWrE…",False,False,False,"""Today we find out if Link is a…"
"""d380meD0W0M""",2017-11-14,"""I Dare You: GOING BALD!?""","""nigahiga""",24,2017-11-12 18:01:41 UTC,"""ryan|higa|higatv|nigahiga|i da…",2095731,132235,1989,17518,"""https://i.ytimg.com/vi/d380meD…",False,False,False,"""I know it's been a while since…"


In [17]:
# Collect each engagement metric into one list per trending date,
# newest date first.
agg_df = (
    df
    .group_by('trending_date')
    .agg(pl.col('views', 'likes', 'dislikes', 'comment_count'))
    .sort(by='trending_date', descending=True)
)
agg_df.head(5)

trending_date,views,likes,dislikes,comment_count
date,list[i64],list[i64],list[i64],list[i64]
2018-06-14,"[4427381, 5829270, … 10306119]","[96391, 87323, … 357079]","[5508, 3668, … 212976]","[12726, 11933, … 144795]"
2018-06-13,"[3238183, 470844, … 7839668]","[61841, 13922, … 352352]","[3708, 402, … 5871]","[0, 4843, … 46624]"
2018-06-12,"[3483553, 6173038, … 13619534]","[23725, 90478, … 347100]","[3145, 3877, … 6923]","[462, 7726, … 19977]"
2018-06-11,"[2341772, 846887, … 6995168]","[140374, 5758, … 143678]","[2951, 850, … 10925]","[33760, 1539, … 17444]"
2018-06-10,"[594004, 1504564, … 6980540]","[4470, 32430, … 143452]","[520, 4316, … 10911]","[1195, 6105, … 17429]"


In [18]:
# Reduce each list column to a single value per row with the list namespace.
(
    agg_df
    .select(
        'trending_date',
        views_min=pl.col('views').list.min(),
        likes_max=pl.col('likes').list.max(),
        dislikes_mean=pl.col('dislikes').list.mean(),
        comment_sum=pl.col('comment_count').list.sum(),
    )
    .sort(by='trending_date', descending=True)
    .head(5)
)

trending_date,views_min,likes_max,dislikes_mean,comment_sum
date,i64,i64,f64,i64
2018-06-14,189038,2032463,8472.94,3289433
2018-06-13,175754,2021395,8541.23,3379632
2018-06-12,161782,2004753,8353.685,3352328
2018-06-11,136643,1981942,8478.095,3320709
2018-06-10,116841,1967904,8405.22,3351105


In [19]:
# Gather the channel titles per trending date, then collapse each list into
# one ':'-separated string.
(
    df
    .group_by('trending_date')
    .agg('channel_title')
    .with_columns(pl.col('channel_title').list.join(separator=':'))
    .sort(by='trending_date', descending=True)
    .head(5)
)

trending_date,channel_title
date,str
2018-06-14,"""Disney Movie Trailers:America'…"
2018-06-13,"""Nintendo:GameXplain:gameslice:…"
2018-06-12,"""gameslice:Clash of Clans:Famil…"
2018-06-11,"""The Game Theorists:NBC Sports:…"
2018-06-10,"""NBC Sports:Anthem Game:Univers…"


### There is more...

In [20]:
# Count how many items each 'views' list holds.
agg_df.select(
    'trending_date',
    item_cnt=pl.col('views').list.len(),
).head(5)

trending_date,item_cnt
date,u32
2018-06-14,200
2018-06-13,200
2018-06-12,200
2018-06-11,200
2018-06-10,200


## Accessing and selecting elements in lists

### Getting ready

In [86]:
import polars as pl

In [87]:
# One row per channel with all of its trending dates, sorted oldest-first.
# group_by does not guarantee row order, so the rows shown by head() below
# can differ from run to run.
trending_dates_by_channel = (
    df
    .group_by('channel_title')
    .agg(pl.col('trending_date'))
    .with_columns(pl.col('trending_date').list.sort(descending=False))
)

In [88]:
# Preview the first five channels and their date lists.
trending_dates_by_channel.head(5)

channel_title,trending_date
str,list[date]
"""Drew Lynch""","[2018-01-24, 2018-01-25, … 2018-02-12]"
"""FIFATV""","[2017-12-02, 2017-12-03, … 2017-12-07]"
"""SmarterEveryDa…","[2017-12-29, 2017-12-30, … 2018-03-03]"
"""GingerPale""","[2018-04-19, 2018-04-20, … 2018-05-14]"
"""Linkin Park""","[2017-11-28, 2017-11-29, … 2018-01-05]"


### How to do it...

In [22]:
# Build the per-channel list of trending dates (sorted oldest-first) ...
trending_dates_by_channel = (
    df
    .group_by('channel_title')
    .agg(pl.col('trending_date'))
    .with_columns(pl.col('trending_date').list.sort(descending=False))
)

# ... then pull the earliest and latest date out of each sorted list.
trending_dates_by_channel.with_columns(
    first_trending_date=pl.col('trending_date').list.first(),
    last_trending_date=pl.col('trending_date').list.last(),
).head(5)

channel_title,trending_date,first_trending_date,last_trending_date
str,list[date],date,date
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]",2018-01-19,2018-06-14
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]",2017-11-17,2018-01-24
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]",2017-12-08,2018-02-28
"""EVNautilus""",[2018-01-08],2018-01-08,2018-01-08
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]",2018-03-17,2018-03-29


In [23]:
# Index 7 is the 8th item (zero-based). With null_on_oob=True, lists that are
# too short yield null instead of raising.
trending_dates_by_channel.with_columns(
    pl.col('trending_date')
    .list.get(index=7, null_on_oob=True)
    .alias('8th_element')
).head(5)

channel_title,trending_date,8th_element
str,list[date],date
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]",2018-03-03
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]",2018-01-22
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]",2017-12-15
"""EVNautilus""",[2018-01-08],
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]",2018-03-24


In [24]:
# Keep the first 5 and the last 10 dates of each list. list.head() defaults to
# 5 items; the count is spelled out here so it visibly matches the alias.
trending_dates_by_channel.with_columns(
    first_5=pl.col('trending_date').list.head(n=5),
    last_10=pl.col('trending_date').list.tail(n=10),
).head(5)

channel_title,trending_date,first_5,last_10
str,list[date],list[date],list[date]
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-01-19, 2018-01-20, … 2018-01-23]","[2018-06-03, 2018-06-04, … 2018-06-14]"
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]","[2017-11-17, 2017-11-18, … 2017-11-23]","[2017-11-17, 2017-11-18, … 2018-01-24]"
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]","[2017-12-08, 2017-12-09, … 2017-12-12]","[2018-02-19, 2018-02-20, … 2018-02-28]"
"""EVNautilus""",[2018-01-08],[2018-01-08],[2018-01-08]
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-17, 2018-03-18, … 2018-03-21]","[2018-03-20, 2018-03-21, … 2018-03-29]"


In [25]:
# Re-sort each list newest-first and keep the top three dates.
trending_dates_by_channel.with_columns(
    pl.col('trending_date')
    .list.sort(descending=True)
    .list.head(n=3)
    .alias('3_most_recent_dates')
).head(5)

channel_title,trending_date,3_most_recent_dates
str,list[date],list[date]
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-06-14, 2018-06-11, 2018-06-10]"
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]","[2018-01-24, 2018-01-23, 2018-01-22]"
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]","[2018-02-28, 2018-02-27, 2018-02-26]"
"""EVNautilus""",[2018-01-08],[2018-01-08]
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-29, 2018-03-28, 2018-03-27]"


In [26]:
# Same computation as the previous cell: sort each list in descending order,
# then take the first three entries (the three most recent trending dates).
most_recent_first = pl.col('trending_date').list.sort(descending=True)
trending_dates_by_channel.with_columns(
    most_recent_first.list.head(n=3).alias('3_most_recent_dates')
).head(5)

channel_title,trending_date,3_most_recent_dates
str,list[date],list[date]
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-06-14, 2018-06-11, 2018-06-10]"
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]","[2018-01-24, 2018-01-23, 2018-01-22]"
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]","[2018-02-28, 2018-02-27, 2018-02-26]"
"""EVNautilus""",[2018-01-08],[2018-01-08]
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-29, 2018-03-28, 2018-03-27]"


In [27]:
# list.slice(offset, length): a negative offset counts from the end, and
# omitting length runs to the end of the list.
trending_dates_by_channel.select(
    'trending_date',
    pl.col('trending_date')
    .list.slice(offset=0, length=2)
    .alias('first_2_dates'),
    pl.col('trending_date')
    .list.slice(offset=-3, length=1)
    .alias('3rd_date_to_last'),
    pl.col('trending_date')
    .list.slice(offset=7)
    .alias('from_8th_date_to_end'),
).head(5)

trending_date,first_2_dates,3rd_date_to_last,from_8th_date_to_end
list[date],list[date],list[date],list[date]
"[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-01-19, 2018-01-20]",[2018-06-10],"[2018-03-03, 2018-03-04, … 2018-06-14]"
"[2017-11-17, 2017-11-18, … 2018-01-24]","[2017-11-17, 2017-11-18]",[2018-01-22],"[2018-01-22, 2018-01-23, 2018-01-24]"
"[2017-12-08, 2017-12-09, … 2018-02-28]","[2017-12-08, 2017-12-09]",[2018-02-26],"[2017-12-15, 2017-12-16, … 2018-02-28]"
[2018-01-08],[2018-01-08],[],[]
"[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-17, 2018-03-18]",[2018-03-27],"[2018-03-24, 2018-03-25, … 2018-03-29]"


In [28]:
# list.gather picks items by position; indices may repeat and may be negative.
trending_dates_by_channel.select(
    'trending_date',
    pl.col('trending_date').list.gather([0, -1]).alias('first_and_last'),
    pl.col('trending_date').list.gather([0, 0, 0, 0]).alias('first_repeated'),
    # Indices are zero-based, so index 10 is the 11th date (index 7 is labelled
    # '8th_element' earlier in this recipe). null_on_oob=True yields null for
    # lists with fewer than 11 items instead of raising.
    pl.col('trending_date').list.gather([0, 10], null_on_oob=True).alias('first_and_11th_or_null'),
).head()

trending_date,first_and_last,first_repeated,first_and_11th_or_null
list[date],list[date],list[date],list[date]
"[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-01-19, 2018-06-14]","[2018-01-19, 2018-01-19, … 2018-01-19]","[2018-01-19, 2018-03-06]"
"[2017-11-17, 2017-11-18, … 2018-01-24]","[2017-11-17, 2018-01-24]","[2017-11-17, 2017-11-17, … 2017-11-17]","[2017-11-17, null]"
"[2017-12-08, 2017-12-09, … 2018-02-28]","[2017-12-08, 2018-02-28]","[2017-12-08, 2017-12-08, … 2017-12-08]","[2017-12-08, 2018-01-19]"
[2018-01-08],"[2018-01-08, 2018-01-08]","[2018-01-08, 2018-01-08, … 2018-01-08]","[2018-01-08, null]"
"[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-17, 2018-03-29]","[2018-03-17, 2018-03-17, … 2018-03-17]","[2018-03-17, 2018-03-27]"


### There is more...

In [29]:
# Per trending date: the sorted category ids, how many there are, the distinct
# ids, and how many distinct ids there are.
(
    df
    .group_by('trending_date')
    .agg('category_id')
    .with_columns(pl.col('category_id').list.sort())
    # with_columns keeps existing columns, so 'category_id' does not need to
    # be listed again here.
    .with_columns(
        pl.col('category_id').list.len().alias('category_id_cnt'),
        pl.col('category_id').list.unique().alias('category_id_unique'),
        pl.col('category_id').list.unique().list.len().alias('category_id_unique_cnt')
    )
).head()

trending_date,category_id,category_id_cnt,category_id_unique,category_id_unique_cnt
date,list[i64],u32,list[i64],u32
2017-12-13,"[1, 1, … 28]",200,"[1, 2, … 28]",14
2018-02-13,"[1, 1, … 28]",199,"[1, 2, … 28]",14
2018-05-26,"[1, 1, … 43]",200,"[1, 10, … 43]",14
2018-06-13,"[1, 1, … 28]",200,"[1, 10, … 28]",13
2018-02-28,"[1, 1, … 28]",199,"[1, 2, … 28]",14


In [30]:
# Draw three dates per list, with replacement so that lists shorter than three
# items can still be sampled; a fixed seed is passed for repeatable draws.
trending_dates_by_channel.with_columns(
    samples=pl.col("trending_date").list.sample(
        n=3,
        with_replacement=True,
        seed=0,
    )
).head(5)

channel_title,trending_date,samples
str,list[date],list[date]
"""Laura Lee""","[2018-01-19, 2018-01-20, … 2018-06-14]","[2018-04-19, 2018-04-18, 2018-06-11]"
"""Eric Strebel""","[2017-11-17, 2017-11-18, … 2018-01-24]","[2017-11-23, 2017-11-23, 2018-01-24]"
"""MSNBC""","[2017-12-08, 2017-12-09, … 2018-02-28]","[2018-01-28, 2018-01-27, 2018-02-28]"
"""EVNautilus""",[2018-01-08],"[2018-01-08, 2018-01-08, 2018-01-08]"
"""Warped Perception""","[2018-03-17, 2018-03-18, … 2018-03-29]","[2018-03-22, 2018-03-22, 2018-03-29]"


## Applying logic to each element in lists

### Getting ready

In [31]:
# For this recipe, agg_df holds the views and channel titles of every video
# that trended on a given date, each collected into a list.
agg_df = df.group_by('trending_date').agg(
    pl.col('views'),
    pl.col('channel_title'),
)

In [173]:
# Preview the first five aggregated rows.
agg_df.head(5)

trending_date,views,channel_title
date,list[i64],list[str]
2018-02-13,"[266874, 953801, … 736748]","[""NBC Sports"", ""The King of Random"", … ""SmarterEveryDay""]"
2018-03-03,"[716096, 1108125, … 1327815]","[""carrieunderwoodVEVO"", ""nigahiga"", … ""Brian Hull""]"
2018-02-25,"[237081, 1747075, … 295145]","[""Thomas Sanders"", ""Breakfast Club Power 105.1 FM"", … ""Crusoe the Celebrity Dachshund""]"
2018-03-12,"[1611772, 1498364, … 422979]","[""Saturday Night Live"", ""Liza Koshy Too"", … ""BBC News""]"
2018-03-09,"[904177, 3008740, … 1159497]","[""Nintendo"", ""CamilaCabelloVEVO"", … ""Hannah Stocking""]"


### How to do it...

In [34]:
# Inside `list.eval`, each list is evaluated as its own single, unnamed column.
# The four expressions below are four ways of referring to that same column:
#
# 1. `pl.element()` - the conventional way to refer to the list's elements.
# 2. `pl.first()`   - the first column of the evaluation context.
# 3. `pl.last()`    - the last column of the evaluation context.
# 4. `pl.col('')`   - the column whose name is the empty string.
#
# As the output shows, all four columns are identical to 'views': none of these
# expressions picks out a single first or last element of the list.

agg_df.select(
    'views',
    pl.col('views').list.eval(pl.element()).alias('pl.element'),
    pl.col('views').list.eval(pl.first()).alias('pl.first'),
    pl.col('views').list.eval(pl.last()).alias('pl.last'),
    pl.col('views').list.eval(pl.col('')).alias('pl.col')
).head()

views,pl.element,pl.first,pl.last,pl.col
list[i64],list[i64],list[i64],list[i64],list[i64]
"[466511, 337908, … 31990]","[466511, 337908, … 31990]","[466511, 337908, … 31990]","[466511, 337908, … 31990]","[466511, 337908, … 31990]"
"[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]","[266874, 953801, … 736748]"
"[2108246, 851859, … 1139279]","[2108246, 851859, … 1139279]","[2108246, 851859, … 1139279]","[2108246, 851859, … 1139279]","[2108246, 851859, … 1139279]"
"[2131702, 1659538, … 11415161]","[2131702, 1659538, … 11415161]","[2131702, 1659538, … 11415161]","[2131702, 1659538, … 11415161]","[2131702, 1659538, … 11415161]"
"[2768013, 1355968, … 2666775]","[2768013, 1355968, … 2666775]","[2768013, 1355968, … 2666775]","[2768013, 1355968, … 2666775]","[2768013, 1355968, … 2666775]"


This code snippet demonstrates how to manipulate a DataFrame 

agg_df

 using the `polars` library in Python. The primary focus is on the 'views' column, which is evaluated as a list and subjected to various operations to create new columns.

The process begins by selecting the 'views' column from the DataFrame 

agg_df

 using the 

select

 method. This method allows for the selection and transformation of specific columns in the DataFrame.

Next, the 

pl.col('views').list.eval(pl.element())

 function is used to evaluate the 'views' column as a list and apply the 

pl.element()

 function to each element in the list. The result is then renamed to 'pl.element' using the 

alias('pl.element')

 method. This new column contains the individual elements of the 'views' list.

Similarly, the 

pl.col('views').list.eval(pl.first())

 function evaluates the 'views' column as a list and applies the 

pl.first()

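 function. Inside `list.eval`, `pl.first()` selects the first column of the evaluation context, which is the list's elements themselves, so every element is returned unchanged rather than only the first one. The result is renamed to 'pl.first' using the 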

alias('pl.first')

 method.

The 

pl.col('views').list.eval(pl.last())

 function evaluates the 'views' column as a list and applies the 

pl.last()

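 function. Inside `list.eval`, `pl.last()` selects the last column of the evaluation context, which is again the list's elements, so the whole list is returned unchanged rather than only its last element. The result is renamed to 'pl.last' using the 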

alias('pl.last')

 method.

Lastly, the 

pl.col('views').list.eval(pl.col(''))

 function evaluates the 'views' column as a list and applies the 

pl.col('')

 function. The result is renamed to 'pl.col' using the 

alias('pl.col')

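 method. Inside `list.eval` the list's elements form a single column with an empty name, so `pl.col('')` refers to that column and behaves the same as `pl.element()`; as the output shows, the resulting column is identical to the original 'views' list.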

Finally, the 

head()

 method is called to display the first few rows of the resulting DataFrame. This allows for a quick inspection of the changes made to the DataFrame, specifically the addition of the new columns 'pl.element', 'pl.first', 'pl.last', and 'pl.col'.

In summary, this code snippet selects the 'views' column from the DataFrame 

agg_df

, evaluates it as a list, and applies various functions to create new columns. The resulting DataFrame includes the original 'views' column and the new columns 'pl.element', 'pl.first', 'pl.last', and 'pl.col', with the first few rows displayed for inspection.

In [35]:
# Keep the first two channel titles per date, alongside an upper-cased copy
# produced by applying str.to_uppercase to every list element.
channel_titles_df = agg_df.select(
    pl.col("channel_title").list.head(n=2),
    channel_title_upper=(
        pl.col("channel_title")
        .list.eval(pl.element().str.to_uppercase())
        .list.head(n=2)
    ),
)
channel_titles_df.head(5)

channel_title,channel_title_upper
list[str],list[str]
"[""Rob Bliss"", ""BuzzFeed Celeb""]","[""ROB BLISS"", ""BUZZFEED CELEB""]"
"[""NBC Sports"", ""The King of Random""]","[""NBC SPORTS"", ""THE KING OF RANDOM""]"
"[""HBO"", ""Will Smith""]","[""HBO"", ""WILL SMITH""]"
"[""Lucas and Marcus"", ""BuzzFeedBlue""]","[""LUCAS AND MARCUS"", ""BUZZFEEDBLUE""]"
"[""CaseyNeistat"", ""Butch Hartman""]","[""CASEYNEISTAT"", ""BUTCH HARTMAN""]"


In [36]:
# Within each list, keep only the upper-cased titles containing the literal
# character 'A'; lists with no match become empty.
channel_titles_df.with_columns(
    pl.col('channel_title_upper').list.eval(
        pl.element().filter(
            pl.element().str.contains(pattern='A', literal=True)
        )
    )
).head(5)

channel_title,channel_title_upper
list[str],list[str]
"[""Rob Bliss"", ""BuzzFeed Celeb""]",[]
"[""NBC Sports"", ""The King of Random""]","[""THE KING OF RANDOM""]"
"[""HBO"", ""Will Smith""]",[]
"[""Lucas and Marcus"", ""BuzzFeedBlue""]","[""LUCAS AND MARCUS""]"
"[""CaseyNeistat"", ""Butch Hartman""]","[""CASEYNEISTAT"", ""BUTCH HARTMAN""]"


In [37]:
# Rank the views inside each date's list; rank 1 is the highest view count.
agg_df.select(
    'trending_date',
    'views',
    views_rank=pl.col('views').list.eval(
        pl.element().rank(method='dense', descending=True)
    ),
).head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2017-12-13,"[466511, 337908, … 31990]","[80, 89, … 176]"
2018-02-13,"[266874, 953801, … 736748]","[133, 66, … 79]"
2018-03-30,"[2108246, 851859, … 1139279]","[45, 102, … 74]"
2018-05-14,"[2131702, 1659538, … 11415161]","[86, 103, … 23]"
2018-02-10,"[2768013, 1355968, … 2666775]","[23, 45, … 24]"


In [38]:
# Store the per-date dense ranking of views (1 = most viewed) for later steps.
views_rank_df = agg_df.select(
    'trending_date',
    'views',
    views_rank=pl.col('views').list.eval(
        pl.element().rank(method='dense', descending=True)
    ),
)
views_rank_df.head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2017-12-13,"[466511, 337908, … 31990]","[80, 89, … 176]"
2018-02-13,"[266874, 953801, … 736748]","[133, 66, … 79]"
2018-03-30,"[2108246, 851859, … 1139279]","[45, 102, … 74]"
2018-05-14,"[2131702, 1659538, … 11415161]","[86, 103, … 23]"
2018-02-10,"[2768013, 1355968, … 2666775]","[23, 45, … 24]"


In [39]:
# Flatten the paired lists to one row per video, keep ranks 1-3, then
# re-collect the surviving rows into lists per trending date.
top3_views_df = (
    views_rank_df
    .explode(['views', 'views_rank'])
    .filter(pl.col('views_rank').le(3))
    .group_by('trending_date')
    .agg(pl.all())
)
top3_views_df.head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2018-05-23,"[80738011, 173478072, 65455235]","[2, 1, 3]"
2018-01-31,"[20814284, 12458711, 23742391]","[2, 3, 1]"
2018-04-18,"[50247353, 34179417, 38212279]","[1, 3, 2]"
2018-04-28,"[49185287, 61920752, 58899002]","[3, 1, 2]"
2018-06-12,"[85159766, 39391109, 40870858]","[1, 3, 2]"


In [40]:
# Exploding both list columns together yields one row per (views, rank) pair.
views_rank_df.explode(['views', 'views_rank']).head(5)

trending_date,views,views_rank
date,i64,u32
2017-12-13,466511,80
2017-12-13,337908,89
2017-12-13,1151705,46
2017-12-13,1314496,42
2017-12-13,2880365,17


In [41]:
# For every item, compute how far it is below the largest value in its list;
# the top-viewed item therefore gets 0.
gap_to_max = pl.element().max() - pl.element()
top3_views_df.select(
    'views_rank',
    'views',
    pl.col('views').list.eval(gap_to_max).alias('diff from the most views'),
).head(5)

views_rank,views,diff from the most views
list[u32],list[i64],list[i64]
"[2, 1, 3]","[80738011, 173478072, 65455235]","[92740061, 0, 108022837]"
"[2, 3, 1]","[20814284, 12458711, 23742391]","[2928107, 11283680, 0]"
"[1, 3, 2]","[50247353, 34179417, 38212279]","[0, 16067936, 12035074]"
"[3, 1, 2]","[49185287, 61920752, 58899002]","[12735465, 0, 3021750]"
"[1, 3, 2]","[85159766, 39391109, 40870858]","[0, 45768657, 44288908]"


### There is more...

In [42]:
# Recall what the top-3 frame looks like before applying set operations.
top3_views_df.head(5)

trending_date,views,views_rank
date,list[i64],list[u32]
2018-05-23,"[80738011, 173478072, 65455235]","[2, 1, 3]"
2018-01-31,"[20814284, 12458711, 23742391]","[2, 3, 1]"
2018-04-18,"[50247353, 34179417, 38212279]","[1, 3, 2]"
2018-04-28,"[49185287, 61920752, 58899002]","[3, 1, 2]"
2018-06-12,"[85159766, 39391109, 40870858]","[1, 3, 2]"


In [43]:
# Split each rank list into two overlapping 2-item lists (first two and last
# two), then compare them with the four list set operations.
(
    top3_views_df
    .with_columns(
        views_rank_1=pl.col('views_rank').list.slice(offset=0, length=2),
        views_rank_2=pl.col('views_rank').list.slice(offset=-2, length=2),
    )
    .select(
        'views_rank_1',
        'views_rank_2',
        intersection=pl.col('views_rank_1').list.set_intersection(
            pl.col('views_rank_2')
        ),
        union=pl.col('views_rank_1').list.set_union(
            pl.col('views_rank_2')
        ),
        difference=pl.col('views_rank_1').list.set_difference(
            pl.col('views_rank_2')
        ),
        symmetric_difference=pl.col('views_rank_1').list.set_symmetric_difference(
            pl.col('views_rank_2')
        ),
    )
    .head(5)
)

views_rank_1,views_rank_2,intersection,union,difference,symmetric_difference
list[u32],list[u32],list[u32],list[u32],list[u32],list[u32]
"[2, 1]","[1, 3]",[1],"[2, 1, 3]",[2],"[2, 3]"
"[2, 3]","[3, 1]",[3],"[2, 3, 1]",[2],"[2, 1]"
"[1, 3]","[3, 2]",[3],"[1, 3, 2]",[1],"[1, 2]"
"[3, 1]","[1, 2]",[1],"[3, 1, 2]",[3],"[3, 2]"
"[1, 3]","[3, 2]",[3],"[1, 3, 2]",[1],"[1, 2]"


## Working with structs and JSON data

### Getting ready

In [44]:
import polars as pl

In [45]:
# Read the JSON file into a DataFrame. As the dtypes in the output show,
# nested objects arrive as struct / list[struct] columns.
df = pl.read_json(source='../data/ga_20170801.json')
df.head(5)

visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,clientId,channelGrouping,socialEngagementType
null,str,str,str,str,struct[13],struct[9],struct[17],struct[11],list[struct[2]],list[struct[33]],str,null,null,str,str
,"""1""","""1501591568""","""1501591568""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Europe"",""Southern Europe"",""Greece"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""tellas.gr"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}",[],"[{[],null,{""/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/bags/"",""/google+zipper+front+sports+bag.axd"",""""},[],""true"",[],""PAGE"",null,""1"",""5"",null,null,null,""true"",{""(not set)"",""Bags"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},{null,""true"",null,null},null,""0"",""web"",null,[],[],[],{""0"",""1"",null},null,{null,null,null,null,""(not set)"",null,""No"","" : 
""},[],{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""shop.googlemerchandisestore.com/google+redesign/bags/google+zipper+front+sports+bag.axd"",""0""},null,""46"",null,""https://www.google.gr/"",""true""}]","""3418334011779872055""",,,"""Organic Search""","""Not Socially Engaged"""
,"""2""","""1501589647""","""1501589647""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Asia"",""Southern Asia"",""India"",""Maharashtra"",""(not set)"",""Mumbai"",""not available in demo dataset"",""unknown.unknown"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""APAC""}]","[{[],null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],""true"",[],""PAGE"",null,""1"",""5"",null,null,null,""true"",{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},{null,""true"",null,null},null,""0"",""web"",null,[],[],[],{""0"",""1"",null},null,{null,null,null,null,""(not set)"",null,""No"","" : ""},[],{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},null,""14"",null,""https://analytics.google.com/analytics/web/"",""true""}]","""2474397855041322408""",,,"""Referral""","""Not 
Socially Engaged"""
,"""1""","""1501616621""","""1501616621""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Europe"",""Northern Europe"",""United Kingdom"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""as9105.com"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""EMEA""}]","[{[],null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],""true"",[],""PAGE"",null,""1"",""12"",null,null,null,""true"",{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},{null,""true"",null,null},null,""0"",""web"",null,[],[],[],{""0"",""1"",null},null,{null,null,null,null,""(not set)"",null,""No"","" : 
""},[],{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},null,""43"",null,""https://analytics.google.com/analytics/web/?utm_source=demoaccount&utm_medium=demoaccount&utm_campaign=demoaccount"",""true""}]","""5870462820713110108""",,,"""Referral""","""Not Socially Engaged"""
,"""1""","""1501601200""","""1501601200""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Firefox"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Americas"",""Northern America"",""United States"",""Texas"",""Dallas-Ft. Worth TX"",""Dallas"",""not available in demo dataset"",""h5colo.com"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""North America""}]","[{[],null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],""true"",[],""PAGE"",null,""1"",""8"",null,null,null,""true"",{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},{null,""true"",null,null},null,""0"",""web"",null,[],[],[],{""0"",""1"",null},null,{null,null,null,null,""(not set)"",null,""No"","" : 
""},[],{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},null,""26"",null,""https://analytics.google.com/analytics/web/"",""true""}]","""9397809171349480379""",,,"""Referral""","""Not Socially Engaged"""
,"""1""","""1501615525""","""1501615525""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","{""Chrome"",""not available in demo dataset"",""not available in demo dataset"",""Windows"",""not available in demo dataset"",""false"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",null,""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""desktop""}","{""Americas"",""Northern America"",""United States"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset"",""(not set)"",""not available in demo dataset"",""not available in demo dataset"",""not available in demo dataset""}","[{""4"",""North America""}]","[{[],null,{""/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com"",""Page Unavailable"",null,null,""/google+redesign/"",""/shop+by+brand/"",""/youtube"",""""},[],""true"",[],""PAGE"",null,""1"",""12"",null,null,null,""true"",{""(not set)"",""Brands"",""(not set)"",""(not set)"",""(not set)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",""(entrance)"",null,""1"",null,null,null},{null,""true"",null,null},null,""0"",""web"",null,[],[],[],{""0"",""1"",null},null,{null,null,null,null,""(not set)"",null,""No"","" : 
""},[],{null,null,null,null,null,null,null,null,""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""shop.googlemerchandisestore.com/google+redesign/shop+by+brand/youtube"",""0""},null,""25"",null,""https://adwords.google.com/analytics/web/?hl=en_US&__o=cues&authuser=0"",""true""}]","""6089902943184578335""",,,"""Referral""","""Not Socially Engaged"""


In [46]:
# Keep only the columns this recipe works with: three flat string columns
# plus the nested `totals`, `trafficSource` and `customDimensions`.
cols = [
    'visitId',
    'date',
    'totals',
    'trafficSource',
    'customDimensions',
    'channelGrouping',
]
df = df.select(cols)
df.head(5)

visitId,date,totals,trafficSource,customDimensions,channelGrouping
str,str,struct[13],struct[9],list[struct[2]],str
"""1501591568""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}",[],"""Organic Search"""
"""1501589647""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""APAC""}]","""Referral"""
"""1501616621""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""EMEA""}]","""Referral"""
"""1501601200""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]","""Referral"""
"""1501615525""","""20170801""","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]","""Referral"""


In [47]:
# Look at the nested columns on their own (two structs and a list of structs).
df.select(['totals', 'trafficSource', 'customDimensions']).head(5)

totals,trafficSource,customDimensions
struct[13],struct[9],list[struct[2]]
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{null,""(not set)"",""(direct)"",""(none)"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}",[]
"{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""APAC""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""EMEA""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""analytics.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]"
"{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}","{""/analytics/web/"",""(not set)"",""adwords.google.com"",""referral"",null,null,{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null},null,null}","[{""4"",""North America""}]"


### How to do it...

In [48]:
# Pack three flat columns into a single struct column named `structFromCols`;
# the struct fields keep the source column names.
df = df.with_columns(
    structFromCols=pl.struct(['visitId', 'date', 'channelGrouping']),
)

In [49]:
# Show the source columns side by side with the struct built from them.
(
    df
    .select(
        pl.col('visitId'),
        pl.col('date'),
        pl.col('channelGrouping'),
        structFromCols=pl.struct(
            pl.col('visitId'),
            pl.col('date'),
            pl.col('channelGrouping'),
        ),
    )
    .head(5)
)

visitId,date,channelGrouping,structFromCols
str,str,str,struct[3]
"""1501591568""","""20170801""","""Organic Search""","{""1501591568"",""20170801"",""Organic Search""}"
"""1501589647""","""20170801""","""Referral""","{""1501589647"",""20170801"",""Referral""}"
"""1501616621""","""20170801""","""Referral""","{""1501616621"",""20170801"",""Referral""}"
"""1501601200""","""20170801""","""Referral""","{""1501601200"",""20170801"",""Referral""}"
"""1501615525""","""20170801""","""Referral""","{""1501615525"",""20170801"",""Referral""}"


In [50]:
# Collect every visitId per channel, count them, order channels by that count,
# then convert each visitId list into a struct.
# NOTE(review): the displayed result is struct[12] for every row even though
# numVisits goes up to 534. The struct width appears to come from the first
# row (12 ids after sorting), so longer lists lose their remaining elements.
# Presumably this is the default field-inference strategy of
# list.to_struct() — confirm against the installed Polars version.
(
    df
    .group_by('channelGrouping')
    .agg(
        pl.col('visitId'),
        numVisits=pl.col('visitId').len(),
    )
    .sort(by='numVisits')
    .with_columns(
        struct_from_list=pl.col('visitId').list.to_struct(),
    )
)

channelGrouping,visitId,numVisits,struct_from_list
str,list[str],u32,struct[12]
"""Display""","[""1501651856"", ""1501625928"", … ""1501638116""]",12,"{""1501651856"",""1501625928"",""1501611633"",""1501625068"",""1501612878"",""1501616158"",""1501607332"",""1501622703"",""1501621231"",""1501649570"",""1501647417"",""1501638116""}"
"""Paid Search""","[""1501610896"", ""1501644116"", … ""1501613648""]",20,"{""1501610896"",""1501644116"",""1501574187"",""1501614708"",""1501625398"",""1501636143"",""1501617844"",""1501628199"",""1501618328"",""1501624893"",""1501650118"",""1501580936""}"
"""Affiliates""","[""1501604627"", ""1501572101"", … ""1501635918""]",29,"{""1501604627"",""1501572101"",""1501638418"",""1501589595"",""1501635523"",""1501588687"",""1501602677"",""1501656633"",""1501589444"",""1501654572"",""1501588961"",""1501588902""}"
"""Referral""","[""1501589647"", ""1501616621"", … ""1501607798""]",106,"{""1501589647"",""1501616621"",""1501601200"",""1501615525"",""1501589650"",""1501573710"",""1501613382"",""1501630140"",""1501656976"",""1501602227"",""1501620300"",""1501611288""}"
"""Social""","[""1501590147"", ""1501655923"", … ""1501652602""]",136,"{""1501590147"",""1501655923"",""1501640054"",""1501596419"",""1501591307"",""1501616949"",""1501649584"",""1501579329"",""1501585058"",""1501618027"",""1501653304"",""1501614595""}"
"""Direct""","[""1501586309"", ""1501587435"", … ""1501610792""]",163,"{""1501586309"",""1501587435"",""1501653660"",""1501608816"",""1501611913"",""1501584277"",""1501578373"",""1501587465"",""1501621325"",""1501655032"",""1501622827"",""1501575169""}"
"""Organic Search""","[""1501591568"", ""1501583103"", … ""1501625964""]",534,"{""1501591568"",""1501583103"",""1501631547"",""1501599064"",""1501585229"",""1501639903"",""1501576309"",""1501573981"",""1501618526"",""1501578968"",""1501599268"",""1501596177""}"


In [52]:
# Duplicate the struct under a second name and unnest only the copy, so the
# original struct stays visible next to its unpacked fields.
(
    df
    .select(
        pl.col('structFromCols'),
        structFromColsToBeUnpacked=pl.col('structFromCols'),
    )
    .unnest(['structFromColsToBeUnpacked'])
    .head(5)
)

structFromCols,visitId,date,channelGrouping
struct[3],str,str,str
"{""1501591568"",""20170801"",""Organic Search""}","""1501591568""","""20170801""","""Organic Search"""
"{""1501589647"",""20170801"",""Referral""}","""1501589647""","""20170801""","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""1501616621""","""20170801""","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""1501601200""","""20170801""","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""1501615525""","""20170801""","""Referral"""


In [53]:
# Expand the `trafficSource` struct into one column per field.
(
    df
    .select('trafficSource')
    .unnest(['trafficSource'])
    .head(5)
)

referralPath,campaign,source,medium,keyword,adContent,adwordsClickInfo,isTrueDirect,campaignCode
str,str,str,str,str,null,struct[12],str,null
,"""(not set)""","""(direct)""","""(none)""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web/""","""(not set)""","""analytics.google.com""","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web/""","""(not set)""","""analytics.google.com""","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web/""","""(not set)""","""analytics.google.com""","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,
"""/analytics/web/""","""(not set)""","""adwords.google.com""","""referral""",,,"{null,null,null,null,null,null,""not available in demo dataset"",null,null,null,null,null}",,


In [54]:
# Rename the struct's fields to a / b / c before unnesting, so the unpacked
# columns carry the new names instead of the original column names.
(
    df
    .select(
        pl.col('structFromCols'),
        renamedStructToBeUnpacked=(
            pl.col('structFromCols').struct.rename_fields(names=['a', 'b', 'c'])
        ),
    )
    .unnest(['renamedStructToBeUnpacked'])
    .head(5)
)

structFromCols,a,b,c
struct[3],str,str,str
"{""1501591568"",""20170801"",""Organic Search""}","""1501591568""","""20170801""","""Organic Search"""
"{""1501589647"",""20170801"",""Referral""}","""1501589647""","""20170801""","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""1501616621""","""20170801""","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""1501601200""","""20170801""","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""1501615525""","""20170801""","""Referral"""


In [55]:
# Pull a single field out of the struct; the extracted column is named after
# the field (`channelGrouping`).
(
    df
    .select(
        pl.col('structFromCols'),
        pl.col('structFromCols').struct.field(name='channelGrouping'),
    )
    .head(5)
)

structFromCols,channelGrouping
struct[3],str
"{""1501591568"",""20170801"",""Organic Search""}","""Organic Search"""
"{""1501589647"",""20170801"",""Referral""}","""Referral"""
"{""1501616621"",""20170801"",""Referral""}","""Referral"""
"{""1501601200"",""20170801"",""Referral""}","""Referral"""
"{""1501615525"",""20170801"",""Referral""}","""Referral"""


In [56]:
# Pair each channel with the `source` field nested inside `trafficSource`,
# de-duplicate the pairs as structs, then unpack and sort them for display.
(
    df
    .select(
        channelAndSource=pl.struct(
            'channelGrouping',
            pl.col('trafficSource').struct.field(name='source'),
        ).unique(),
    )
    .unnest(['channelAndSource'])
    .sort(['channelGrouping', 'source'])
)

channelGrouping,source
str,str
"""Affiliates""","""Partners"""
"""Direct""","""(direct)"""
"""Display""","""(direct)"""
"""Display""","""dfa"""
"""Organic Search""","""(direct)"""
…,…
"""Social""","""groups.google.com"""
"""Social""","""l.facebook.com"""
"""Social""","""m.facebook.com"""
"""Social""","""quora.com"""


### There is more...

In [57]:
# Round trip: serialise the `totals` struct to a JSON string, then parse that
# string back into a struct.
total_struct_to_str_expr = pl.col('totals').struct.json_encode()
(
    df
    .select(
        total_str=total_struct_to_str_expr,
        total_struct=total_struct_to_str_expr.str.json_decode(),
    )
    .head(5)
)

total_str,total_struct
str,struct[13]
"""{""visits"":""1"",""hits"":""1"",""page…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",""hits"":""1"",""page…","{""1"",""1"",""1"",null,""1"",null,null,null,null,null,null,null,""1""}"
"""{""visits"":""1"",""hits"":""1"",""page…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",""hits"":""1"",""page…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
"""{""visits"":""1"",""hits"":""1"",""page…","{""1"",""1"",""1"",null,""1"",null,null,""1"",null,null,null,null,""1""}"
