# Working with Nested Data Structures

## Creating lists

### How to do it...

In [21]:
import polars as pl

In [106]:
# Load the trending-videos CSV, asking Polars to parse date-like columns
# where it can, then preview the schema with two sample values per column.
df = pl.read_csv(
    '../data/us_videos.csv',
    try_parse_dates=True,
)
df.glimpse(max_items_per_column=2)

Rows: 40949
Columns: 16
$ video_id                             <str> '2kyS6SvSYSE', '1ZAPwfrtAFY'
$ trending_date                        <str> '17.14.11', '17.14.11'
$ title                                <str> 'WE WANT TO TALK ABOUT OUR MARRIAGE', 'The Trump Presidency: Last Week Tonight with John Oliver (HBO)'
$ channel_title                        <str> 'CaseyNeistat', 'LastWeekTonight'
$ category_id                          <i64> 22, 24
$ publish_time           <datetime[μs, UTC]> 2017-11-13 17:13:01+00:00, 2017-11-13 07:30:00+00:00
$ tags                                 <str> 'SHANtell martin', 'last week tonight trump presidency|last week tonight donald trump|john oliver trump|donald trump'
$ views                                <i64> 748374, 2418783
$ likes                                <i64> 57527, 97185
$ dislikes                             <i64> 2966, 6146
$ comment_count                        <i64> 15954, 12703
$ thumbnail_link                       <str> 'https://i.ytimg

In [107]:
# Show the first five rows of the loaded frame.
df.head(5)

video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
str,str,str,str,i64,"datetime[μs, UTC]",str,i64,i64,i64,i64,str,bool,bool,bool,str
"""2kyS6SvSYSE""","""17.14.11""","""WE WANT TO TAL…","""CaseyNeistat""",22,2017-11-13 17:13:01 UTC,"""SHANtell marti…",748374,57527,2966,15954,"""https://i.ytim…",False,False,False,"""SHANTELL'S CHA…"
"""1ZAPwfrtAFY""","""17.14.11""","""The Trump Pres…","""LastWeekTonigh…",24,2017-11-13 07:30:00 UTC,"""last week toni…",2418783,97185,6146,12703,"""https://i.ytim…",False,False,False,"""One year after…"
"""5qpjK5DgCt4""","""17.14.11""","""Racist Superma…","""Rudy Mancuso""",23,2017-11-12 19:05:24 UTC,"""racist superma…",3191434,146033,5339,8181,"""https://i.ytim…",False,False,False,"""WATCH MY PREVI…"
"""puqaWrEC7tY""","""17.14.11""","""Nickelback Lyr…","""Good Mythical …",24,2017-11-13 11:00:04 UTC,"""rhett and link…",343168,10172,666,2146,"""https://i.ytim…",False,False,False,"""Today we find …"
"""d380meD0W0M""","""17.14.11""","""I Dare You: GO…","""nigahiga""",24,2017-11-12 18:01:41 UTC,"""ryan|higa|higa…",2095731,132235,1989,17518,"""https://i.ytim…",False,False,False,"""I know it's be…"


In [108]:
# trending_date was read as a string in yy.dd.mm order (e.g. '17.14.11'),
# so convert it to a Date using an explicit format.
# NOTE(review): this rebinds df in place, so the cell is meant to run once
# per fresh load of the CSV.
df = df.with_columns(
    pl.col('trending_date').str.to_date(format='%y.%d.%m')
)

In [62]:
# Confirm the conversion by looking up the column's dtype in the schema.
df.schema['trending_date']

Date

In [63]:
# Split the pipe-delimited tags string into a list[str] column and show it
# next to the original string for comparison.
df.select(
    pl.col('tags'),
    pl.col('tags').str.split(by='|').alias('tags in list'),
).head()

tags,tags in list
str,list[str]
"""SHANtell marti…","[""SHANtell martin""]"
"""last week toni…","[""last week tonight trump presidency"", ""last week tonight donald trump"", … ""donald trump""]"
"""racist superma…","[""racist superman"", ""rudy"", … "" Lele Pons""]"
"""rhett and link…","[""rhett and link"", ""gmm"", … ""challenge""]"
"""ryan|higa|higa…","[""ryan"", ""higa"", … ""fail""]"


In [67]:
# Aggregating a column without a reducing function collects each group's
# values into a list: one row per trending_date holding that day's video ids.
# The groups shown below are not in date order; group_by takes
# maintain_order=True if a stable order is wanted.
df.group_by('trending_date').agg(pl.col('video_id')).head()

trending_date,video_id
date,list[str]
2018-03-03,"[""HgknAaKNaMM"", ""tugFFhML7VY"", … ""rZQepOFnYi8""]"
2018-02-28,"[""I8Umj580ls0"", ""YskVs5VyqHk"", … ""cy9W-ZywVPc""]"
2018-02-13,"[""uzK1OmxS4CE"", ""Nsm8l89x2H4"", … ""9reizHjwuNY""]"
2018-02-07,"[""wbSwFU6tY1c"", ""JQbjS0_ZfJ0"", … ""LtlkeMfbdpM""]"
2018-02-10,"[""7kLO2AB5SPM"", ""m4faDISwSVo"", … ""Bhplg8YCu-M""]"


In [76]:
# Combine the four engagement counters into a single list column,
# producing one list per video.
df.select(
    pl.concat_list(
        ['views', 'likes', 'dislikes', 'comment_count']
    ).alias('engagement')
).head()

engagement
list[i64]
"[748374, 57527, … 15954]"
"[2418783, 97185, … 12703]"
"[3191434, 146033, … 8181]"
"[343168, 10172, … 2146]"
"[2095731, 132235, … 17518]"


### There is more...

In [124]:
# Lists can nest: each of the two rows holds four inner lists of three
# integers, displayed below as list[list[i64]].
# NOTE: this rebinds df, replacing the videos frame loaded earlier.
df = pl.DataFrame({
    'nested_list': [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
    ]
})


In [125]:
# Display the frame; the header row shows the inferred nested dtype.
df

nested_list
list[list[i64]]
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"
"[[1, 2, 3], [4, 5, 6], … [10, 11, 12]]"


In [128]:
# Same layout as the previous nested frame, except the very first element is
# the string 'a' instead of an integer. With mixed element types the column
# does not get a list dtype; the output below shows it stored as `object`.
df = pl.DataFrame({
    'nested_list': [
        [['a', 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
    ]
})

In [129]:
# Display the frame to inspect the dtype Polars assigned to the mixed data.
df

nested_list
object
"[['a', 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]"
"[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]"
