In [2]:
import polars as pl
from glob import glob
import json
from pathlib import Path
from datetime import datetime

In [12]:
movies = []

rating_history = 2
for movie in glob("../res/movie_data/*.json"):

    moviedata = {}
    movie = Path(movie)
    if " " in movie.stem:
        #print(f"skipping {movie.stem}")
        continue
    moviedata["id"] = movie.stem

    # get release date
    with open(movie) as f:
        movieinfo = json.load(f)
        moviedata["release_date"] = movieinfo["release_date"]
    
    try:
        with open(f"../res/graph_data/{movie.stem}.json") as f:
         measures = json.load(f)

         # ordered from earliest to latest
         last_measures = measures["data"][0]["y"][-5:]
         last_update = measures["data"][0]["x"][-1]
         moviedata["measures"] = last_measures
         moviedata["last_update"] = last_update
    except FileNotFoundError:
        pass

    movies.append(moviedata)
df = pl.DataFrame(movies)
df = df.with_columns(pl.col("last_update").apply(lambda x: datetime.fromisoformat(x)))
df = df.with_columns(pl.col("release_date").str.strptime(pl.Date, "%Y-%m-%d", strict=False))
df
    # movie = Path(movie)
    # movie =pl.read_json(movie)

id,release_date,measures,last_update
str,date,list[f64],datetime[μs]
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159


In [13]:

# compute derivative-oïde of last measures
# idea is to get a value representing the amount of changes 

op = pl.element().rolling_apply(lambda x: abs(x[1] - x[0]), 2).sum()
df = df.with_columns(pl.col("measures").arr.eval(op, parallel=True).alias("note_change"))
df

id,release_date,measures,last_update,note_change
str,date,list[f64],datetime[μs],list[f64]
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01]
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01]
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0]
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0]
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0]
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0]
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0]
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0]
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0]
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0]


In [14]:
# compute days since release
# TODO: make this something so that most recent have a high value?
#       just keep release_date's UNIX timestamp?
now = datetime.now()
df = df.with_columns((now - pl.col("release_date")).dt.days().alias("days_since_release"))
df

id,release_date,measures,last_update,note_change,days_since_release
str,date,list[f64],datetime[μs],list[f64],i64
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01],8465
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01],136
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0],2516
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0],3551
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0],8900
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0],4223
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0],267
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0],1134
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0],10076
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0],8374


In [15]:
df = df.with_columns(((now - pl.col("last_update")).dt.days()).alias("days_since_update"))
df

id,release_date,measures,last_update,note_change,days_since_release,days_since_update
str,date,list[f64],datetime[μs],list[f64],i64,i64
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01],8465,3
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01],136,1
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0],2516,3
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0],3551,2
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0],8900,19
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0],4223,2
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0],267,5
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0],1134,6
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0],10076,0
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0],8374,1


In [18]:
df = df.with_columns([
    (1/pl.col("days_since_release")).log().alias("A"), # days since release metric
    (pl.col("days_since_update").exp().alias("B")), # days since update metric 
    (pl.col("note_change").arr.get(-1).alias("C"))
    ])
#df.with_columns((pl.col("days_since_release").log()).alias("A2"))

df.sort(by="C")


id,release_date,measures,last_update,note_change,days_since_release,days_since_update,A,B,C
str,date,list[f64],datetime[μs],list[f64],i64,i64,f64,f64,f64
"""promising-youn...",2020-12-13,"[3.72, 3.72]",2023-03-26 12:32:43.801638,[null],846,12,-6.740519,162754.791419,
"""man-of-steel""",2013-06-12,"[2.97, 2.97]",2023-03-24 01:25:28.514978,[null],3587,15,-8.185071,3.2690e6,
"""spencer-2021""",2021-11-04,"[3.73, 3.73]",2023-03-22 01:26:03.678485,[null],520,17,-6.253829,2.4155e7,
"""nobody-2021""",2021-03-18,"[3.55, 3.55]",2023-03-24 01:25:21.608175,[null],751,15,-6.621406,3.2690e6,
"""halloween""",1978-10-24,,,,16237,,-9.695048,,
"""easy-a""",2010-09-16,"[3.46, 3.46]",2023-04-06 12:33:30.776338,[null],4587,1,-8.430981,2.718282,
"""top-gun""",1986-05-16,"[3.35, 3.35]",2023-03-24 01:25:27.060825,[null],13476,15,-9.508666,3.2690e6,
"""tangled""",2010-11-24,,,,4518,,-8.415825,,
"""spring-breaker...",2013-01-24,"[2.82, 2.82]",2023-03-28 12:37:49.614545,[null],3726,10,-8.223091,22026.465795,
"""air""",2023-04-05,,,,3,,-1.098612,,
