In [142]:
import polars as pl
from glob import glob
import json
from pathlib import Path
from datetime import datetime

In [143]:
movies = []

rating_history = 2
for movie in glob("../res/movie_data/*.json"):

    moviedata = {}
    movie = Path(movie)
    if " " in movie.stem:
        #print(f"skipping {movie.stem}")
        continue
    moviedata["id"] = movie.stem

    # get release date
    with open(movie) as f:
        movieinfo = json.load(f)
        moviedata["release_date"] = movieinfo["release_date"]
    
    try:
        with open(f"../res/graph_data/{movie.stem}.json") as f:
         measures = json.load(f)

         # ordered from earliest to latest
         last_measures = measures["data"][0]["y"][-5:]
         last_update = measures["data"][0]["x"][-1]
         moviedata["measures"] = last_measures
         moviedata["last_update"] = last_update
    except FileNotFoundError:
        pass

    movies.append(moviedata)
df = pl.DataFrame(movies)
df = df.with_columns(pl.col("last_update").apply(lambda x: datetime.fromisoformat(x)))
df = df.with_columns(pl.col("release_date").str.strptime(pl.Date, "%Y-%m-%d", strict=False))
df
    # movie = Path(movie)
    # movie =pl.read_json(movie)

id,release_date,measures,last_update
str,date,list[f64],datetime[μs]
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159


In [144]:

# compute derivative-oïde of last measures
# idea is to get a value representing the amount of changes 
# TODO: Find a way of managing when you have rathing history = 5 and nb measures = 2
def sum_changes(x):
    try:
        return abs(x[1] - x[0])
    except IndexError:
        return 0

#op = pl.element().rolling_apply(lambda x: abs(x[1] - x[0]), 2, min_periods=2).sum()
op = pl.element().rolling_apply(sum_changes, rating_history).sum()
df = df.with_columns(pl.col("measures").arr.eval(op, parallel=True).alias("note_change"))
df

id,release_date,measures,last_update,note_change
str,date,list[f64],datetime[μs],list[f64]
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01]
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01]
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0]
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0]
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0]
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0]
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0]
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0]
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0]
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0]


In [145]:
# compute days since release
# TODO: make this something so that most recent have a high value?
#       just keep release_date's UNIX timestamp?
now = datetime.now()
df = df.with_columns((now - pl.col("release_date")).dt.days().alias("days_since_release"))
df

id,release_date,measures,last_update,note_change,days_since_release
str,date,list[f64],datetime[μs],list[f64],i64
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01],8465
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01],136
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0],2516
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0],3551
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0],8900
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0],4223
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0],267
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0],1134
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0],10076
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0],8374


In [146]:
df = df.with_columns(((now - pl.col("last_update")).dt.days()).alias("days_since_update"))
df

id,release_date,measures,last_update,note_change,days_since_release,days_since_update
str,date,list[f64],datetime[μs],list[f64],i64,i64
"""scream-3""",2000-02-03,"[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,[0.01],8465,3
"""glass-onion""",2022-11-23,"[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,[0.01],136,2
"""x-men""",2016-05-18,"[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,[0.0],2516,3
"""the-conjuring""",2013-07-18,"[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,[0.0],3551,3
"""a-bugs-life""",1998-11-25,[3.43],2023-03-19 19:34:07.266026,[0.0],8900,20
"""drive-2011""",2011-09-15,"[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,[0.0],4223,3
"""where-the-craw...",2022-07-15,"[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,[0.0],267,5
"""onward-2020""",2020-02-29,"[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,[0.0],1134,6
"""fallen-angels""",1995-09-06,"[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,[0.0],10076,0
"""gladiator-2000...",2000-05-04,"[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,[0.0],8374,1


In [147]:
# try things with exp
df = df.with_columns([
    #(1/pl.col("days_since_release")).log().alias("A"), # days since release metric
    #(pl.col("days_since_update").exp().alias("B")), # days since update metric 
    (pl.col("note_change").arr.get(-1).fill_null(0).alias("note_variability"))
    ])
#df.with_columns((pl.col("days_since_release").log()).alias("A2"))



In [148]:
# try norm

df = df.with_columns([
    ( (pl.col("days_since_release") - pl.col("days_since_release").min()) / (pl.col("days_since_release").max() - pl.col("days_since_release").min())).alias("dsr_norm"), # days since release metric
    ( (pl.col("days_since_update") - pl.col("days_since_update").min()) / (pl.col("days_since_update").max() - pl.col("days_since_update").min())).alias("dsu_norm"), # days since update metric
    ( (pl.col("note_variability") - pl.col("note_variability").min()) / (pl.col("note_variability").max() - pl.col("note_variability").min())).alias("note_var_norm"), # days since update metric
    #(pl.col("days_since_update").exp().alias("B")), # days since update metric 
    #(pl.col("note_change").arr.get(-1).alias("C"))
    ])
#df.with_columns((pl.col("days_since_release").log()).alias("A2"))

df = df.with_columns([
    (1 -pl.col("dsr_norm")).abs().alias("inv_dsr_norm")
])

df.sort("days_since_release")


id,release_date,measures,last_update,note_change,days_since_release,days_since_update,note_variability,dsr_norm,dsu_norm,note_var_norm,inv_dsr_norm
str,date,list[f64],datetime[μs],list[f64],i64,i64,f64,f64,f64,f64,f64
"""walle""",,"[4.15, 4.15, ... 4.15]",2023-03-31 01:27:36.711676,[0.0],,8,0.0,,0.285714,0.0,
"""old-2021""",,"[2.53, 2.53, 2.53]",2023-04-08 01:20:14.222453,[0.0],,0,0.0,,0.0,0.0,
"""oppenheimer-20...",2023-07-19,,,,-101,,0.0,0.0,,0.0,1.0
"""barbie""",2023-07-19,,,,-101,,0.0,0.0,,0.0,1.0
"""oppenheimer""",2023-07-19,,,,-101,,0.0,0.0,,0.0,1.0
"""asteroid-city""",2023-06-09,,,,-61,,0.0,0.001305,,0.0,0.998695
"""past-lives""",2023-06-02,[4.21],2023-03-19 19:34:16.982882,[0.0],-54,20,0.0,0.001533,0.714286,0.0,0.998467
"""the-little-mer...",2023-05-24,"[3.67, 3.67, ... 3.67]",2023-04-03 12:34:20.420547,[0.0],-45,5,0.0,0.001827,0.178571,0.0,0.998173
"""evil-dead-rise...",2023-04-19,"[3.87, 3.88]",2023-03-30 12:36:47.826482,[null],-10,9,0.0,0.002969,0.321429,0.0,0.997031
"""beau-is-afraid...",2023-04-14,"[3.99, 3.99]",2023-04-08 01:20:12.871671,[null],-5,0,0.0,0.003132,0.0,0.0,0.996868


In [151]:
fac_var = 200
fac_dsr = 1
fac_dsu = 1
df = df.with_columns(
    ((fac_var * pl.col("note_var_norm") + fac_dsr * pl.col("inv_dsr_norm") + fac_dsu * pl.col("dsu_norm"))/(fac_var+fac_dsr+fac_dsu)).alias("h")
)

# filter out unreleased movies?
# Not sure, since sometimes unreleased movies already have notes
#df = df.with_columns(
#    [pl.when(pl.col("days_since_release") < 0).then(0).otherwise(pl.col("h")).alias("h")])
# TODO: Check if having a lot of measures can artificially buff heuristic. Idea: Divide total note_change by the number of measures.
df.sort("h")

id,release_date,measures,last_update,note_change,days_since_release,days_since_update,note_variability,dsr_norm,dsu_norm,note_var_norm,inv_dsr_norm,h
str,date,list[f64],datetime[μs],list[f64],i64,i64,f64,f64,f64,f64,f64,f64
"""halloween""",1978-10-24,,,,16237,,0.0,0.532998,,0.0,0.467002,
"""walle""",,"[4.15, 4.15, ... 4.15]",2023-03-31 01:27:36.711676,[0.0],,8,0.0,,0.285714,0.0,,
"""tangled""",2010-11-24,,,,4518,,0.0,0.150687,,0.0,0.849313,
"""air""",2023-04-05,,,,3,,0.0,0.003393,,0.0,0.996607,
"""luca""",2021-06-17,,,,660,,0.0,0.024826,,0.0,0.975174,
"""requiem""",2006-03-02,,,,6246,,0.0,0.20706,,0.0,0.79294,
"""oppenheimer-20...",2023-07-19,,,,-101,,0.0,0.0,,0.0,1.0,
"""men""",2022-05-20,,,,323,,0.0,0.013832,,0.0,0.986168,
"""blonde""",2022-09-16,,,,204,,0.0,0.00995,,0.0,0.99005,
"""paprika""",1991-02-13,,,,11742,,0.0,0.386357,,0.0,0.613643,
