In [139]:
import polars as pl
from glob import glob
import json
from pathlib import Path
from datetime import datetime

In [140]:
movies = []

rating_history = 2
for graphdata in glob("../res/graph_data/*.json"):
    moviedata = {}
    graphdata = Path(graphdata)
    if " " in graphdata.stem:
        #print(f"skipping {movie.stem}")
        continue
    moviedata["id"] = graphdata.stem

    try:
        with open(graphdata) as f:
         measures = json.load(f)

         # ordered from earliest to latest
         last_measures = measures["data"][0]["y"][-5:]
         last_update = measures["data"][0]["x"][-1]
         moviedata["measures"] = last_measures
         moviedata["last_update"] = last_update
    except FileNotFoundError:
        pass

    # get release date
    try:
        with open(f"../res/movie_data/{graphdata.stem}.json") as f:
         movieinfo = json.load(f)
         moviedata["release_date"] = movieinfo["release_date"]
    except FileNotFoundError:
       print(f"no movie info for {graphdata.stem}")
    
    movies.append(moviedata)
df = pl.DataFrame(movies)
df = df.with_columns(pl.col("last_update").apply(lambda x: datetime.fromisoformat(x)))
df = df.with_columns(pl.col("release_date").str.strptime(pl.Date, "%Y-%m-%d", strict=False))
df
    # movie = Path(movie)
    # movie =pl.read_json(movie)

no movie info for swarm-2023-1
no movie info for wandavision
no movie info for daisy-jones-the-six
no movie info for mh370-the-plane-that-disappeared
no movie info for squid-game


id,measures,last_update,release_date
str,list[f64],datetime[μs],date
"""scream-3""","[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,2000-02-03
"""glass-onion""","[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,2022-11-23
"""x-men""","[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,2016-05-18
"""the-conjuring""","[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,2013-07-18
"""a-bugs-life""","[3.43, 3.43]",2023-04-08 12:30:57.123853,1998-11-25
"""drive-2011""","[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,2011-09-15
"""where-the-craw...","[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,2022-07-15
"""onward-2020""","[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,2020-02-29
"""fallen-angels""","[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,1995-09-06
"""gladiator-2000...","[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,2000-05-04


In [141]:

# compute derivative-oïde of last measures
# idea is to get a value representing the amount of changes 
# TODO: Find a way of managing when you have rathing history = 5 and nb measures = 2
# TODO: Change end score (note_var) to smth high so that we bump the movie to get the required minimum amount of history
def sum_changes(x):
    try:
        return abs(x[1] - x[0])
    except IndexError:
        return 0

#op = pl.element().rolling_apply(lambda x: abs(x[1] - x[0]), 2, min_periods=2).sum()
op = pl.element().rolling_apply(sum_changes, rating_history).sum()
df = df.with_columns(pl.col("measures").arr.eval(op, parallel=True).alias("note_change"))
df

id,measures,last_update,release_date,note_change
str,list[f64],datetime[μs],date,list[f64]
"""scream-3""","[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,2000-02-03,[0.01]
"""glass-onion""","[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,2022-11-23,[0.01]
"""x-men""","[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,2016-05-18,[0.0]
"""the-conjuring""","[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,2013-07-18,[0.0]
"""a-bugs-life""","[3.43, 3.43]",2023-04-08 12:30:57.123853,1998-11-25,[null]
"""drive-2011""","[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,2011-09-15,[0.0]
"""where-the-craw...","[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,2022-07-15,[0.0]
"""onward-2020""","[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,2020-02-29,[0.0]
"""fallen-angels""","[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,1995-09-06,[0.0]
"""gladiator-2000...","[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,2000-05-04,[0.0]


In [142]:
# compute days since release
# TODO: make this something so that most recent have a high value?
#       just keep release_date's UNIX timestamp?
now = datetime.now()
df = df.with_columns((now - pl.col("release_date")).dt.days().alias("days_since_release"))
df

id,measures,last_update,release_date,note_change,days_since_release
str,list[f64],datetime[μs],date,list[f64],i64
"""scream-3""","[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,2000-02-03,[0.01],8466
"""glass-onion""","[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,2022-11-23,[0.01],137
"""x-men""","[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,2016-05-18,[0.0],2517
"""the-conjuring""","[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,2013-07-18,[0.0],3552
"""a-bugs-life""","[3.43, 3.43]",2023-04-08 12:30:57.123853,1998-11-25,[null],8901
"""drive-2011""","[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,2011-09-15,[0.0],4224
"""where-the-craw...","[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,2022-07-15,[0.0],268
"""onward-2020""","[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,2020-02-29,[0.0],1135
"""fallen-angels""","[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,1995-09-06,[0.0],10077
"""gladiator-2000...","[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,2000-05-04,[0.0],8375


In [143]:
df = df.with_columns(((now - pl.col("last_update")).dt.days()).alias("days_since_update"))
df

id,measures,last_update,release_date,note_change,days_since_release,days_since_update
str,list[f64],datetime[μs],date,list[f64],i64,i64
"""scream-3""","[2.9, 2.9, ... 2.91]",2023-04-05 01:11:40.775121,2000-02-03,[0.01],8466,4
"""glass-onion""","[3.65, 3.64, ... 3.64]",2023-04-06 12:33:34.206172,2022-11-23,[0.01],137,2
"""x-men""","[3.32, 3.32, 3.32]",2023-04-05 01:11:37.598986,2016-05-18,[0.0],2517,4
"""the-conjuring""","[3.61, 3.61, ... 3.61]",2023-04-05 12:42:05.355808,2013-07-18,[0.0],3552,3
"""a-bugs-life""","[3.43, 3.43]",2023-04-08 12:30:57.123853,1998-11-25,[null],8901,0
"""drive-2011""","[3.95, 3.95, 3.95]",2023-04-05 12:42:06.188627,2011-09-15,[0.0],4224,3
"""where-the-craw...","[3.22, 3.22, ... 3.22]",2023-04-03 01:22:01.280908,2022-07-15,[0.0],268,6
"""onward-2020""","[3.48, 3.48, ... 3.48]",2023-04-02 09:21:47.662612,2020-02-29,[0.0],1135,7
"""fallen-angels""","[4.19, 4.19, ... 4.19]",2023-04-08 01:20:20.371029,1995-09-06,[0.0],10077,1
"""gladiator-2000...","[4.01, 4.01, ... 4.01]",2023-04-07 01:18:27.195159,2000-05-04,[0.0],8375,2


In [144]:
# try things with exp
df = df.with_columns([
    #(1/pl.col("days_since_release")).log().alias("A"), # days since release metric
    #(pl.col("days_since_update").exp().alias("B")), # days since update metric 
    (pl.col("note_change").arr.get(-1).fill_null(0).alias("note_variability"))
    ])
#df.with_columns((pl.col("days_since_release").log()).alias("A2"))



In [145]:
# try norm
# TODO: See EWWA (Exponentially Weighted Moving Average)

df = df.with_columns([
    ( (pl.col("days_since_release") - pl.col("days_since_release").min()) / (pl.col("days_since_release").max() - pl.col("days_since_release").min())).alias("dsr_norm"), # days since release metric
    ( (pl.col("days_since_update") - pl.col("days_since_update").min()) / (pl.col("days_since_update").max() - pl.col("days_since_update").min())).alias("dsu_norm"), # days since update metric
    ( (pl.col("note_variability") - pl.col("note_variability").min()) / (pl.col("note_variability").max() - pl.col("note_variability").min())).alias("note_var_norm"), # days since update metric
    #(pl.col("days_since_update").exp().alias("B")), # days since update metric 
    #(pl.col("note_change").arr.get(-1).alias("C"))
    ])
#df.with_columns((pl.col("days_since_release").log()).alias("A2"))

df = df.with_columns([
    (1 -pl.col("dsr_norm")).abs().alias("inv_dsr_norm")
])

df.sort("days_since_release")


id,measures,last_update,release_date,note_change,days_since_release,days_since_update,note_variability,dsr_norm,dsu_norm,note_var_norm,inv_dsr_norm
str,list[f64],datetime[μs],date,list[f64],i64,i64,f64,f64,f64,f64,f64
"""walle""","[4.15, 4.15, ... 4.15]",2023-03-31 01:27:36.711676,,[0.0],,9,0.0,,0.321429,0.0,
"""swarm-2023-1""","[3.89, 3.87]",2023-04-07 01:18:17.472405,,[null],,2,0.0,,0.071429,0.0,
"""old-2021""","[2.53, 2.53, 2.53]",2023-04-08 01:20:14.222453,,[0.0],,1,0.0,,0.035714,0.0,
"""wandavision""","[3.86, 3.85]",2023-04-06 01:21:46.056577,,[null],,3,0.0,,0.107143,0.0,
"""daisy-jones-th...","[4.42, 4.41, ... 4.27]",2023-04-07 12:32:54.541092,,[0.15],,1,0.15,,0.035714,0.789474,
"""mh370-the-plan...","[2.56, 2.56, 2.56]",2023-04-08 01:20:14.639374,,[0.0],,1,0.0,,0.035714,0.0,
"""squid-game""","[3.81, 3.81, ... 3.81]",2023-04-05 01:11:40.561044,,[0.0],,4,0.0,,0.142857,0.0,
"""past-lives""",[4.21],2023-03-19 19:34:16.982882,2023-06-02,[0.0],-53,20,0.0,0.0,0.714286,0.0,1.0
"""the-little-mer...","[3.67, 3.67, ... 3.67]",2023-04-03 12:34:20.420547,2023-05-24,[0.0],-44,5,0.0,0.000294,0.178571,0.0,0.999706
"""evil-dead-rise...","[3.87, 3.88]",2023-03-30 12:36:47.826482,2023-04-19,[null],-9,9,0.0,0.001438,0.321429,0.0,0.998562


In [146]:
fac_var = 2
fac_dsr = 1
fac_dsu = 2

# TODO: change denom. for null values. Ex: if note_var_norm == null, then only divide by fac_dsr + fac_dsu
df = df.with_columns(
    ((fac_var * pl.col("note_var_norm").fill_null(0) + fac_dsr * pl.col("inv_dsr_norm").fill_null(0) + fac_dsu * pl.col("dsu_norm").fill_null(0))/(fac_var+fac_dsr+fac_dsu)).alias("h")
)

# filter out unreleased movies?
# Not sure, since sometimes unreleased movies already have notes
#df = df.with_columns(
#    [pl.when(pl.col("days_since_release") < 0).then(0).otherwise(pl.col("h")).alias("h")])
# TODO: Check if having a lot of measures can artificially buff heuristic. Idea: Divide total note_change by the number of measures.
df.sort("h")

id,measures,last_update,release_date,note_change,days_since_release,days_since_update,note_variability,dsr_norm,dsu_norm,note_var_norm,inv_dsr_norm,h
str,list[f64],datetime[μs],date,list[f64],i64,i64,f64,f64,f64,f64,f64,f64
"""citizen-kane""","[4.16, 4.16, ... 4.16]",2023-04-08 12:30:48.132582,1941-04-17,[0.0],29942,0,0.0,0.980037,0.0,0.0,0.019963,0.003993
"""old-2021""","[2.53, 2.53, 2.53]",2023-04-08 01:20:14.222453,,[0.0],,1,0.0,,0.035714,0.0,,0.014286
"""mh370-the-plan...","[2.56, 2.56, 2.56]",2023-04-08 01:20:14.639374,,[0.0],,1,0.0,,0.035714,0.0,,0.014286
"""swarm-2023-1""","[3.89, 3.87]",2023-04-07 01:18:17.472405,,[null],,2,0.0,,0.071429,0.0,,0.028571
"""wandavision""","[3.86, 3.85]",2023-04-06 01:21:46.056577,,[null],,3,0.0,,0.107143,0.0,,0.042857
"""squid-game""","[3.81, 3.81, ... 3.81]",2023-04-05 01:11:40.561044,,[0.0],,4,0.0,,0.142857,0.0,,0.057143
"""its-a-wonderfu...","[4.3, 4.3, 4.3]",2023-04-04 12:33:57.875130,1946-12-20,[0.0],27869,4,0.0,0.912305,0.142857,0.0,0.087695,0.074682
"""persona""","[4.36, 4.36, 4.36]",2023-04-08 01:20:10.587273,1966-10-18,[0.0],20627,1,0.0,0.675685,0.035714,0.0,0.324315,0.079149
"""rosemarys-baby...","[4.15, 4.15, ... 4.15]",2023-04-07 12:32:59.803494,1968-06-12,[0.0],20024,1,0.0,0.655982,0.035714,0.0,0.344018,0.083089
"""the-wizard-of-...","[3.95, 3.95, 3.95]",2023-04-03 01:21:50.586591,1939-08-15,[0.0],30553,6,0.0,1.0,0.214286,0.0,0.0,0.085714


In [147]:
import plotly.express as px
import plotly.figure_factory as ff
# only get movieid and heuristic
result = df.select(pl.col("h"))
fig = px.histogram(result.to_pandas())
print(result.describe())
fig.show()



shape: (7, 2)
┌────────────┬──────────┐
│ describe   ┆ h        │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 818.0    │
│ null_count ┆ 0.0      │
│ mean       ┆ 0.266485 │
│ std        ┆ 0.096044 │
│ min        ┆ 0.003993 │
│ max        ┆ 0.728016 │
│ median     ┆ 0.243515 │
└────────────┴──────────┘


In [148]:
df

result = df.select([pl.col("id"), pl.col("h")])
result = result.with_columns((pl.col("h") / pl.col("h").sum()).alias("h_prob"))
dicted = result.to_dict()

# transform h to probability
from numpy.random import choice

to_update = list(choice(dicted["id"], p=dicted["h_prob"], size=100, replace=False))
result.filter(pl.col("id").is_in(to_update)).sort("h_prob")
# for mid in to_update:
#     idx = dicted["id"].index(mid)

#     print(f"{mid}: {dicted['h'][idx]}")

id,h,h_prob
str,f64,f64
"""the-sound-of-m...",0.09649,0.000443
"""evil-dead-ii""",0.127839,0.000586
"""casablanca""",0.150692,0.000691
"""hercules-1997""",0.152357,0.000699
"""batman-begins""",0.1571,0.000721
"""corpse-bride""",0.172,0.000789
"""magnolia""",0.172543,0.000792
"""the-usual-susp...",0.176341,0.000809
"""terminator-2-j...",0.180975,0.00083
"""kill-bill-vol-...",0.181692,0.000834


In [149]:
df.filter(pl.col("id").str.contains("mario"))

id,measures,last_update,release_date,note_change,days_since_release,days_since_update,note_variability,dsr_norm,dsu_norm,note_var_norm,inv_dsr_norm,h
str,list[f64],datetime[μs],date,list[f64],i64,i64,f64,f64,f64,f64,f64,f64
"""super-mario-br...","[2.01, 2.01, ... 2.01]",2023-03-29 01:41:36.137643,2023-04-05,[0.0],4,11,0.0,0.001862,0.392857,0.0,0.998138,0.35677
"""the-super-mari...",[3.56],2023-04-07 12:32:49.947955,2023-04-05,[0.0],4,1,0.0,0.001862,0.035714,0.0,0.998138,0.213913
