In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import sys
sys.path.insert(0, '../data_prep')
from sqlite_utils import select_statement_to_df, df_to_table, table_to_df
import shap
from tqdm import tqdm

In [17]:
all_features_query = """

WITH BASE_TABLE AS (
    
      SELECT
      
        a.FILM_ID
        ,c.FILM_TITLE
        ,d.FILM_GENRE
        ,e.FILM_RATING
        ,f.FILM_RATING_SCALED
        ,CASE WHEN f.FILM_RATING_SCALED IS NOT NULL THEN 1 ELSE 0 END AS RATED
      
      FROM ALL_FILMS a
      LEFT JOIN CONTENT_TYPE b
      ON a.FILM_ID = b.FILM_ID
      LEFT JOIN FILM_TITLE c
      ON a.FILM_ID = c.FILM_ID
      LEFT JOIN FILM_GENRE d
      ON a.FILM_ID = d.FILM_ID
      LEFT JOIN FILM_LETTERBOXD_STATS e
      ON a.FILM_ID = e.FILM_ID
      LEFT JOIN PERSONAL_RATING f
      ON a.FILM_ID = f.FILM_ID
      
      WHERE CONTENT_TYPE = 'movie'
      
    )
   
    SELECT

      a.FILM_ID
      ,b.FILM_TITLE
      ,c.FILM_WATCH_COUNT
      ,g.TOP_250_POSITION AS FILM_TOP_250
      ,c.FILM_RATING
      ,COALESCE(1.0*c.FILM_LIKES_COUNT/c.FILM_WATCH_COUNT, 0.0) AS LIKES_PER_WATCH
      ,COALESCE(1.0*c.FILM_FAN_COUNT/c.FILM_WATCH_COUNT, 0.0) AS FANS_PER_WATCH
      ,d.FILM_RUNTIME
      ,f.FILM_YEAR
      ,e.ALL_FILM_GENRES
    
    FROM ALL_FEATURE_FILMS a
    LEFT JOIN FILM_TITLE b
    ON a.FILM_ID = b.FILM_ID
    LEFT JOIN FILM_LETTERBOXD_STATS c
    ON a.FILM_ID = c.FILM_ID
    LEFT JOIN FILM_RUNTIME d
    ON a.FILM_ID = d.FILM_ID
    LEFT JOIN FILM_GENRE e
    ON a.FILM_ID = e.FILM_ID
    LEFT JOIN FILM_YEAR f
    ON a.FILM_ID = f.FILM_ID
    LEFT JOIN FILM_LETTERBOXD_TOP_250 g
    ON a.FILM_ID = g.FILM_ID

"""

keyword_query = """

WITH BASE_TABLE AS (
    
    SELECT
    
        a.FILM_ID
        ,c.FILM_TITLE
        ,d.KEYWORD
        ,d.KEYWORD_ID
        ,e.FILM_RATING
        ,f.FILM_RATING_SCALED
        ,CASE WHEN f.FILM_RATING_SCALED IS NOT NULL THEN 1 ELSE 0 END AS RATED
    
    FROM ALL_FEATURE_FILMS a
    LEFT JOIN CONTENT_TYPE b
    ON a.FILM_ID = b.FILM_ID
    LEFT JOIN FILM_TITLE c
    ON a.FILM_ID = c.FILM_ID
    LEFT JOIN FILM_KEYWORDS d
    ON a.FILM_ID = d.FILM_ID
    LEFT JOIN FILM_LETTERBOXD_STATS e
    ON a.FILM_ID = e.FILM_ID
    LEFT JOIN PERSONAL_RATING f
    ON a.FILM_ID = f.FILM_ID
    
    WHERE b.CONTENT_TYPE = 'movie'
    
    )
    
, SCORE_TABLE AS (

    SELECT

    KEYWORD_ID
    ,KEYWORD
    ,AVG(FILM_RATING) AS MEAN_RATING
    ,AVG(FILM_RATING_SCALED) AS MY_MEAN_RATING
    ,AVG(FILM_RATING_SCALED) - AVG(FILM_RATING) AS MY_VARIANCE
    ,((AVG(FILM_RATING_SCALED) - AVG(FILM_RATING)) * ((SUM(RATED)+0.0)/COUNT(*))) AS VARIANCE_SCORE
    ,COUNT(*) AS KEYWORD_COUNT
    ,SUM(RATED) AS MY_RATING_COUNT
    ,(SUM(RATED)+0.0)/COUNT(*) AS SCALER
    
    FROM BASE_TABLE
    
    GROUP BY KEYWORD
    
    HAVING KEYWORD_COUNT >= 30
    AND SCALER >= 0.2
    AND MY_RATING_COUNT >= 3
    --ORDER BY MEAN_RATING DESC
    --ORDER BY KEYWORD_COUNT DESC
    --ORDER BY MY_VARIANCE DESC
    --ORDER BY VARIANCE_SCORE DESC
    --ORDER BY MY_MEAN_RATING DESC
    --ORDER BY VARIANCE_SCORE DESC
)

SELECT
    a.FILM_ID
    ,a.KEYWORD_ID
    ,b.KEYWORD
    
FROM FILM_KEYWORDS a
LEFT JOIN SCORE_TABLE b
ON a.KEYWORD_ID = b.KEYWORD_ID

WHERE b.KEYWORD_ID IS NOT NULL

"""

my_rating_query = """

    SELECT
         FILM_ID
        ,FILM_RATING_SCALED
    FROM PERSONAL_RATING

"""

director_rating_query = """

WITH BASE_TABLE AS (

    SELECT

        a.FILM_ID
        ,d.FILM_TITLE
        ,b.PERSON_ID
        ,e.PERSON_NAME AS DIRECTOR_NAME
        ,CASE WHEN c.FILM_ID IS NULL THEN 0 ELSE 1 END AS WATCHED
        ,f.FILM_RATING_SCALED
        ,CASE WHEN f.FILM_RATING_SCALED IS NULL THEN 0 ELSE 1 END AS RATED

    FROM ALL_FEATURE_FILMS a

    LEFT JOIN FILM_CREW b
    ON a.FILM_ID = b.FILM_ID

    LEFT JOIN WATCHED c
    ON a.FILM_ID = c.FILM_ID

    LEFT JOIN FILM_TITLE d
    ON a.FILM_ID = d.FILM_ID

    LEFT JOIN PERSON_INFO e
    ON b.PERSON_ID = e.PERSON_ID

    LEFT JOIN PERSONAL_RATING f
    ON a.FILM_ID = f.FILM_ID

    WHERE b.JOB = 'Director'
        
    )
      
, DIRECTOR_RATINGS AS (

	SELECT
      
       PERSON_ID
	  ,DIRECTOR_NAME
      ,COUNT(*) AS TOTAL_FILMS
      ,SUM(WATCHED) AS FILMS_WATCHED
      ,AVG(WATCHED) AS PERCENT_WATCHED
      ,AVG(FILM_RATING_SCALED) AS MEAN_RATING
      ,SUM(RATED) AS FILMS_RATED
      ,AVG(RATED) AS PERCENT_RATED
      
    FROM BASE_TABLE
    
    GROUP BY PERSON_ID, DIRECTOR_NAME

    HAVING TOTAL_FILMS >= 3
    AND FILMS_WATCHED > 1
    AND FILMS_RATED > 1
    AND MEAN_RATING NOT NULL
    AND PERCENT_RATED >= .2
    
 	)
, DIRECTOR_WATCH_STATS AS (

	SELECT
      
       PERSON_ID
	  ,DIRECTOR_NAME
      ,COUNT(*) AS TOTAL_FILMS
      ,AVG(WATCHED) AS PERCENT_WATCHED
      
    FROM BASE_TABLE
    
    GROUP BY PERSON_ID, DIRECTOR_NAME
    
 	)

, MEAN_RATING AS ( SELECT AVG(MEAN_RATING)AS MEAN_TOTAL_RATING FROM DIRECTOR_RATINGS )

, FILM_DIRECTOR_LEVEL AS (

	SELECT
		
		 a.FILM_ID
		,a.FILM_TITLE
		,a.PERSON_ID
		,a.DIRECTOR_NAME
		,COALESCE(b.MEAN_RATING, (SELECT 0.8*MEAN_TOTAL_RATING FROM MEAN_RATING)) AS DIRECTOR_MEAN_RATING
		,COALESCE(c.TOTAL_FILMS, 0) AS DIRECTOR_TOTAL_FILMS
		,COALESCE(c.PERCENT_WATCHED, 0) AS DIRECTOR_PERCENT_WATCHED
		
	 FROM BASE_TABLE a
	 LEFT JOIN DIRECTOR_RATINGS b 
	 ON a.PERSON_ID = b.PERSON_ID
	 LEFT JOIN DIRECTOR_WATCH_STATS c
	 ON a.PERSON_ID = c.PERSON_ID
	 
	 )
	 
SELECT
	
	 FILM_ID
	,AVG(DIRECTOR_MEAN_RATING) AS DIRECTOR_MEAN_RATING
	,AVG(DIRECTOR_TOTAL_FILMS) AS DIRECTOR_TOTAL_FILMS
	,AVG(DIRECTOR_PERCENT_WATCHED) AS DIRECTOR_PERCENT_WATCHED

FROM FILM_DIRECTOR_LEVEL

GROUP BY FILM_ID, FILM_TITLE

"""

top_actor_film_level_query = """

WITH BASE_TABLE AS (

    SELECT

        a.FILM_ID
        ,d.FILM_TITLE
        ,b.PERSON_ID
        ,e.PERSON_NAME AS ACTOR_NAME
        ,CASE WHEN c.FILM_ID IS NULL THEN 0 ELSE 1 END AS WATCHED
        ,CASE WHEN f.FILM_RATING_SCALED IS NOT NULL THEN 1 ELSE 0 END AS RATED
        ,f.FILM_RATING_SCALED

    FROM ALL_FEATURE_FILMS a
    
    LEFT JOIN FILM_CAST b
    ON a.FILM_ID = b.FILM_ID
    
    LEFT JOIN WATCHED c
    ON a.FILM_ID = c.FILM_ID
    
    LEFT JOIN FILM_TITLE d
    ON a.FILM_ID = d.FILM_ID

    LEFT JOIN PERSON_INFO e
    ON b.PERSON_ID = e.PERSON_ID

    LEFT JOIN PERSONAL_RATING f
    ON a.FILM_ID = f.FILM_ID

    WHERE e.PERSON_NAME IS NOT NULL
    
    )
    
, ACTOR_TABLE AS (

    SELECT

    PERSON_ID
    ,SUM(WATCHED) AS TOTAL_WATCHED
    
    FROM BASE_TABLE
    
    GROUP BY PERSON_ID
    
    HAVING TOTAL_WATCHED >= 20
)

SELECT
    
     a.FILM_ID
    ,a.PERSON_ID
    ,a.ACTOR_NAME
    ,1 AS ACTOR_IN_FILM
    
FROM BASE_TABLE a
INNER JOIN ACTOR_TABLE b
ON a.PERSON_ID = b.PERSON_ID

"""

def scale_col(df, column, suffix='', a=0, b=1):
    col_min = df[column].min()
    col_max = df[column].max()
    col_range = (col_max - col_min)
    df[column+suffix] = ((df[column] - col_min) / col_range) * (b - a) + a
    return df

def rank_series(series, ascending=False):
    sorted_series = series.sort_values(ascending=ascending)
    ranks = sorted_series.rank(method='dense', ascending=ascending)
    return ranks

In [4]:
def get_valid_cols(film_id, shap_df, min_shap_val=0.001):
    filmid_shap_df = shap_df[shap_df['FILM_ID']==film_id].reset_index(drop=True)
    valid_cols = []
    for col in filmid_shap_df.columns:
        shap_val = filmid_shap_df[col][0]
        if isinstance(shap_val, str):
            col_valid = False
        elif np.isnan(shap_val):
            col_valid = False
        elif abs(shap_val) < min_shap_val:
            col_valid = False
        elif col == 'BASE_VALUE':
            col_valid = True
        else:
            col_valid = True
        if col_valid:
            valid_cols.append(col)
    return valid_cols

def create_dual_df(film_id, pred_df, shap_df, valid_cols):
    film_title = pred_df[pred_df['FILM_ID']==film_id]['FILM_TITLE'].values[0]
    dual_df = pd.concat([pred_df[pred_df['FILM_ID']==film_id], shap_df[shap_df['FILM_ID']==film_id]])
    dual_df['FILM_TITLE'] = dual_df['FILM_TITLE'].fillna(film_title)
    dual_df['ALGO_SCORE'] = dual_df['ALGO_SCORE'].fillna(dual_df['ALGO_SCORE'].max())
    dual_df = dual_df[valid_cols]
    dual_df.insert(2, 'INFO', ['FEATURE_VALUE', 'SHAP_VALUE'])
    dual_df = dual_df.reset_index(drop=True)
    return dual_df

def return_comparison_df(film_ids, min_shap_val=0.001, decimal_places=3):
    pred_df = table_to_df(table_name='FILM_ALGO_SCORE')
    shap_df = table_to_df(table_name='FILM_SHAP_VALUES')
    valid_cols = [get_valid_cols(x, shap_df, min_shap_val=min_shap_val) for x in film_ids]
    valid_cols = list(set([col for valid_col_list in valid_cols for col in valid_col_list]))
    valid_cols = ['FILM_ID', 'FILM_TITLE', 'ALGO_SCORE'] + valid_cols
    valid_cols = [x for x in pred_df.columns if x in valid_cols]
    valid_cols.append('BASE_VALUE')
    all_dfs = []
    for n, film_id in enumerate(film_ids):
        dual_df = create_dual_df(film_id, pred_df, shap_df, valid_cols)
        melted_df = pd.melt(dual_df, id_vars=['FILM_ID', 'FILM_TITLE', 'INFO'])
        pivoted_df = melted_df.drop('FILM_ID', axis=1).pivot(index='variable', columns=['FILM_TITLE', 'INFO'], values='value').reset_index()
        pivoted_df.columns = [' '.join(col) for col in pivoted_df.columns]
        if n > 0:
            pivoted_df = pivoted_df.drop('variable ', axis=1)
        all_dfs.append(pivoted_df)
    comparison_df = pd.concat(all_dfs, axis=1)
    if len(film_ids) > 1:
        comparison_df['VAR'] = comparison_df[comparison_df.columns[4]] - comparison_df[comparison_df.columns[2]]
        comparison_df['ABS_VAR'] = comparison_df['VAR'].abs()
        comparison_df = comparison_df.sort_values('ABS_VAR', ascending=False)
    else:
        comparison_df = comparison_df.sort_values(comparison_df.columns[2], ascending=False)
    return comparison_df.round(decimal_places)

In [5]:
eligible_watchlist_df = select_statement_to_df(all_features_query)
director_rating_df = select_statement_to_df(director_rating_query)
eligible_watchlist_df = eligible_watchlist_df.merge(director_rating_df, how='left', on='FILM_ID')
eligible_watchlist_df = pd.concat([eligible_watchlist_df, eligible_watchlist_df['ALL_FILM_GENRES'].str.get_dummies(sep='/')], axis=1).drop('ALL_FILM_GENRES', axis=1)
keyword_df = select_statement_to_df(keyword_query)
keyword_df['COUNT'] = 1
keyword_df_wide = pd.pivot_table(keyword_df, values='COUNT', index=['FILM_ID'], columns=['KEYWORD']).fillna(0).reset_index()
eligible_watchlist_df = eligible_watchlist_df.merge(keyword_df_wide, how='left', on='FILM_ID')
top_actor_film_level_df = select_statement_to_df(top_actor_film_level_query)
actor_lookup_df = top_actor_film_level_df.groupby(['PERSON_ID', 'ACTOR_NAME']).count().reset_index()
actor_lookup_dict = {id:name for id, name in zip(actor_lookup_df['PERSON_ID'], actor_lookup_df['ACTOR_NAME'])}
top_actor_film_level_df_wide = pd.pivot_table(top_actor_film_level_df, values='ACTOR_IN_FILM', index=['FILM_ID'], columns='PERSON_ID').fillna(0)
top_actor_film_level_df_wide.columns = [actor_lookup_dict.get(x, x) for x in top_actor_film_level_df_wide.columns]
eligible_watchlist_df['FILM_TOP_250'] = eligible_watchlist_df['FILM_TOP_250'].fillna(266)
eligible_watchlist_df.insert(4, 'FILM_IS_TOP_250', np.where(eligible_watchlist_df['FILM_TOP_250']<=250, 1, 0))
eligible_watchlist_df = eligible_watchlist_df.merge(top_actor_film_level_df_wide, how='left', on='FILM_ID').fillna(0)
eligible_watchlist_df['FILM_RATING'] = eligible_watchlist_df['FILM_RATING'].fillna(2.0)
eligible_watchlist_df = eligible_watchlist_df.fillna(0)
my_rating_df = select_statement_to_df(my_rating_query)
rating_features_df = eligible_watchlist_df.merge(my_rating_df, how='left', on='FILM_ID')
rating_features_df['I_VS_LB'] = rating_features_df['FILM_RATING_SCALED'] - rating_features_df['FILM_RATING']
rated_features = rating_features_df[rating_features_df['FILM_RATING_SCALED'].notnull()].reset_index(drop=True)
unrated_features = rating_features_df[rating_features_df['FILM_RATING_SCALED'].isnull()].reset_index(drop=True)
rating_features_df.head()

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_WATCH_COUNT,FILM_TOP_250,FILM_IS_TOP_250,FILM_RATING,LIKES_PER_WATCH,FANS_PER_WATCH,FILM_RUNTIME,FILM_YEAR,...,Idris Elba,Jack Angel,Alan Tudyk,Thomas Rosales Jr.,Fred Tatasciore,Bob Bergen,Mickie McGowan,Sherry Lynn,FILM_RATING_SCALED,I_VS_LB
0,f_01ZLI,Jumanji,939872,266.0,0,3.6,0.183011,0.001108,104,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.577434,-1.022566
1,f_01UTC,Fatal Attraction,99624,266.0,0,3.45,0.161015,0.001194,119,1987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.845745,0.395745
2,f_020E4,Punch-Drunk Love,508916,266.0,0,4.0,0.323529,0.016649,96,2002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.067814,-0.932186
3,f_01TBi,Altered States,63680,266.0,0,3.65,0.268844,0.004507,102,1980,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.101064,0.451064
4,f_0t4EQ,Death to 2020,82977,266.0,0,2.7,0.110705,2.4e-05,71,2020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [46]:
try:
    anomaly_df = pd.read_csv('anomaly_df.csv')
except:
    all_rows = []
    for n in tqdm(range(len(rated_features))):
        model_type = 'linear_regression'
        dropped_n = rated_features.iloc[n]
        rated_features_dropped_n = rated_features.drop(n)

        non_features = ['FILM_ID',
                        'FILM_TITLE',
                        'FILM_RUNTIME',
                        'FILM_RATING_SCALED',
                        'I_VS_LB',
                        ]
        model_features = [x for x in unrated_features.columns if x not in non_features]
        delete_cols = []
        for col in model_features:
            if len(rated_features_dropped_n[col].unique()) == 2:
                col_sum = rated_features_dropped_n[col].sum()
                if col_sum < 5:
                    delete_cols.append(col)
        model_features = [x for x in model_features if x not in delete_cols]
        target = ['FILM_RATING_SCALED']
        X_train = rated_features_dropped_n[model_features]
        y_train = rated_features_dropped_n[target]
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        if model_type == 'xgboost':
            model = XGBRegressor()
        elif model_type == 'decision_tree':
            model = DecisionTreeRegressor(min_samples_leaf=3)
        elif model_type == 'linear_regression':
            model = LinearRegression()
        model.fit(X_train, y_train)
        X_pred = pd.DataFrame(rated_features.iloc[n]).T[model_features]
        X_pred = scaler.transform(X_pred)
        pred_df = pd.DataFrame(rated_features.iloc[n]).T
        pred_df['ALGO_SCORE'] = model.predict(X_pred)[0][0]
        row = pred_df[['FILM_ID', 'FILM_TITLE', 'FILM_RATING_SCALED', 'ALGO_SCORE']].values[0]
        all_rows.append(row)
    anomaly_df = pd.DataFrame(all_rows, columns=['FILM_ID', 'FILM_TITLE', 'FILM_RATING_SCALED', 'ALGO_SCORE'])
    anomaly_df.to_csv('anomaly_df.csv', index=False)
anomaly_df

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_RATING_SCALED,ALGO_SCORE
0,f_01ZLI,Jumanji,2.577434,3.498500
1,f_01UTC,Fatal Attraction,3.845745,3.163540
2,f_020E4,Punch-Drunk Love,3.067814,3.741294
3,f_01TBi,Altered States,4.101064,3.966784
4,f_0099A,The Skin I Live In,2.944332,3.948160
...,...,...,...,...
1119,f_02b9Q,Armageddon,2.206140,3.458799
1120,f_0Bz3C,All of Us Strangers,3.122470,3.258082
1121,f_01TPk,The Amityville Horror,3.483010,3.093015
1122,f_00rwW,Tammy and the T-Rex,3.205466,2.934742


In [47]:
anomaly_df = scale_col(anomaly_df, 'ALGO_SCORE', suffix='_SCALED', b=5)
anomaly_df['RATING_VAR'] = anomaly_df['FILM_RATING_SCALED'] - anomaly_df['ALGO_SCORE_SCALED']
anomaly_df['RATING_ABS_VAR'] = abs(anomaly_df['RATING_VAR'])
anomaly_df['FILM_RANK'] = rank_series(anomaly_df['FILM_RATING_SCALED'])
anomaly_df['FILM_RANK_PRED'] = rank_series(anomaly_df['ALGO_SCORE_SCALED'])
anomaly_df['RANK_VAR'] = anomaly_df['FILM_RANK'] - anomaly_df['FILM_RANK_PRED']
anomaly_df['RANK_ABS_VAR'] = abs(anomaly_df['RANK_VAR'])
anomaly_df.sort_values('RANK_ABS_VAR', ascending=False)

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_RATING_SCALED,ALGO_SCORE,ALGO_SCORE_SCALED,RATING_VAR,RATING_ABS_VAR,FILM_RANK,FILM_RANK_PRED,RANK_VAR,RANK_ABS_VAR
1072,f_01woA,Night of the Demon,1.837719,4.072314,3.307129,-1.469409,1.469409,1051.0,147.0,904.0,904.0
10,f_0fBkw,The Nun,4.047872,2.442429,1.968900,2.078972,2.078972,227.0,1013.0,-786.0,786.0
692,f_0iAkw,Boss Level,2.074561,3.840158,3.116515,-1.041954,1.041954,1024.0,241.0,783.0,783.0
30,f_01Wpu,Drop Dead Fred,3.885638,2.395267,1.930177,1.955461,1.955461,288.0,1027.0,-739.0,739.0
407,f_01TAO,Rushmore,2.782389,4.029021,3.271583,-0.489194,0.489194,875.0,158.0,717.0,717.0
...,...,...,...,...,...,...,...,...,...,...,...
151,f_00GZY,Gravity,4.655941,4.432706,3.603031,1.052910,1.052910,69.0,69.0,0.0,0.0
60,f_02awY,Alien,4.969388,5.273851,4.293659,0.675729,0.675729,7.0,7.0,0.0,0.0
62,f_0072s,Star Wars,4.984694,5.426210,4.418754,0.565940,0.565940,4.0,4.0,0.0,0.0
725,f_0m83K,Countdown,1.250000,2.050219,1.646873,-0.396873,0.396873,1092.0,1092.0,0.0,0.0


In [33]:
anomaly_df.sort_values('FILM_RANK_PRED', ascending=True).head(50)#.tail(50)

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_RATING_SCALED,ALGO_SCORE,ALGO_SCORE_SCALED,RATING_VAR,RATING_ABS_VAR,FILM_RANK,FILM_RANK_PRED,RANK_VAR,RANK_ABS_VAR
97,f_04VZ8,Interstellar,3.706311,6.134134,5.0,-1.293689,1.293689,366.0,1.0,365.0,365.0
68,f_029qA,Toy Story,4.537129,5.668133,4.617387,-0.080258,0.080258,93.0,2.0,91.0,91.0
510,f_0fA7G,Dune,4.730198,5.437898,4.428351,0.301847,0.301847,54.0,3.0,51.0,51.0
62,f_0072s,Star Wars,4.984694,5.42621,4.418754,0.56594,0.56594,4.0,4.0,0.0,0.0
51,f_0hTha,Parasite,4.877551,5.405945,4.402115,0.475436,0.475436,25.0,5.0,20.0,20.0
37,f_01JzG,Inglourious Basterds,4.852041,5.360117,4.364488,0.487553,0.487553,30.0,6.0,24.0,24.0
60,f_02awY,Alien,4.969388,5.273851,4.293659,0.675729,0.675729,7.0,7.0,0.0,0.0
112,f_0aPvy,Rogue One: A Star Wars Story,4.892857,5.22888,4.256735,0.636122,0.636122,22.0,8.0,14.0,14.0
107,f_07Piy,Logan,4.517327,5.184029,4.21991,0.297417,0.297417,97.0,9.0,88.0,88.0
66,f_01Y2i,Children of Men,4.685644,5.178983,4.215767,0.469877,0.469877,63.0,10.0,53.0,53.0


In [24]:
anomaly_df

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_RATING_SCALED,ALGO_SCORE,ALGO_SCORE_SCALED,RATING_VAR,RATING_ABS_VAR,FILM_RANK,FILM_RANK_PRED,RANK_VAR,RANK_ABS_VAR
0,f_01ZLI,Jumanji,2.577434,3.498500,2.835995,-0.258561,0.258561,930.0,448.0,482.0,482.0
1,f_01UTC,Fatal Attraction,3.845745,3.163540,2.560973,1.284771,1.284771,303.0,651.0,-348.0,348.0
2,f_020E4,Punch-Drunk Love,3.067814,3.741294,3.035342,0.032471,0.032471,737.0,294.0,443.0,443.0
3,f_01TBi,Altered States,4.101064,3.966784,3.220482,0.880581,0.880581,207.0,190.0,17.0,17.0
4,f_0099A,The Skin I Live In,2.944332,3.948160,3.205191,-0.260859,0.260859,795.0,197.0,598.0,598.0
...,...,...,...,...,...,...,...,...,...,...,...
1119,f_02b9Q,Armageddon,2.206140,3.458799,2.803397,-0.597257,0.597257,1009.0,468.0,541.0,541.0
1120,f_0Bz3C,All of Us Strangers,3.122470,3.258082,2.638597,0.483872,0.483872,710.0,589.0,121.0,121.0
1121,f_01TPk,The Amityville Horror,3.483010,3.093015,2.503068,0.979942,0.979942,503.0,702.0,-199.0,199.0
1122,f_00rwW,Tammy and the T-Rex,3.205466,2.934742,2.373117,0.832349,0.832349,669.0,801.0,-132.0,132.0


In [11]:
pred_df.sort_values(target[0]+'_PRED', ascending=False).head(50)

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_WATCH_COUNT,FILM_TOP_250,FILM_IS_TOP_250,FILM_RATING,LIKES_PER_WATCH,FANS_PER_WATCH,FILM_RUNTIME,FILM_YEAR,...,Alan Tudyk,Thomas Rosales Jr.,Fred Tatasciore,Bob Bergen,Mickie McGowan,Sherry Lynn,FILM_RATING_SCALED,I_VS_LB,I_VS_LB_PRED,FILM_RATING_PRED
211,f_0d6bk,High Life,163609,266.0,0,3.26,0.193339,0.001064,113,2018,...,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0,4.778681
245,f_049UY,Alien: Covenant,384797,266.0,0,2.94,0.14996,0.000554,122,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.997595,4.536975
442,f_04kC2,Carrie,261458,266.0,0,2.48,0.087613,0.000792,100,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.946338,4.112287
386,f_08sXg,Unfriended,280087,266.0,0,2.29,0.091015,0.000471,82,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.933795,3.950815
1535,f_01zCA,The Haunted World of El Superbeasto,9092,266.0,0,2.57,0.141993,0.00088,77,2009,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.923566,4.14238
1178,f_029C2,The Fog,20689,266.0,0,1.64,0.031659,4.8e-05,100,2005,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.915747,3.43858
3656,f_00U5q,Piranha 3D,107964,266.0,0,2.35,0.09556,0.000232,88,2010,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.898659,3.938635
1777,f_0bO28,Blair Witch,137945,266.0,0,2.35,0.093806,0.000536,89,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.890149,3.924885
248,f_0biYY,Valerian and the City of a Thousand Planets,239616,266.0,0,2.62,0.117576,0.000964,136,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.885117,4.117415
348,f_04YaC,Tomorrowland,250288,266.0,0,2.63,0.086628,0.000775,130,2015,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.869888,4.100239


In [None]:
final_df = pd.concat([pred_df, rated_features], axis=0).reset_index(drop=True)
print('Calculating SHAP values...')
if model_type == 'xgboost' or model_type == 'decision_tree':
    explainer = shap.TreeExplainer(model, X_train)
elif model_type == 'linear_regression':
    explainer = shap.LinearExplainer(model, X_train)
# shap_values = explainer.Explainer(X_pred)
shap_values = explainer.shap_values(X_pred)
explainer_df = pd.DataFrame(shap_values, columns=model_features)
explainer_df.insert(0, 'FILM_ID', pred_df['FILM_ID'])

In [303]:
tmp_film_id = 'f_047OE'
tmp = explainer_df[explainer_df['FILM_ID']==tmp_film_id].drop('FILM_ID', axis=1)
tmp2 = tmp.loc[:, (abs(tmp) > 0.01).any(axis=0)].T.reset_index()
tmp2.columns = ['col_name', 'col_shap']
tmp3 = unrated_features[unrated_features['FILM_ID']==tmp_film_id].T.reset_index()
tmp3.columns = ['col_name', 'col_val']
tmp4 = tmp3.merge(tmp2, how='inner', on='col_name')
tmp4.sort_values('col_shap')

Unnamed: 0,col_name,col_val,col_shap
9,horror,0.0,-0.013056
12,car race,0.0,0.011217
8,fantasy,0.0,0.015199
1,FILM_RATING,3.03,0.021662
11,thriller,0.0,0.028075
6,action,1.0,0.02945
2,FANS_PER_WATCH,0.00108,0.038882
7,comedy,0.0,0.050205
5,DIRECTOR_PERCENT_WATCHED,1.0,0.062191
10,science-fiction,1.0,0.118057


In [252]:
pred_df.sort_values('FILM_RATING_PRED')

Unnamed: 0,FILM_ID,FILM_TITLE,FILM_WATCH_COUNT,FILM_TOP_250,FILM_IS_TOP_250,FILM_RATING,LIKES_PER_WATCH,FANS_PER_WATCH,FILM_RUNTIME,FILM_YEAR,...,Jack Angel,Alan Tudyk,Thomas Rosales Jr.,Bob Bergen,Mickie McGowan,Sherry Lynn,FILM_RATING_SCALED,I_VS_LB,I_VS_LB_PRED,FILM_RATING_PRED
6651,f_04rmU,Sunday Lovers,118,266.0,0,0.00,0.084746,0.000000,125,1980,...,0.0,0.0,0.0,0.0,0.0,0.0,,,-1.795272,0.000000
11764,f_00M24,The Treatment,185,266.0,0,0.00,0.070270,0.000000,86,2006,...,0.0,0.0,0.0,0.0,0.0,0.0,,,-1.785493,0.001418
13428,f_00LvO,A Further Gesture,84,266.0,0,0.00,0.071429,0.000000,96,1997,...,0.0,0.0,0.0,0.0,0.0,0.0,,,-1.737286,0.008407
14026,f_04hfY,Two Tickets to Broadway,246,266.0,0,0.00,0.105691,0.000000,106,1951,...,0.0,0.0,0.0,0.0,0.0,0.0,,,-1.731155,0.009296
12230,f_0hhJq,The Brawler,178,266.0,0,0.00,0.073034,0.000000,95,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,,,-1.588463,0.029983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,f_01skk,Inception,3060548,266.0,0,4.19,0.404516,0.017288,148,2010,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.767369,0.979001
27,f_02aGY,Terminator 2: Judgment Day,860837,266.0,0,4.28,0.306057,0.012814,137,1991,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.732608,0.987010
117,f_0b8wK,Blade Runner 2049,1593560,266.0,0,4.13,0.378853,0.025117,164,2017,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.910138,0.991001
104,f_04VZ8,Interstellar,3036075,266.0,0,4.32,0.452177,0.052233,169,2014,...,0.0,0.0,0.0,0.0,0.0,0.0,,,0.771972,0.998516


In [None]:
explainer_df

In [None]:

# import ipdb; ipdb.set_trace()
try:
    ex = explainer.expected_value[0]
except:
    ex = explainer.expected_value
explainer_df.insert(1, 'BASE_VALUE', ex)
explainer_df['PREDICTION'] = explainer_df.sum(axis=1)
explainer_df = explainer_df.merge(pred_df[['FILM_ID', 'ALGO_SCORE']], how='left', on='FILM_ID')
explainer_df['SCALER'] = explainer_df['ALGO_SCORE'] / explainer_df['PREDICTION']
explainer_df = explainer_df.drop('FILM_ID', axis=1).mul(explainer_df['SCALER'], axis=0).drop(['ALGO_SCORE', 'SCALER'], axis=1) 
explainer_df.insert(0, 'FILM_ID', pred_df['FILM_ID'])
explainer_df = explainer_df.loc[:, (explainer_df != 0).any(axis=0)]
print('SHAP values calculated!')

In [235]:
X_train

array([[ 0.93282623,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204],
       [-0.51551018,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204],
       [ 0.17525549,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204],
       ...,
       [-0.61828158,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204],
       [-0.69483497,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204],
       [-0.55662786,  0.07302188, -0.08215985, ..., -0.09325048,
        -0.10319204, -0.10319204]])