# Path setup & import packages

In [1]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import os
# Relative location of the project root; every project source directory below
# is resolved against it.
root_path = '../../../' # path to project root

# Project source directories are added at the end of the module search path.
for path_template in ('{}/code', '{}/code/core', '{}/code/datasets/'):
    sys.path.append(path_template.format(root_path))

# The ptranking directory goes to the front so it is searched first.
sys.path.insert(0, '{}/code/ptranking'.format(root_path))

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import * 
from core.ws_regression import *
from datasets.imdb_tmdb_dataset import * 
from datasets.basic_clmn_dataset import * 
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import datasets_factory 
import numpy as np 
import yaml
import matplotlib.pyplot as plt
import pickle
import copy
import scipy.stats as ss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
# Random state handed to the train/test split.
seed = 42

# Column used as the regression target.
label_feature = 'vote_average'

# Columns removed from the frame to build the feature matrix.
feature_drop_cols = [
    'vote_average',
    'imdb_rating',
    'tomato_user_rating',
    'mv_lens_avg_rating',
    'movieId',
    'tmdbId',
]

In [2]:
# read data
# Missing values are replaced with 0 right after loading.
df = pd.read_csv(os.path.join(root_path, 'data',
                              'imdb-tmdb', 'processed_mvlens_tmdb_imdb_tomato.csv')).fillna(0)

# Features are every column not listed in `feature_drop_cols`; the target is
# the `label_feature` column.
X, Y = df.drop(feature_drop_cols, axis=1), df[label_feature]

# split data
train_fraction = 0.75
# The split is done on row positions (0 .. len(Y)-1) so that X and Y are
# sliced with exactly the same rows.
indices_train, indices_test = train_test_split(list(range(len(Y))), train_size=train_fraction, random_state=seed)
# The indices are positions, not labels, so both X and Y are sliced with
# .iloc. Plain `Y[indices]` is label-based and only matches while the frame
# keeps its default RangeIndex.
X_train = X.iloc[indices_train]
Y_train = Y.iloc[indices_train]
X_test = X.iloc[indices_test]
Y_test = Y.iloc[indices_test]

# model declaration and fit
# Baseline: standardise the features, then ordinary least squares.
model = Pipeline([
    ("std", StandardScaler()),
    ("model", LinearRegression())
])
model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
true_mse = mean_squared_error(Y_test, Y_test_pred)
# NOTE(review): the recorded run printed an MSE on the order of 1e25, which
# looks like a numerically unstable linear fit on the test rows - confirm
# before relying on `true_mse`.
print('MSE', true_mse)

MSE 1.0035305662338386e+25


In [3]:
# Absolute error of the fitted `model` on each training row, indexed like Y_train.
residual = (model.predict(X_train) - Y_train).abs()
residual

1615    0.499997
73      0.033488
3467    0.237081
48      0.115353
932     0.090182
          ...   
1130    0.083996
1294    1.498148
860     0.927764
3507    0.311747
3174    1.583382
Name: vote_average, Length: 2793, dtype: float64

In [4]:
# Experiment: train a gradient-boosting regressor on only the worst-fit
# training rows (the top-p fraction by absolute residual) and record its MSE
# on the full test set, repeated for several model seeds.
sample_fractions = [0.001, 0.005, 0.01, 0.05, 0.1, 0.25]
n_repeats = 10

# Rows are collected in a plain list and turned into a frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
result_rows = []

for p in sample_fractions:
    sample_percent = p * 100
    q = 1 - p
    # The subset depends only on p, so it is selected once per fraction
    # instead of once per (seed, fraction) pair.
    bad_indices = residual >= residual.quantile(q)
    X_train_bad = X_train[bad_indices]
    Y_train_bad = Y_train[bad_indices]

    # `run_seed` and `subset_model` are deliberately distinct from the
    # notebook-level `seed` and `model`: reusing those names would change the
    # train/test split and the residuals if earlier cells were re-run.
    for run_seed in range(n_repeats):
        # model declaration and fit
        subset_model = Pipeline([
            ("std", StandardScaler()),
            ("model", GradientBoostingRegressor(n_estimators=250, random_state=run_seed)),
        ])
        subset_model.fit(X_train_bad, Y_train_bad)

        # evaluate on the untouched test split
        mse = mean_squared_error(Y_test, subset_model.predict(X_test))
        result_rows.append({
            'seed': run_seed,
            'sample_percent': sample_percent,
            'mse': mse
        })

# One row per (seed, sample_percent) pair.
df_result = pd.DataFrame(result_rows, columns=['seed', 'sample_percent', 'mse'])

In [5]:
# Mean and standard deviation of the test MSE for each sample_percent.
df_result.groupby('sample_percent')['mse'].agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
sample_percent,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,11.10683,0.270479
0.5,8.628396,0.151197
1.0,3.435194,0.045105
5.0,1.077586,0.009287
10.0,0.611619,0.002301
25.0,0.340487,0.001266


In [6]:
# Write the per-sample_percent mean/std of the test MSE to disk.
mse_summary = df_result.groupby('sample_percent')['mse'].agg(['mean', 'std'])
mse_summary.to_csv('badsubset_extended.csv')