# Path setup & import packages

In [5]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import os
# Location of the project root, relative to this notebook.
root_path = '../../../'

# Make the project's code packages importable. The three regular packages are
# appended in order; the ptranking checkout is placed first on sys.path so it
# takes precedence over anything else with the same module names.
sys.path.extend([
    root_path + '/code',
    root_path + '/code/core',
    root_path + '/code/datasets/',
])
sys.path.insert(0, root_path + '/code/ptranking')

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import * 
from core.ws_regression import *
from datasets.imdb_tmdb_dataset import * 
from datasets.basic_clmn_dataset import * 
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import datasets_factory 
import numpy as np 
import yaml
import matplotlib.pyplot as plt
import pickle
import copy
import scipy.stats as ss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
# Random seed used for the train/test split and the gradient-boosting model.
seed = 42
# Columns removed from the feature matrix: the label itself (so the target is
# never used as an input) plus what are presumably identifier columns
# (imdbId / movieId / tmdbId) — confirm against the dataset schema.
feature_drop_cols = ['vote_average', 'imdbId', 'movieId','tmdbId']
# Name of the regression target column.
label_feature = 'vote_average'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Fully supervised

In [6]:
# Fully supervised baseline: fit a gradient-boosting regressor on 75% of the
# labelled rows and report the mean squared error on the remaining 25%.

# read data (missing values are replaced with 0)
df = pd.read_csv(os.path.join(root_path, 'data',
                              'imdb-tmdb', 'merged_imdb_tmdb_with_additional_features.csv')).fillna(0)

# Features are every column not listed in feature_drop_cols (which includes
# the label column); the target is the label column.
X, Y = df.drop(feature_drop_cols, axis=1), df[label_feature]

# split data
train_fraction = 0.75
indices_train, indices_test = train_test_split(list(range(len(Y))), train_size=train_fraction, random_state=seed)
# The split returns row *positions*, so both X and Y are indexed positionally
# with .iloc. (Plain Y[indices] is a label lookup and only matches while the
# frame keeps its default RangeIndex.)
X_train = X.iloc[indices_train]
Y_train = Y.iloc[indices_train]
X_test = X.iloc[indices_test]
Y_test = Y.iloc[indices_test]

# model declaration and fit
model = Pipeline([
    ("std", StandardScaler()),
    ("model", GradientBoostingRegressor(n_estimators=250, random_state=seed))
])
model.fit(X_train, Y_train)
Y_test_pred = model.predict(X_test)
true_mse = mean_squared_error(Y_test, Y_test_pred)
print('MSE', true_mse)

MSE 0.37324481674300336


# Weak Supervision

In [7]:
# NOTE: this entire cell is commented out, so nothing here runs.
# It sketches the weak-supervision experiment: for each number of labelling
# functions m in range(3, 20), generate LFs from the centered labels, fit a
# LabelModel, then train the same StandardScaler + GradientBoostingRegressor
# pipeline on the inferred labels Y_hat and score MSE against the true test
# labels.
# generate_lfs / LabelModel are presumably supplied by the core.* star
# imports — TODO confirm before re-enabling.

# Y_mean = Y.mean() # centering for LF generation
# Y_var = Y.var()

# param_err, mu_err, score, mse = {}, {}, {}, {}

# for m in range(3, 20):
#     n = len(Y)
#     L, true_Sigma = generate_lfs(m, n, Y - Y_mean, Y_var)

#     lm = LabelModel()
#     lm.fit(L, Y_var)
#     lm.inference(L)
#     Y_hat = lm.Y_hat + Y_mean

    
#     Sigma_hat = lm.Sigma_hat
#     param_err[m] = np.linalg.norm(Sigma_hat - true_Sigma) / (m+1)**2
#     mu_err[m]    = np.linalg.norm(Sigma_hat[:m,m] - true_Sigma[:m,m]) / m
#     score[m] = lm.score(Y - Y_mean)
    
    
#     # # split data
#     indices_train, indices_test = train_test_split(list(range(len(Y))), train_size=train_fraction, random_state=seed)
#     X_train = X.iloc[indices_train]
#     Y_train = Y_hat[indices_train]
#     X_test = X.iloc[indices_test]
#     Y_test = Y[indices_test]

#     # model declaration and fit
#     model = Pipeline([
#         ("std", StandardScaler()),
#         ("model", GradientBoostingRegressor(n_estimators=250, random_state=seed))
#     ])
#     model.fit(X_train, Y_train)
#     Y_test_pred = model.predict(X_test)
#     mse[m] = mean_squared_error(Y_test, Y_test_pred)
#     print('numLFs', m, 'MSE', mse[m])

# Full supervision with a small number of true labels

In [11]:
# Supervised baseline with only a small share of the true labels.
# For each of 10 seeds and each sampling percentage, train the pipeline on a
# random subsample of the training split and record the test MSE.
#
# Results are collected as a list of records and turned into a DataFrame once
# at the end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and growing a frame row by row is quadratic. As a side effect, 'seed' and
# 'n_train' keep their integer dtype.
records = []

# The loop variable is deliberately NOT called `seed`, so that the notebook-wide
# `seed` defined in the setup cell is not overwritten by this cell.
for run_seed in range(10):
    # The train/test split depends only on the seed, so it is computed once
    # per seed instead of once per sampling percentage.
    X_train_full, X_test, Y_train_full, Y_test = train_test_split(
        X, Y, train_size=train_fraction, random_state=run_seed)

    for sample_percent in [0.1, 1, 5, 10, 25, 50]:
        sample_size = int(len(X_train_full) * sample_percent / 100)

        # A fresh local generator per configuration yields the same draws as
        # re-seeding the global NumPy RNG, without touching global RNG state.
        rng = np.random.RandomState(run_seed)
        sample_indices = rng.choice(len(X_train_full), size=sample_size, replace=False)
        X_train = X_train_full.iloc[sample_indices]
        Y_train = Y_train_full.iloc[sample_indices]

        # model declaration and fit
        model = Pipeline([
            ("std", StandardScaler()),
            ("model", GradientBoostingRegressor(n_estimators=250, random_state=run_seed))
        ])
        model.fit(X_train, Y_train)

        Y_test_pred = model.predict(X_test)
        records.append({
            'seed': run_seed,
            'sample_percent': sample_percent,
            'n_train': sample_size,
            'mse': mean_squared_error(Y_test, Y_test_pred)
        })

df_result = pd.DataFrame(records)

In [13]:
# Summarise the runs: one row per (sample_percent, n_train) setting with the
# mean and standard deviation of the test MSE, written to a CSV file.
grouped = df_result.groupby(['sample_percent', 'n_train'])
mse_summary = grouped.agg(['mean', 'std'])['mse']
mse_summary.to_csv('smalltrue_baselines.csv')