In [None]:
%load_ext autoreload
%autoreload 2
import sys, warnings, time, numpy, yaml, pandas, logging
sys.path.append("../src/") # go to parent dir
from data_access import get_X, get_y, get_train_test
from models.factory import ModelFactory
warnings.filterwarnings('ignore')
# Configure logging from the project's YAML file.
# `import logging` alone does not guarantee that the `logging.config`
# submodule is loaded, so import it explicitly before using dictConfig.
import logging.config
with open('../confs/logs.yaml', 'rt') as f:
    # safe_load accepts an open stream directly; no need to read it first.
    config = yaml.safe_load(f)
logging.config.dictConfig(config)

In [None]:
# Split the data with an 80% training share and a fixed seed.
X_train, y_train, X_test, y_test, target = get_train_test(train_size=0.8, random_state=42)

# Columns kept for every model in this notebook.
features = [
    'TEAM_GAME_WON_season_sum',
    'TEAM_GAME_WON_season_average',
    'TEAM_SHOTS_ON_TARGET_season_average',
    'TEAM_SHOTS_ON_TARGET_season_sum',
    'TEAM_GAME_LOST_season_sum',
    'TEAM_GAME_LOST_season_average',
    'TEAM_ATTACKS_season_average',
    'TEAM_BALL_POSSESSION_season_average',
    'TEAM_SHOTS_TOTAL_season_average',
    'TEAM_SHOTS_INSIDEBOX_season_average',
    'TEAM_DANGEROUS_ATTACKS_season_sum',
    'TEAM_GOALS_season_average',
    'TEAM_YELLOWCARDS_5_last_match_std',
    'TEAM_GOALS_season_sum',
    'TEAM_CORNERS_season_average',
    'TEAM_SHOTS_ON_TARGET_5_last_match_std',
    'TEAM_YELLOWCARDS_season_std',
    'TEAM_ATTACKS_5_last_match_std',
    'TEAM_DANGEROUS_ATTACKS_season_average',
    'TEAM_SHOTS_OFF_TARGET_season_std',
]

# Restrict both splits to the selected columns.
X_train, X_test = X_train[features], X_test[features]

# Extra inputs used further down: `train_scores` is handed to every model
# constructor, `test_data` is what `model.save(...)` receives.
train_scores = get_y()
test_data = get_X('test')[features]

In [None]:
def eval_model(model, save_model=False, X=None):
    """Fit ``model``, score it on ``X`` and report the outcome.

    Args:
        model: Object exposing ``fit()``, ``evaluate(X)`` and a ``name``
            attribute, plus ``save(data)`` when ``save_model`` is true.
        save_model: When true, call ``model.save(test_data)`` after scoring,
            using the notebook-level ``test_data``.
        X: Data passed to ``model.evaluate``. When omitted, the notebook-level
            ``X_test`` is looked up at call time, so re-running the data cell
            is picked up without redefining this function.

    Returns:
        dict with keys ``'name'``, ``'score'`` and ``'time'``; ``'time'`` is
        the elapsed seconds for fit plus evaluate, rounded to 2 decimals.
    """
    if X is None:
        # Late binding: a default of `X=X_test` in the signature would freeze
        # whatever object X_test referred to when this cell was executed.
        X = X_test
    # perf_counter is a monotonic clock intended for measuring durations.
    start = time.perf_counter()
    model.fit()
    score = model.evaluate(X)
    elapsed = numpy.round(time.perf_counter() - start, 2)
    logging.info(f'{model.name}={score} in {elapsed}s')
    if save_model:
        model.save(test_data)
    return {'name': model.name, 'score': score, 'time': elapsed}

In [None]:
%load_ext autoreload
%autoreload 2
def eval_model_for_name(name, X=None, y=None):
    """Build the model registered under ``name`` and evaluate it.

    The model configurations are re-read from ``../confs/models.yaml`` on
    every call.

    Args:
        name: Model name passed to ``ModelFactory.get_model``.
        X: Training features given to the factory. When omitted, the
            notebook-level ``X_train`` is looked up at call time.
        y: Training target given to the factory. When omitted, the
            notebook-level ``y_train`` is looked up at call time.

    Returns:
        The result dict produced by ``eval_model`` for the built model.
    """
    # Resolve defaults at call time; `X=X_train` in the signature would keep
    # pointing at the frames that existed when this cell was executed.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    with open('../confs/models.yaml', 'r') as file:
        configurations = yaml.safe_load(file)
    factory = ModelFactory(configurations, X, y, train_scores)
    model = factory.get_model(name)
    # NOTE: evaluation uses eval_model's default data, independent of `X`.
    return eval_model(model)

In [None]:
def get_features(name = 'random_forest'):
    """Return the training column names ordered by descending importance.

    Builds the model registered under ``name`` from ``../confs/models.yaml``,
    runs it through ``eval_model`` and reads ``feature_importances_`` from the
    wrapped estimator (``model.model``). The smallest and largest importance
    values are printed as a quick sanity check.

    Args:
        name: Model name passed to ``ModelFactory.get_model``.

    Returns:
        list of the columns of ``X_train``, most important first.
    """
    with open('../confs/models.yaml', 'r') as file:
        model_configs = yaml.safe_load(file)
    fitted = ModelFactory(model_configs, X_train, y_train, train_scores).get_model(name)
    # eval_model fits the model, which is what makes the importances available.
    eval_model(fitted)

    importances = fitted.model.feature_importances_
    print(min(importances))
    print(max(importances))

    ranking = pandas.DataFrame({
        'feature': list(X_train.columns),
        'importance': importances,
    })
    ranking = ranking.sort_values(by=['importance'], ascending=False)
    return list(ranking['feature'])

In [22]:
# Candidate model sets. Each entry is a model name that eval_model_for_name
# forwards to ModelFactory.get_model.
BASELINE_MODELS = ['dummy', 'random_forest']
CLASSIC_MODELS = ['xgb_gblinear', 'catboost', 'random_forest', 'gradient_boosting', 'ada_boost', 'extra_trees', 'hist_gradient_boosting']
NEURAL_MODELS = ['keras_relu']

# Pick exactly one set to run; switching sets is a one-line change here
# instead of a chain of assignments where only the last one takes effect.
names = list(NEURAL_MODELS)
for name in names:
    eval_model_for_name(name)

In [None]:
# Feature ablation: rank the features by importance, then re-train and score
# the model on progressively shorter prefixes of that ranking.
name = 'random_forest'
with open('../confs/models.yaml', 'r') as file:
    configurations = yaml.safe_load(file)
# Rank with the same model that is evaluated below, so changing `name` keeps
# the ranking and the ablation consistent.
# NOTE: this rebinds the notebook-level `features` to the ranked order.
features = get_features(name)
lines = []
# Iterates from all features down to the top 2; the single-feature subset is
# not evaluated (lower bound kept as in the original -- TODO confirm intent).
for i in range(len(features), 1, -1):
    sub_features = features[:i]
    factory = ModelFactory(configurations, X_train[sub_features], y_train, train_scores)
    model = factory.get_model(name)
    # Encode the subset size in the name so each result row is identifiable.
    model.name = f'{name}_{i}'
    line = eval_model(model, X=X_test[sub_features])
    lines.append(line)
    logging.info(sub_features)
# One row per subset size, in evaluation order (largest subset first).
df = pandas.DataFrame(lines)

In [None]:
# Plot the score of each row of `df` against its row index, with a title and
# axis labels so the figure can be read on its own.
ax = df[['score']].plot.line()
ax.set(title='Score per evaluation run', xlabel='row index of df (evaluation order)', ylabel='score');

In [None]:
# Display the results best score first; `df` itself is left unchanged.
df.sort_values('score', ascending=False)

In [None]:
%load_ext autoreload
%autoreload 2
from models.stacking_model import StackingModel
# Base estimators combined by the stacking model.
names = ['catboost', 'random_forest', 'gradient_boosting', 'ada_boost', 'extra_trees', 'hist_gradient_boosting']
with open('../confs/models.yaml', 'r') as file:
    configurations = yaml.safe_load(file)
factory = ModelFactory(configurations, X_train, y_train, train_scores)
# Map each base-model name to the model instance built by the factory.
params = {'estimators': {name: factory.get_model(name) for name in names}}

model = StackingModel(X_train, y_train, train_scores, params)
# Set the name before evaluating so the log line and the returned result row
# are labelled 'stacking_classifier', the same name the model has when saved.
model.name = 'stacking_classifier'
line = eval_model(model)
model.save(test_data)

In [None]:
# Score the current `model` on the held-out split and print "<name>=<score>".
score = model.evaluate(X_test)
score_report = f'{model.name}={score}'
print(score_report)

In [None]:
%load_ext autoreload
%autoreload 2
# Evaluate every model the factory provides and collect one result row each.
save_model = True
with open('../confs/models.yaml', 'r') as file:
    configurations = yaml.safe_load(file)
factory = ModelFactory(configurations, X_train, y_train, train_scores)
lines = []
for model in factory.get_models():
    # eval_model already fits and scores the model; doing that here as well
    # would train and evaluate every model twice.
    lines.append(eval_model(model, save_model))
# Result table, best score first.
df = pandas.DataFrame(lines)
df = df.sort_values(by=['score'], ascending=False)

In [None]:
# Sort once (best score first), then show and persist that same frame, so the
# printed table and the CSV are guaranteed to match.
df = df.sort_values(by=['score'], ascending=False)
print(df)
df.to_csv('../data/result.csv')