In [1]:
import pandas as pd

In [2]:
import os
# Directory holding the CSV inputs; a relative path, so it is resolved against
# the working directory the notebook kernel was started in.
data_folder = './DATA'

In [3]:
# Base names of the nomination files to process. Each entry is expanded into a
# data CSV name and a predict CSV name by get_data_df / get_predict_df.
nomination_file_names = [
    'best_picture_academy_awards'
]

In [4]:
import sys
# Make the directory three levels up importable so the shared constants module
# can be found. The path is relative to the kernel's working directory.
sys.path.append('../../..')
# Shared constants (file-name parts such as NOMINATION_FILE_DATA), aliased as `c`.
import academy_award_predictor_constants as c

In [5]:
def get_data_df(nomination_file_name):
    """Load the data CSV belonging to one nomination file.

    The file name is assembled as
    ``<nomination_file_name>_<c.NOMINATION_FILE_DATA><c.NOMINATION_FILE_PREFIX>``
    and read from ``data_folder`` with the first CSV column used as the index.

    NOTE(review): ``NOMINATION_FILE_PREFIX`` is appended at the end of the
    name, so it presumably holds the file extension -- confirm in the
    constants module.

    Returns:
        pandas.DataFrame: the parsed CSV.
    """
    csv_name = "{name}_{kind}{suffix}".format(
        name=nomination_file_name,
        kind=c.NOMINATION_FILE_DATA,
        suffix=c.NOMINATION_FILE_PREFIX,
    )
    csv_path = os.path.join(data_folder, csv_name)
    return pd.read_csv(csv_path, index_col=0)

In [6]:
def get_predict_df(nomination_file_name):
    """Load the prediction CSV belonging to one nomination file.

    The file name is assembled as
    ``<nomination_file_name>_<c.NOMINATION_FILE_PREDICT><c.NOMINATION_FILE_PREFIX>``
    and read from ``data_folder`` with the first CSV column used as the index.

    NOTE(review): ``NOMINATION_FILE_PREFIX`` is appended at the end of the
    name, so it presumably holds the file extension -- confirm in the
    constants module.

    Returns:
        pandas.DataFrame: the parsed CSV.
    """
    csv_name = "{name}_{kind}{suffix}".format(
        name=nomination_file_name,
        kind=c.NOMINATION_FILE_PREDICT,
        suffix=c.NOMINATION_FILE_PREFIX,
    )
    csv_path = os.path.join(data_folder, csv_name)
    return pd.read_csv(csv_path, index_col=0)

In [7]:
def get_x_values(df):
    """Return the feature matrix of ``df`` as an array.

    The ``Nomination_Film_Title`` column is always dropped, and the
    ``Nomination_Is_Winner`` column is dropped as well when present; every
    remaining column is treated as a feature.
    """
    has_label = 'Nomination_Is_Winner' in df.columns
    if has_label:
        non_feature_cols = ['Nomination_Film_Title', 'Nomination_Is_Winner']
    else:
        non_feature_cols = ['Nomination_Film_Title']
    features = df.drop(columns=non_feature_cols)
    return features.values

def get_y_values(df):
    """Return the ``Nomination_Is_Winner`` column of ``df`` as an array."""
    winner_flags = df['Nomination_Is_Winner']
    return winner_flags.values

def get_titles(df):
    """Return the ``Nomination_Film_Title`` column of ``df`` as an array."""
    film_titles = df['Nomination_Film_Title']
    return film_titles.values

In [8]:
from collections import Counter

def get_y_distribution(y):
    """Print how many labels in ``y`` are wins (1) and losses (0).

    The output line contains the win count, the loss count and the share of
    wins among the 0/1 labels as a percentage. Values other than 0 and 1 are
    not counted.

    Args:
        y: iterable of hashable labels.

    When ``y`` contains no 0/1 labels (for example an empty input) the
    percentage is reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    counter = Counter(y)
    wins, losses = counter[1], counter[0]
    total = wins + losses
    # Guard the division: an empty or label-free input has total == 0.
    win_pct = wins / total * 100 if total else 0.0
    print("win: {0}  \tloose: {1} \t{2:.1f}%".format(wins, losses, win_pct))

In [9]:
def get_data_values(nomination_file_name):
    """Load the data CSV for ``nomination_file_name`` and split it into X and Y.

    Prints the win/loss distribution of the full label array as a side effect.

    Returns:
        tuple: ``(X, Y)`` -- the feature array from ``get_x_values`` and the
        label array from ``get_y_values``.
    """
    frame = get_data_df(nomination_file_name)
    features, labels = get_x_values(frame), get_y_values(frame)

    print("Distribution of full data:")
    get_y_distribution(labels)
    return features, labels

def get_predict_values(nomination_file_name):
    """Load the prediction CSV for ``nomination_file_name``.

    Returns:
        tuple: ``(X_predict, titles)`` -- the feature array from
        ``get_x_values`` and the film titles from ``get_titles``, both taken
        from the same DataFrame.
    """
    frame = get_predict_df(nomination_file_name)
    return get_x_values(frame), get_titles(frame)

In [10]:
from sklearn.model_selection import train_test_split

def get_data_split(nomination_file_name):
    """Load the data for ``nomination_file_name`` and split it 80/20.

    The split uses ``train_test_split`` with ``test_size=0.2`` and a fixed
    ``random_state=42``. The label distribution of both parts is printed.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``.
    """
    features, labels = get_data_values(nomination_file_name)
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    reports = (
        ("Distribution of training data", Y_train),
        ("Distribution of test data", Y_test),
    )
    for heading, subset in reports:
        print(heading)
        get_y_distribution(subset)
    return X_train, X_test, Y_train, Y_test

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import timeit

def predict_for_clf(clf_best, x_row):
    """Return the second-column probability ``clf_best`` predicts for one sample.

    Args:
        clf_best: object exposing ``predict_proba``.
        x_row: feature array of a single sample; it is reshaped to one row
            before being passed to ``predict_proba``.

    Returns:
        The entry at row 0, column 1 of the ``predict_proba`` result.
    """
    single_sample = x_row.reshape(1, -1)
    probabilities = clf_best.predict_proba(single_sample)
    first_row = probabilities[0]
    return first_row[1]

def gridsearch_for_clf(nomination_file_name, clf, params, name):
    """Tune ``clf`` with a grid search, evaluate it and print nominee predictions.

    Steps:
      1. Load and split the data for ``nomination_file_name``.
      2. Run a 5-fold ``GridSearchCV`` over ``params`` scored by ROC AUC and
         print the elapsed time, the best parameters and the best CV score.
      3. Score the best estimator on the held-out test set via its ``score``
         method (reported as accuracy).
      4. Print the predicted win probability of every row in the prediction
         file, highest first.

    Args:
        nomination_file_name: base name of the nomination CSV files.
        clf: estimator to tune; must support ``predict_proba``.
        params: parameter grid passed to ``GridSearchCV``.
        name: label printed as the heading of this run.

    Returns:
        None. All results are printed.
    """
    print(name)
    X_train, X_test, Y_train, Y_test = get_data_split(nomination_file_name)
    grid_search = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=1, cv=5)

    start_time = timeit.default_timer()

    grid_search.fit(X_train, Y_train)

    print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time))

    print(grid_search.best_params_)
    # best_score_ is the mean cross-validated value of the `scoring` metric,
    # i.e. ROC AUC here -- it must not be labelled as accuracy.
    print('The ROC AUC after grid search is {0:.1f}%'.format(grid_search.best_score_*100))

    clf_best = grid_search.best_estimator_

    # The estimator's own score() is used for the held-out test set.
    accuracy = clf_best.score(X_test, Y_test)
    print('The accuracy on the test set is {0:.1f}%'.format(accuracy*100))
    print("-----------------------------------------")
    X_predict, X_predict_titles = get_predict_values(nomination_file_name)
    print("Predicting the nominees")
    # Rows and titles come from the same DataFrame, so they pair up one-to-one.
    results = [{
        'prediction': predict_for_clf(clf_best, x_row),
        'title': title
    } for x_row, title in zip(X_predict, X_predict_titles)]
    for res in sorted(results, key=lambda res: res['prediction'], reverse=True):
        print('{0:.1f}%\t:  {1}'.format(res['prediction'] * 100, res['title']))
    print("====================================================================")

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Classifier configurations tried by gridsearch_for_file. Each entry holds a
# display name, an estimator instance and the parameter grid to search.
classifiers = [{
    'name': 'Multinomial Naive Bayes',
    'clf': MultinomialNB(),
    'params': {
        'fit_prior': (True, False),
        'alpha': (0.5, 1.0, 2.0, 4.0),
    }
},{
    'name': 'Random Forest',
    # random_state is fixed so that repeated runs of the notebook give the
    # same forest (the train/test split is already seeded with 42).
    'clf': RandomForestClassifier(criterion='gini', n_jobs=-1, random_state=42),
    'params': {
        'n_estimators': (100, 200, 300),
        'min_samples_split': (20, 30, 40),
        'max_features': ('sqrt', 'log2', None)
    }
}]

def gridsearch_for_file(nomination_file_name):
    """Run ``gridsearch_for_clf`` for every entry of ``classifiers``.

    Args:
        nomination_file_name: base name of the nomination CSV files.
    """
    for spec in classifiers:
        estimator, grid, label = spec['clf'], spec['params'], spec['name']
        gridsearch_for_clf(nomination_file_name, estimator, grid, label)

In [13]:
# Run the full grid search / prediction report for every configured file.
for nomination_file_name in nomination_file_names:
    gridsearch_for_file(nomination_file_name)

Multinomial Naive Bayes


FileNotFoundError: [Errno 2] File b'./DATA\\nomination_info_other_events_from_pga_data.csv' does not exist: b'./DATA\\nomination_info_other_events_from_pga_data.csv'