In [1]:
import pandas as pd
import numpy as np
import os
import string
import pickle

from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, SCORERS
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
heritage = pd.read_csv("../data/raw/capf_en.csv", encoding="ISO-8859-1")

In [3]:
# data cleaning
heritage.columns = heritage.columns.str.lower().str.replace(" ", "_")
heritage = heritage.rename(columns={"grant_or_contributionution": "grant_or_contribution"})
#heritage["audiences"] = heritage["audiences"].str.split(", ", expand=False)
#heritage["disciplines"] = heritage["disciplines"].str.split(", ", expand=False)

In [4]:
heritage_train, heritage_test = train_test_split(heritage, test_size=0.20, random_state=1233)

In [5]:
q10 = heritage_train["amount_approved"].quantile(0.1)
q25 = heritage_train["amount_approved"].quantile(0.25)
q50 = heritage_train["amount_approved"].quantile(0.5)
q75 = heritage_train["amount_approved"].quantile(0.75)

labeling = [f"${q75/1000}K",
            f"${q50/1000}K",
            f"${q25/1000}K",
            f"${q10/1000}K",
            f"less than ${q10/1000}K"]

amount = heritage_train["amount_approved"]
heritage_train["greater_than"] = np.where(amount > q75, labeling[0],
                                          np.where(amount > q50, labeling[1],
                                                   np.where(amount > q25, labeling[2],
                                                            np.where(amount > q10, labeling[3], labeling[4]))))

heritage_train["amount_approved"] = heritage_train["greater_than"]
heritage_train.drop("greater_than", axis=1, inplace=True)

In [6]:
#heritage_train = pd.read_csv("..............")

In [7]:
# Separate X and y
X_train, y_train = heritage_train.drop("amount_approved", axis=1), heritage_train["amount_approved"]

In [8]:
X_train["Cultural Diversity"] = X_train["audiences"].apply(lambda x: "Cultural Diversity" in x)
X_train["Young Audience"] = X_train["audiences"].apply(lambda x: "Young Audience" in x)
X_train["Indigenous Communities"] = X_train["audiences"].apply(lambda x: "Indigenous Communities" in x)
X_train["Official Language Minority"] = X_train["audiences"].apply(lambda x: "Official Language Minority" in x)
X_train["Rural or Remote Regions"] = X_train["audiences"].apply(lambda x: "Rural or Remote Regions" in x)
X_train.drop("audiences", axis=1, inplace=True)

X_train["Cinema"] = X_train["disciplines"].apply(lambda x: "Cinema" in x)
X_train["Circus"] = X_train["disciplines"].apply(lambda x: "Circus" in x)
X_train["Comedy"] = X_train["disciplines"].apply(lambda x: "Comedy" in x)
X_train["Dance"] = X_train["disciplines"].apply(lambda x: "Dance" in x)
X_train["Heritage Centre/Site"] = X_train["disciplines"].apply(lambda x: "Heritage Centre/Site" in x)
X_train["Literature"] = X_train["disciplines"].apply(lambda x: "Literature" in x)
X_train["Media Art"] = X_train["disciplines"].apply(lambda x: "Media Art" in x)
X_train["Music"] = X_train["disciplines"].apply(lambda x: "Music" in x)
X_train["Other"] = X_train["disciplines"].apply(lambda x: "Other" in x)
X_train["Performance Art"] = X_train["disciplines"].apply(lambda x: "Performance Art" in x)
X_train["Storytelling"] = X_train["disciplines"].apply(lambda x: "Storytelling" in x)
X_train["Theatre"] = X_train["disciplines"].apply(lambda x: "Theatre" in x)
X_train["Variety"] = X_train["disciplines"].apply(lambda x: "Variety" in x)
X_train["Visual Arts"] = X_train["disciplines"].apply(lambda x: "Visual Arts" in x)
X_train.drop("disciplines", axis=1, inplace=True)

X_train.head()

Unnamed: 0,fiscal_year,organization_name,project_name,city,province,region,community_type,grant_or_contribution,presenter_type,project_sub_type,...,Heritage Centre/Site,Literature,Media Art,Music,Other,Performance Art,Storytelling,Theatre,Variety,Visual Arts
798,2017-2018,Chester Theatre Council Society,Chester Playhouse Presentation Series - Multi ...,Chester,Nova Scotia,Atlantic,Rural,Grant,Multidisciplinary,Programming,...,False,False,True,True,False,False,False,True,False,False
1170,2017-2018,Queer City Cinema Inc.,Performatorium 2017,Regina,Saskatchewan,Prairies,Medium Urban,Contribution,Multidisciplinary,Programming,...,False,False,True,False,False,True,False,False,False,False
907,2017-2018,Fall for Dance North Festival Inc.,Fall for Dance North Festival 2017 - Multi Yea...,Toronto,Ontario,Ontario,Large Urban,Grant,Specialized,Programming,...,False,False,False,False,False,False,False,False,False,False
944,2017-2018,Firehall Theatre Society (The),The Firehall Art Centre's Performing and Visua...,Vancouver,British Columbia,Western,Large Urban,Grant,Multidisciplinary,Programming,...,False,False,False,False,False,False,False,True,False,False
65,2016-2017,Burlington Theatre Board Inc. (The),The Burlington Performing Arts Centre Presents,Burlington,Ontario,Ontario,Medium Urban,Contribution,Multidisciplinary,Programming,...,False,False,False,True,False,False,False,True,False,False


In [9]:






# Selecting Feature Categories

drop_feature = [
    "fiscal_year",
    "region",
    "organization_name"
]  # droping region as provice is already an indicator of region
text_countvec = "project_name"
categorical_ohe = ["city", "province", "project_type"]
binary = [
    "grant_or_contribution",
    "presenter_type",
    "project_sub_type",
    "Cultural Diversity",
    "Young Audience",
    "Indigenous Communities",
    "Official Language Minority",
    "Rural or Remote Regions",
    "Cinema",
    "Circus",
    "Comedy",
    "Dance",
    "Heritage Centre/Site",
    "Literature",
    "Media Art",
    "Music",
    "Other",
    "Performance Art",
    "Storytelling",
    "Theatre",
    "Variety",
    "Visual Arts",
]

ordinal = ["community_type"]
Community_order = [["Remote", "Rural", "Small Urban", "Medium Urban", "Large Urban"]]

In [10]:
# Setting Column Transformers

preprocessor = make_column_transformer(
    (CountVectorizer(max_features=400, stop_words="english"), text_countvec),
    (
        OneHotEncoder(
            handle_unknown="ignore",
        ),
        categorical_ohe,
    ),
    (OneHotEncoder(drop="if_binary", handle_unknown="ignore"), binary),
    (
        OrdinalEncoder(
            categories=Community_order,
        ),
        ordinal,
    ),
    ("drop", drop_feature),
)


In [11]:
# Scoring methods to be used for multilabel classification

scoring = ['f1_weighted', 'recall_weighted', 'precision_weighted']


# Main function

models = {
    "Dummy Classifier": DummyClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Multinomial Naive Bayes": MultinomialNB(),
    "SVC": SVC(class_weight="balanced"),
    "Random Forest": RandomForestClassifier(class_weight="balanced")
}

results = pd.DataFrame()
for name, classifier in models.items():
    pipe = make_pipeline(preprocessor, classifier)
    result = pd.DataFrame(cross_validate(pipe, X_train, y_train, cv=5, scoring=scoring)).mean()
    results = pd.concat([results, pd.DataFrame(result, columns=[name])], axis=1)
    

file_path = "../results/model_comparison.csv"    
try:
    results.to_csv(file_path, index=False, encoding='utf-8')
except:
    os.makedirs(os.path.dirname(file_path))
    open(file_path, "wb").write(results.content)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
param_distributions = {
    "columntransformer__countvectorizer__max_features": [400, 500, 600, 700, 800],
    "randomforestclassifier__max_depth": [None, 10, 20, 30, 40, 50, 60],
    "randomforestclassifier__max_features": ["auto", "log2"],
    "randomforestclassifier__class_weight": ["balanced", None]
}


best_model = RandomizedSearchCV(
    make_pipeline(preprocessor, RandomForestClassifier()),
    param_distributions=param_distributions,
    n_jobs=-1,
    scoring=scoring,
    n_iter=20,
    cv=5,
    refit="f1_weighted"
)

best_model.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('countvectorizer',
                                                                               CountVectorizer(max_features=400,
                                                                                               stop_words='english'),
                                                                               'project_name'),
                                                                              ('onehotencoder-1',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['city',
                                                                                'province',
                                                                                'proje

In [13]:
best_model.best_score_

0.6475991882698122

In [14]:
best_model.best_params_

{'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__max_depth': 30,
 'randomforestclassifier__class_weight': 'balanced',
 'columntransformer__countvectorizer__max_features': 800}

In [15]:
pd.DataFrame(best_model.cv_results_).mean()

  pd.DataFrame(best_model.cv_results_).mean()


mean_fit_time                                               0.588097
std_fit_time                                                0.017678
mean_score_time                                             0.060926
std_score_time                                              0.007990
param_randomforestclassifier__max_depth                    30.000000
param_columntransformer__countvectorizer__max_features    570.000000
split0_test_f1_weighted                                     0.602430
split1_test_f1_weighted                                     0.604031
split2_test_f1_weighted                                     0.614280
split3_test_f1_weighted                                     0.600450
split4_test_f1_weighted                                     0.606253
mean_test_f1_weighted                                       0.605489
std_test_f1_weighted                                        0.019933
rank_test_f1_weighted                                      10.500000
split0_test_recall_weighted       

In [16]:
import pickle
pickle.dump(best_model, open("../results/final_rf_model.pickle", "wb"))

In [21]:
# Exporting one test example for exercise

(pd.DataFrame(X_train.iloc[1]).T).to_csv("../results/example.csv", index=False)