In [1]:
import pandas as pd
import numpy as np
import os
import string
import pickle

from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, SCORERS, classification_report
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Disabled alternative to refitting: load the pickled model stored at
# ../results/final_rf_model.pickle into `best_model`.
# NOTE(review): if this is re-enabled, open the file inside a `with` block so the
# handle is closed, and only unpickle files that come from a trusted source.
# best_model = pickle.load(open("../results/final_rf_model.pickle", 'rb'))

In [3]:
# Read the example rows stored under ../results and display their column labels.
X_test_example = pd.read_csv(filepath_or_buffer="../results/example.csv")
X_test_example.columns

Index(['fiscal_year', 'organization_name', 'project_name', 'city', 'province',
       'region', 'community_type', 'grant_or_contribution', 'presenter_type',
       'project_sub_type', 'project_type', 'Cultural Diversity',
       'Young Audience', 'Indigenous Communities',
       'Official Language Minority', 'Rural or Remote Regions', 'Cinema',
       'Circus', 'Comedy', 'Dance', 'Heritage Centre/Site', 'Literature',
       'Media Art', 'Music', 'Other', 'Performance Art', 'Storytelling',
       'Theatre', 'Variety', 'Visual Arts'],
      dtype='object')

In [4]:
# Disabled prediction on the example rows.
# NOTE(review): the example file's column labels (e.g. 'Cultural Diversity', 'Music')
# differ from the training columns (e.g. 'audiences_cultural_diversity',
# 'disciplines_music'), so presumably this call fails until the example columns are
# renamed to match — confirm before re-enabling.
# best_model.predict(X_test_example)

In [5]:
# Training split: `amount_category` is the label; it is removed from the feature
# frame together with `amount_approved` and `audiences_none`.
train_data = pd.read_csv(filepath_or_buffer="../data/processed/heritage_train.csv")
y_train = train_data["amount_category"]
X_train = train_data.drop(
    ["amount_category", "amount_approved", "audiences_none"],
    axis="columns",
)

In [6]:
# Display the feature columns left in X_train after the label-related columns were dropped.
X_train.columns

Index(['fiscal_year', 'organization_name', 'project_name', 'city', 'province',
       'region', 'community_type', 'grant_or_contribution', 'presenter_type',
       'project_sub_type', 'project_type', 'disciplines_music',
       'disciplines_dance', 'disciplines_media_art', 'disciplines_theatre',
       'disciplines_literature', 'disciplines_visual_arts',
       'disciplines_circus', 'disciplines_storytelling', 'disciplines_comedy',
       'disciplines_other', 'disciplines_performance_art',
       'disciplines_variety', 'disciplines_cinema',
       'disciplines_heritage_centre/site', 'audiences_young_audience',
       'audiences_official_language_minority', 'audiences_cultural_diversity',
       'audiences_indigenous_communities',
       'audiences_rural_or_remote_regions'],
      dtype='object')

In [7]:
# Test split: same layout as the training split — `amount_category` is the label and
# is removed from the feature frame together with `amount_approved` and `audiences_none`.
test_data = pd.read_csv(filepath_or_buffer="../data/processed/heritage_test.csv")
y_test = test_data["amount_category"]
X_test = test_data.drop(
    ["amount_category", "amount_approved", "audiences_none"],
    axis="columns",
)

In [8]:
# Feature groups
# --------------
# Every name below is a column of the feature frames; each group is routed to its
# own encoder by the column transformer.

# Columns left out of the model. `region` is dropped because `province` already
# indicates the region.
drop_feature = ["fiscal_year", "region", "organization_name"]

# Free-text column, kept as a bare string (not a list).
text_countvec = "project_name"

# Columns that are one-hot encoded.
categorical_ohe = ["city", "province", "project_type"]

# Columns one-hot encoded with drop="if_binary". The order of this list is kept
# exactly as originally written because it determines the encoded column order.
binary = (
    # project-level attributes
    ["grant_or_contribution", "presenter_type", "project_sub_type"]
    # audience indicator columns
    + [
        "audiences_cultural_diversity",
        "audiences_young_audience",
        "audiences_indigenous_communities",
        "audiences_official_language_minority",
        "audiences_rural_or_remote_regions",
    ]
    # discipline indicator columns
    + [
        "disciplines_cinema",
        "disciplines_circus",
        "disciplines_comedy",
        "disciplines_dance",
        "disciplines_heritage_centre/site",
        "disciplines_literature",
        "disciplines_visual_arts",
        "disciplines_media_art",
        "disciplines_music",
        "disciplines_other",
        "disciplines_performance_art",
        "disciplines_storytelling",
        "disciplines_theatre",
        "disciplines_variety",
    ]
)

# Ordered categorical column and the category order handed to the ordinal encoder
# (one inner list per ordinal column).
ordinal = ["community_type"]
Community_order = [["Remote", "Rural", "Small Urban", "Medium Urban", "Large Urban"]]

In [9]:
# Column transformer: one encoder per feature group.

# Bag-of-words over the project name: English stop words removed, vocabulary capped at 800.
name_vectorizer = CountVectorizer(stop_words="english", max_features=800)
# One-hot encoding; categories unseen during fit are ignored at transform time.
nominal_encoder = OneHotEncoder(handle_unknown="ignore")
# Same, but a column with exactly two categories collapses to a single indicator column.
flag_encoder = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
# Integer codes following the explicit order in `Community_order`.
community_encoder = OrdinalEncoder(categories=Community_order)

preprocessor = make_column_transformer(
    (name_vectorizer, text_countvec),
    (nominal_encoder, categorical_ohe),
    (flag_encoder, binary),
    (community_encoder, ordinal),
    ("drop", drop_feature),
)

In [10]:
# Final model: the preprocessing step followed by a class-balanced random forest.
# - max_features="sqrt" replaces the "auto" alias. For classifiers "auto" meant
#   sqrt(n_features), but it was deprecated in scikit-learn 1.1 and removed in 1.3,
#   so the original spelling fails on current releases.
# - random_state fixes the forest's bootstrap and feature sampling so the fitted
#   model and the metrics reported below are reproducible on Restart & Run All.
best_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        max_features="sqrt",
        max_depth=30,
        class_weight="balanced",
        random_state=123,
    ),
)

In [11]:
# Fit the full pipeline (preprocessing + random forest) on the training split.
# Left as the last expression so the fitted pipeline's repr is displayed.
best_model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('countvectorizer',
                                                  CountVectorizer(max_features=800,
                                                                  stop_words='english'),
                                                  'project_name'),
                                                 ('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['city', 'province',
                                                   'project_type']),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(drop='if_binary',
                                                                handle_unknown='ignore'),
                                                  ['grant_or_contributio...
                                     

In [12]:
# Predict on the test split, then turn the classification report into a table with
# one row per class / average and one column per metric.
y_predict = best_model.predict(X_test)
model_quality = pd.DataFrame(
    classification_report(y_true=y_test, y_pred=y_predict, output_dict=True)
).transpose()

In [13]:
# Display the test-set precision / recall / F1 / support table.
model_quality

Unnamed: 0,precision,recall,f1-score,support
$12.0K,0.666667,0.6,0.631579,70.0
$23.0K,0.621212,0.650794,0.635659,63.0
$50.0K,0.868852,0.815385,0.84127,65.0
$8.0K,0.647059,0.804878,0.717391,41.0
less than $8.0K,0.806452,0.757576,0.78125,33.0
accuracy,0.713235,0.713235,0.713235,0.713235
macro avg,0.722048,0.725726,0.72143,272.0
weighted avg,0.718459,0.713235,0.713727,272.0
