In [1]:
import pandas as pd
import numpy as np
import os
import string
import pickle

from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, SCORERS
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the processed training split
read_path = "../data/processed/heritage_train.csv"
train_data = pd.read_csv(read_path)

# Separate features (X) from the target (y).
# "amount_approved" and "audiences_none" are removed from the features
# together with the target column "amount_category".
X_train = train_data.drop(
    columns=["amount_category", "amount_approved", "audiences_none"]
)
y_train = train_data["amount_category"]

# Feature groups: one group per preprocessing strategy

# Columns excluded from every feature group below
drop_feature = [
    "fiscal_year",
    "region",
    "disciplines_other",
    "organization_name",
]  # dropping region as province is already an indicator of region

text_countvec = "project_name"
categorical_ohe = ["city", "province", "project_type"]
ordinal = ["community_type"]
Community_order = [["Remote", "Rural", "Small Urban", "Medium Urban", "Large Urban"]]

# Every column not assigned to a group above is treated as binary.
# Filtering X_train.columns (instead of converting a set difference to a list)
# keeps the columns in their original order, so the feature order is the same
# on every kernel restart rather than depending on string hashing.
assigned_columns = {*drop_feature, text_countvec, *categorical_ohe, *ordinal}
binary = [column for column in X_train.columns if column not in assigned_columns]

In [3]:
# Column transformer: one encoder per feature group.
# The tuples are passed in a fixed order because make_column_transformer
# derives the step names from the transformer classes and their positions.

project_name_vectorizer = CountVectorizer(max_features=400, stop_words="english")
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
binary_encoder = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
community_encoder = OrdinalEncoder(categories=Community_order)

preprocessor = make_column_transformer(
    (project_name_vectorizer, text_countvec),
    (categorical_encoder, categorical_ohe),
    (binary_encoder, binary),
    (community_encoder, ordinal),
)


In [4]:
# Weighted-average scoring metrics computed for every candidate classifier
scoring = ['f1_weighted', 'recall_weighted', 'precision_weighted']


# Candidate classifiers to compare (each is combined with the shared preprocessor)

models = {
    "Dummy Classifier": DummyClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Multinomial Naive Bayes": MultinomialNB(),
    "SVC": SVC(class_weight="balanced"),
    "Random Forest": RandomForestClassifier(class_weight="balanced")
}

# 5-fold cross-validation per model; keep the mean of every reported quantity
# (fit/score times and the three test metrics). The means are collected in a
# dict and turned into a single DataFrame once, one column per model, instead
# of growing a DataFrame with pd.concat inside the loop.
mean_scores = {}
for name, classifier in models.items():
    pipe = make_pipeline(preprocessor, classifier)
    cv_scores = cross_validate(pipe, X_train, y_train, cv=5, scoring=scoring)
    mean_scores[name] = pd.DataFrame(cv_scores).mean()
results = pd.DataFrame(mean_scores)


# Persist the comparison table. The output directory is created up front
# (no-op if it already exists) so that any genuine write error surfaces
# instead of being swallowed by a bare `except`.
file_path = "../results/model_comparison.csv"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
results.to_csv(file_path, encoding='utf-8')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Hyperparameter search space for the random-forest pipeline.
# Keys follow the "<step>__<param>" naming of the pipeline / column transformer.
param_distributions = {
    "columntransformer__countvectorizer__max_features": [400, 500, 600, 700, 800],
    "randomforestclassifier__max_depth": [None, 10, 20, 30, 40, 50, 60],
    # "sqrt" is what "auto" meant for classifiers; "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3, so the explicit value is used.
    "randomforestclassifier__max_features": ["sqrt", "log2"],
    "randomforestclassifier__class_weight": ["balanced", None]
}


# Randomized search: 20 sampled settings, 5-fold CV each, all three metrics
# recorded; the final model is refit on the setting with the best weighted F1.
# random_state is fixed on both the forest and the search so that the sampled
# candidates and the fitted trees are the same on every run.
best_model = RandomizedSearchCV(
    make_pipeline(preprocessor, RandomForestClassifier(random_state=123)),
    param_distributions=param_distributions,
    n_jobs=-1,
    scoring=scoring,
    n_iter=20,
    cv=5,
    refit="f1_weighted",
    random_state=123,
)

best_model.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('countvectorizer',
                                                                               CountVectorizer(max_features=400,
                                                                                               stop_words='english'),
                                                                               'project_name'),
                                                                              ('onehotencoder-1',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['city',
                                                                                'province',
                                                                                'proje

In [6]:
# Mean cross-validated score of the best parameter setting for the refit metric ("f1_weighted")
best_model.best_score_

0.64843013063191

In [7]:
# Parameter setting that gave the best score on the refit metric ("f1_weighted")
best_model.best_params_

{'randomforestclassifier__max_features': 'log2',
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__class_weight': 'balanced',
 'columntransformer__countvectorizer__max_features': 500}

In [8]:
# Save the fitted search object (including the refit best pipeline) to disk.
# `pickle` is already imported in the first cell; the `with` block guarantees
# the file handle is flushed and closed even if pickling fails.
with open("../results/final_rf_model.pickle", "wb") as model_file:
    pickle.dump(best_model, model_file)