In [6]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBRFClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from itertools import product

import warnings
warnings.filterwarnings("ignore")

In [2]:
def param_combinations(param_list):
    params, values = tuple(zip(*param_list))
    
    combs = product(*values)
    settings = []
    for c in combs:
        settings.append({})
        for i, p in enumerate(params):
            settings[-1][p] = [c[i]]
            
    return settings
    
    
def test_grid_search(X_train, y_train, X_test, y_test, model, param_combs, cv=3):
    params = param_combinations(param_combs)
    
    print(f"{len(params)} combinations x {cv} = {cv * len(params)}")
    
    rows = []
    for param in tqdm(params):
        
        grid = GridSearchCV(model, param, cv=cv, error_score=0.0, n_jobs=-1)        
        grid.fit(X_train, y_train)
        
        # scoring
        train_score = grid.score(X_train, y_train)
        test_score = grid.score(X_test, y_test)
        
        rows.append({"train_score": train_score, "test_score": test_score, "params": param, "model": grid.best_estimator_})
        
    return pd.DataFrame(rows, columns=["train_score", "test_score", "params", "model"])

# Demonstration
### Load in data

In [3]:
labeled_df = pd.read_csv("../data/user_labeled/final_labeled.csv")
print(labeled_df.label.value_counts())
labeled_df

before    140
after     137
Name: label, dtype: int64


Unnamed: 0,node,label,degree,category_matches_with_source,in_edges,out_edges,shared_neighbors_with_entry_score,centrality,page_rank,adjusted_reciprocity,shortest_path_length_from_entry,shortest_path_length_to_entry,jaccard_similarity,primary_link,similarity_rank
0,Pitch contour,before,0.811917,0.008546,0.478604,0.333313,3.565994e-05,9.155767e-06,4.538325e-07,0.000058,0.008546,0.017093,1.885257e-05,0.008546,1.091970e-02
1,Contrapuntal motion,before,0.799874,0.018602,0.539450,0.260424,3.073561e-04,2.314712e-05,5.486180e-07,0.000022,0.018602,0.018602,2.397493e-04,0.000000,1.833478e-02
2,False relation,before,0.815793,0.008867,0.434498,0.381294,1.562247e-04,1.179012e-05,3.516609e-07,0.000053,0.008867,0.008867,1.260127e-04,0.000000,8.475446e-03
3,Consecutive fifths,before,0.788316,0.006569,0.210218,0.578099,1.626513e-04,1.016740e-05,1.852400e-07,0.000011,0.006569,0.006569,5.940591e-05,0.000000,6.170700e-03
4,Part (music),before,0.812984,0.000000,0.471821,0.341163,1.377333e-04,1.176928e-05,6.551787e-07,0.000037,0.003629,0.003629,9.060537e-05,0.003629,3.244365e-03
5,Common practice period,after,0.778737,0.000000,0.601881,0.176856,1.816769e-04,3.517272e-05,4.421708e-07,0.000021,0.002853,0.002853,1.713103e-04,0.002853,2.543915e-03
6,Monophony,before,0.788887,0.002988,0.576724,0.212163,5.457056e-05,5.152674e-06,4.896373e-07,0.000017,0.002988,0.002988,4.038120e-05,0.000000,2.538365e-03
7,The Well-Tempered Clavier,after,0.816198,0.001520,0.427098,0.389100,5.648655e-05,2.005541e-05,2.275949e-07,0.000037,0.001520,0.001520,3.927449e-05,0.000000,1.148115e-03
8,Post-tonal music theory,after,0.783535,0.000000,0.195884,0.587651,4.425679e-04,1.749719e-05,1.436792e-07,0.000042,0.043530,0.021765,1.498728e-04,0.000000,3.285244e-04
9,Atmosphères,after,0.800124,0.005406,0.259500,0.540624,8.510854e-05,4.176538e-06,8.031867e-08,0.000053,0.010812,0.005406,2.404556e-05,0.000000,3.432770e-03


### Train test split

In [37]:
X = labeled_df.drop([
    "node", 
    "label", 
    "similarity_rank",
    
    # trimming less valuable estimators
#     "shortest_path_length_to_entry",
#     "primary_link",
#     "category_matches_with_source",
#     "shortest_path_length_from_entry",
#     "in_edges",
#     "adjusted_reciprocity",
    
], axis=1)
y = labeled_df.label

cols = X.columns 

# class imbalance
ros = RandomOverSampler()
rus = RandomUnderSampler()
smote = SMOTE()

#################
# UNDERSAMPLING #
#################
# X, y = rus.fit_sample(X, y)

# X = pd.DataFrame(X, columns=cols)
# y = pd.Series(y)

#########
# SMOTE #
#########
# X, y = smote.fit_sample(X, y)
# X = pd.DataFrame(X, columns=cols)
# y = pd.Series(y)

# basic train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

################
# OVERSAMPLING #
################
# X_train, y_train = ros.fit_sample(X_train, y_train)
# X_train = pd.DataFrame(X_train, columns=cols)
# y_train = pd.Series(y_train)

# X_test, y_test = ros.fit_sample(X_test, y_test)
# X_test = pd.DataFrame(X_test, columns=cols)
# y_test = pd.Series(y_test)

#############
# FULL DATA #
#############
# X_train = X
# y_train = y

print("Train Classes:")
print(y_train.value_counts(), "\n")
print("Test Classes:")
print(y_test.value_counts(), "\n")

Train Classes:
before    114
after     107
Name: label, dtype: int64 

Test Classes:
after     30
before    26
Name: label, dtype: int64 



### Test Grid Search

In [38]:
#     "criterion": ["entropy"], # entropy
#     "n_estimators": [100], # 100
#     "max_depth": [2], # 10
#     "min_samples_split": [5], # 5
#     "min_samples_leaf": [10], # 10
#     "max_leaf_nodes": [8], # 8
#     "bootstrap": [True]
        
        
params = [
    ("criterion", ["entropy", "gini"]),
    ("n_estimators", [100, 1000]),
    ("max_depth", [2,5,10]),
    ("min_samples_split", [4,10]),
    ("min_samples_leaf", [10]),
    ("max_leaf_nodes", [4]),
]

model = RandomForestClassifier(random_state=42)

results = test_grid_search(X_train, y_train, X_test, y_test, model=model, param_combs=params)
results = results.sort_values("test_score", ascending=False).reset_index().drop("index", axis=1)
results

24 combinations x 3 = 72


HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




Unnamed: 0,train_score,test_score,params,model
0,0.742081,0.660714,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
1,0.742081,0.660714,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
2,0.742081,0.660714,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
3,0.742081,0.660714,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
4,0.728507,0.642857,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
5,0.728507,0.642857,"{'criterion': ['entropy'], 'n_estimators': [10...","(DecisionTreeClassifier(class_weight=None, cri..."
6,0.719457,0.625,"{'criterion': ['gini'], 'n_estimators': [1000]...","(DecisionTreeClassifier(class_weight=None, cri..."
7,0.719457,0.625,"{'criterion': ['gini'], 'n_estimators': [1000]...","(DecisionTreeClassifier(class_weight=None, cri..."
8,0.719457,0.625,"{'criterion': ['gini'], 'n_estimators': [1000]...","(DecisionTreeClassifier(class_weight=None, cri..."
9,0.696833,0.625,"{'criterion': ['gini'], 'n_estimators': [1000]...","(DecisionTreeClassifier(class_weight=None, cri..."


In [30]:
results.iloc[1].params

{'criterion': ['entropy'],
 'n_estimators': [1000],
 'max_depth': [5],
 'min_samples_split': [10],
 'min_samples_leaf': [10],
 'max_leaf_nodes': [4]}

In [31]:
rf_model = results.iloc[1].model

In [32]:
with open("../models/rf_classifier_v4_732.pkl", "wb") as file:
    pickle.dump(rf_model, file)