In [1]:
from src.utils import load_dataset, get_path
from src.adapter import adapt_to_dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import xgboost as xgb
import collections
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


In [2]:
data_train_exaction = load_dataset(get_path('data', 'npy', 'audio_data_train.npy'))
data_test_exaction = load_dataset(get_path('data', 'npy', 'audio_data_test.npy'))

In [3]:
data_train = pd.read_csv('data/metadata_train_challenge.csv')
data_test = pd.read_csv('data/metadata_private_test.csv')
data_train = adapt_to_dataset(data_train, data_train_exaction)
data_test = adapt_to_dataset(data_test, data_test_exaction)

In [5]:
def parse_audio(x):
    return x.flatten('F')[:x.shape[0]] 
    
X_train = [i.audio_data.features.allfeat for i in data_train.samples]
X_train = [parse_audio(x) for x in X_train]
y_train = [i.assessment_result for i in data_train.samples]

X_test = [i.audio_data.features.allfeat for i in data_test.samples]
X_test = [parse_audio(x) for x in X_test]
y_test = [i.assessment_result for i in data_test.samples]


In [5]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [6]:
# IMBALANCE DATA PROCESSING 

print('Before process: ', collections.Counter(y_train))

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('After process: ', collections.Counter(y_train_res))


Before process:  Counter({0: 737, 1: 462})
After process:  Counter({0: 737, 1: 737})


In [15]:
# KNN MODEL
grid_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 17, 40],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

model = GridSearchCV(KNeighborsClassifier(), grid_params, cv=5, n_jobs=-1)
model.fit(X_train_res, y_train_res)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9, 11, 15, 17, 40],
                         'weights': ['uniform', 'distance']})

In [25]:
y_predict = model.predict(X_test)
auc_c = roc_auc_score(y_test,model.predict_proba(X_test)[:, 1])
print("KNN Model - ROC AUC:",auc_c)
print("KNN Model - Accuracy:",metrics.accuracy_score(y_test, y_predict))

KNN Model - ROC AUC: 0.5825863386838996
KNN Model - Accuracy: 0.5625


In [None]:
# XGBOOSTING
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

X_train_res, y_train_res = np.array(X_train_res), np.array(y_train_res)
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)
grid_search.fit(X_train_res, y_train_res)
model = grid_search.best_estimator_

In [None]:
y_predict = model.predict(X_test)
auc_c = roc_auc_score(y_test,model.predict_proba(X_test)[:, 1])
print("XGBoosting Model - ROC AUC:",auc_c)
print("XGBoosting Model - Accuracy:",metrics.accuracy_score(y_test, y_predict))

In [7]:
## Random Forest 

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train_res, y_train_res)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [8]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [9]:
model = RandomForestClassifier(n_estimators = 1400, min_samples_split = 2, min_samples_leaf = 1, max_features = 'auto', max_depth = 100, bootstrap = True)
model.fit(X_train_res, y_train_res)

RandomForestClassifier(max_depth=100, n_estimators=1400)

In [10]:
y_predict = model.predict(X_test)
# auc_c = roc_auc_score(y_test,model.predict_proba(X_test)[:, 1])
# print("RandomForest Model - ROC AUC:",auc_c)
# print("RandomForest Model - Accuracy:",metrics.accuracy_score(y_test, y_predict))

In [12]:
###### --------------------------- WRITE
###### --------------------------- SUBMIT - FILE
resutl = pd.read_csv('data/private_test_sample_submissions.csv')
resutl['assessment_result'] = y_predict
resutl.to_csv('results.csv', index = False)