<a href="https://colab.research.google.com/github/EmperoR1127/ml_project/blob/emperor/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Env setup

### Import packages

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from scipy.io import arff
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Project root directory
PROJECT_ROOT_DIR = "/content/drive/My Drive/"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "Images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

### Load the dataset

In [0]:
#load the dataset
path = PROJECT_ROOT_DIR + "Data/H-1B_Disclosure_RAW_Data.csv"
df = pd.read_csv(path, encoding='utf-8')
processed_data = df.copy()

  interactivity=interactivity, compiler=compiler, result=result)


#Feature Engineering

###Drop correlated columns and create new columns

In [0]:
processed_data = processed_data.drop(["CASE_NUMBER", "VISA_CLASS", 
                                        "EMPLOYER_NAME", "EMPLOYER_STATE","EMPLOYER_POSTAL_CODE", 
                                        "EMPLOYER_CITY", "EMPLOYER_BUSINESS_DBA", 
                                        "EMPLOYER_COUNTRY", "EMPLOYER_PROVINCE", "EMPLOYER_ADDRESS", 
                                        "EMPLOYER_PHONE", "EMPLOYER_PHONE_EXT", 
                                        "AGENT_ATTORNEY_NAME", "AGENT_ATTORNEY_CITY", "AGENT_ATTORNEY_STATE",
                                        "JOB_TITLE", "SOC_NAME",
                                        "PW_SOURCE", "PW_SOURCE_YEAR", "PW_SOURCE_OTHER", "WAGE_RATE_OF_PAY_FROM",
                                        "WAGE_RATE_OF_PAY_TO", "WAGE_UNIT_OF_PAY",
                                        "WORKSITE_CITY", "WORKSITE_COUNTY", "WORKSITE_POSTAL_CODE", 
                                        "ORIGINAL_CERT_DATE", "PUBLIC_DISCLOSURE_LOCATION"], axis=1)
#format EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['CASE_SUBMITTED'] = pd.to_datetime(processed_data['CASE_SUBMITTED'],infer_datetime_format=True,errors='coerce')
processed_data['DECISION_DATE'] = pd.to_datetime(processed_data['DECISION_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_START_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_START_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_END_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_END_DATE'],infer_datetime_format=True,errors='coerce')
#drop NaT rows because we can't "guess" the specific date
processed_data = processed_data[processed_data.CASE_SUBMITTED != 'NaT']
processed_data = processed_data[processed_data.DECISION_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_START_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_END_DATE != 'NaT']
#add one column as EMP_PERIOD, and drop EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['EMP_PERIOD'] = processed_data['EMPLOYMENT_END_DATE'] - processed_data['EMPLOYMENT_START_DATE']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD']/np.timedelta64(1,'Y')
#train_set = train_set[train_set.EMP_PERIOD != '-']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD'].astype(float)
#add one column as PROCESS_TIME, indicating processing time of visa application
processed_data['PROCESS_TIME'] = processed_data['DECISION_DATE'] - processed_data['CASE_SUBMITTED']
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].map(lambda x: str(x)[:1])
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].astype(float)
processed_data = processed_data.drop(["EMPLOYMENT_START_DATE", "EMPLOYMENT_END_DATE"], axis=1)
processed_data = processed_data.drop(["CASE_SUBMITTED", "DECISION_DATE"], axis=1)

#concatenate the first 2 digit of column SOC_CODE and NAIC_CODE
processed_data['SOC_CODE'] = processed_data['SOC_CODE'].map(lambda x: str(x)[:2])
processed_data['NAICS_CODE'] = processed_data['NAICS_CODE'].map(lambda x: str(x)[:2])
#remove impurity in the column
processed_data = processed_data[processed_data.PW_UNIT_OF_PAY != 'N']
processed_data = processed_data[processed_data.PREVAILING_WAGE != 'N']
#according to google, there are 2080 working hours per year
pw_unit_column = {"Year":1, "Hour":2080, "Month":12, "Week":52, "Bi-Weekly":26}
processed_data['PW_UNIT_OF_PAY'] = processed_data['PW_UNIT_OF_PAY'].replace(pw_unit_column)
#remove ',' in the column value
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('str')
processed_data['PREVAILING_WAGE'] = processed_data.PREVAILING_WAGE.str.replace(',','')
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('float')
#add one column as ANNUAL_SALARY
processed_data['ANNUAL_SALARY'] = processed_data['PREVAILING_WAGE'] * processed_data['PW_UNIT_OF_PAY']
processed_data = processed_data.drop(["PREVAILING_WAGE", "PW_UNIT_OF_PAY"], axis=1)


In [0]:
processed_data.shape

(20327, 21)

### Deal with noise, missing values, numerical and categorical data

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
original_set = processed_data.drop(["CASE_STATUS"], axis=1)
original_labels = processed_data[["CASE_STATUS"]].copy()
original_set_num = original_set.drop(["AGENT_REPRESENTING_EMPLOYER", "SOC_CODE", "NAICS_CODE",
                                "FULL_TIME_POSITION", "PW_WAGE_LEVEL", "H1B_DEPENDENT", "WILLFUL_VIOLATOR",
                                "SUPPORT_H1B", "LABOR_CON_AGREE", "WORKSITE_STATE"], axis=1)
original_set_cat = original_set.drop(["TOTAL_WORKERS","NEW_EMPLOYMENT","CONTINUED_EMPLOYMENT",
                                "CHANGE_PREVIOUS_EMPLOYMENT", "NEW_CONCURRENT_EMP", "CHANGE_EMPLOYER",
                                "AMENDED_PETITION", "EMP_PERIOD", "PROCESS_TIME",
                                "ANNUAL_SALARY"], axis=1)
#build the pipeline
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler()),])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),('cat', OneHotEncoder()),])
full_pipeline = ColumnTransformer([("num", num_pipeline, list(original_set_num)),("cat", cat_pipeline, list(original_set_cat)),])

#prepare the data
original_set = full_pipeline.fit_transform(original_set)

#prepare the target
encoder = LabelEncoder()
original_labels = encoder.fit_transform(original_labels)

  y = column_or_1d(y, warn=True)


### Feature selection

Boruta feature selection method

In [0]:
!pip install Boruta

In [0]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
#Boruta feature selection
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
feat_selector.fit(original_set.toarray(), original_labels)
boruta_set = feat_selector.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(boruta_set.shape[1])
      + " features after applying Boruta feature selection technique")

Dataset with 122 features is reduced to 5 features after applying Boruta feature selection technique


L1-based and tree-based feature selection method

In [0]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
#L1-based feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter = 2000).fit(original_set, original_labels)
l_model = SelectFromModel(lsvc, prefit=True)
l1_set = l_model.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(l1_set.shape[1])
      + " features after applying L1-based feature selection technique")

#tree-based feature selection
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(original_set, original_labels)
tb_model = SelectFromModel(clf, prefit=True)
tr_set = tb_model.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(tr_set.shape[1])
      + " features after applying tree-based feature selection technique")



Dataset with 122 features is reduced to 8 features after applying L1-based feature selection technique
Dataset with 122 features is reduced to 14 features after applying tree-based feature selection technique


### Deal with class imbalance

In [0]:
import joblib
from collections import Counter
#load the datasets
original_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_set.gz')
boruta_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'boruta_set.gz')
l1_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'l1_set.gz')
tr_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'tr_set.gz')
#load the labels
original_labels = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_labels.gz')

Rebalance dataset with oversampling technique

In [0]:
from imblearn.over_sampling import RandomOverSampler

#rebalance the dataset using oversampling (random oversampling)
ros = RandomOverSampler(random_state=42)
ros_boruta_set, ros_boruta_labels = ros.fit_resample(boruta_set, original_labels)
print("Class distribution of oversampling with train_set_boruta " + str(sorted(Counter(ros_boruta_labels).items())))

ros_l1_set, ros_l1_labels = ros.fit_resample(l1_set, original_labels)
print("Class distribution of oversampling with train_set_l1 " + str(sorted(Counter(ros_l1_labels).items())))

ros_tr_set, ros_tr_labels = ros.fit_resample(tr_set, original_labels)
print("Class distribution of oversampling with train_set_tr " + str(sorted(Counter(ros_tr_labels).items())))

Class distribution of oversampling with train_set_boruta [(0, 20058), (1, 20058)]
Class distribution of oversampling with train_set_l1 [(0, 20058), (1, 20058)]
Class distribution of oversampling with train_set_tr [(0, 20058), (1, 20058)]


Rebalance dataset with under-sampling technique

In [0]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#rebalance the dataset using undersampling (nearest neightbours)
renn = RepeatedEditedNearestNeighbours()
renn_boruta_set, renn_boruta_labels = renn.fit_resample(boruta_set, original_labels)
print("Class distribution of undersampling with boruta_set " + str(sorted(Counter(renn_boruta_labels).items())))

renn_l1_set, renn_l1_labels = renn.fit_resample(l1_set, original_labels)
print("Class distribution of undersampling with l1_set " + str(sorted(Counter(renn_l1_labels).items())))

renn_tr_set, renn_tr_labels = renn.fit_resample(tr_set, original_labels)
print("Class distribution of undersampling with tr_set " + str(sorted(Counter(renn_tr_labels).items())))

Class distribution of undersampling with boruta_set [(0, 19484), (1, 269)]
Class distribution of undersampling with l1_set [(0, 19548), (1, 269)]
Class distribution of undersampling with tr_set [(0, 19606), (1, 269)]


Rebalance dataset with balanced sampling technique

In [0]:
from imblearn.combine import SMOTEENN
#rebalance the dataset using balanced sampling (SMOTEENN)
smote_enn = SMOTEENN(random_state=0)
smote_boruta_set, smote_boruta_labels = smote_enn.fit_resample(boruta_set, original_labels)
print("Class distribution of balanced sampling with boruta_set " + str(sorted(Counter(smote_boruta_labels).items())))

smote_l1_set, smote_l1_labels = smote_enn.fit_resample(l1_set, original_labels)
print("Class distribution of balanced sampling with l1_set " + str(sorted(Counter(smote_l1_labels).items())))

smote_tr_set, smote_tr_labels = smote_enn.fit_resample(tr_set, original_labels)
print("Class distribution of balanced sampling with tr_set " + str(sorted(Counter(smote_tr_labels).items())))

Class distribution of balanced sampling with boruta_set [(0, 16097), (1, 15546)]
Class distribution of balanced sampling with l1_set [(0, 16067), (1, 16140)]
Class distribution of balanced sampling with tr_set [(0, 19055), (1, 19077)]


### Split train and test set

In [0]:
#split the dataset into train and test set
from sklearn.model_selection import train_test_split

#1. split original_set and original_labels
original_set_train, original_set_test, original_labels_train, original_labels_test = train_test_split(original_set,original_labels, test_size=0.2, random_state=42)

#2. split ros_boruta_set and ros_boruta_labels
ros_boruta_set_train, ros_boruta_set_test, ros_boruta_labels_train, ros_boruta_labels_test = train_test_split(ros_boruta_set,ros_boruta_labels, test_size=0.2, random_state=42)
#3. split ros_l1_set and ros_l1_labels
ros_l1_set_train, ros_l1_set_test, ros_l1_labels_train, ros_l1_labels_test = train_test_split(ros_l1_set,ros_l1_labels, test_size=0.2, random_state=42)
#4. split ros_tr_set and ros_tr_labels
ros_tr_set_train, ros_tr_set_test, ros_tr_labels_train, ros_tr_labels_test = train_test_split(ros_tr_set,ros_tr_labels, test_size=0.2, random_state=42)

#5. split renn_boruta_set and renn_boruta_labels
renn_boruta_set_train, renn_boruta_set_test, renn_boruta_labels_train, renn_boruta_labels_test = train_test_split(renn_boruta_set,renn_boruta_labels, test_size=0.2, random_state=42)
#6. split renn_l1_set and renn_l1_labels
renn_l1_set_train, renn_l1_set_test, renn_l1_labels_train, renn_l1_labels_test = train_test_split(renn_l1_set,renn_l1_labels, test_size=0.2, random_state=42)
#7. split renn_tr_set and renn_tr_labels
renn_tr_set_train, renn_tr_set_test, renn_tr_labels_train, renn_tr_labels_test = train_test_split(renn_tr_set,renn_tr_labels, test_size=0.2, random_state=42)

#8. split smote_boruta_set and smote_boruta_labels
smote_boruta_set_train, smote_boruta_set_test, smote_boruta_labels_train, smote_boruta_labels_test = train_test_split(smote_boruta_set,smote_boruta_labels, test_size=0.2, random_state=42)
#9. split smote_l1_set and smote_l1_labels
smote_l1_set_train, smote_l1_set_test, smote_l1_labels_train, smote_l1_labels_test = train_test_split(smote_l1_set,smote_l1_labels, test_size=0.2, random_state=42)
#10. split smote_tr_set and smote_tr_labels
smote_tr_set_train, smote_tr_set_test, smote_tr_labels_train, smote_tr_labels_test = train_test_split(smote_tr_set,smote_tr_labels, test_size=0.2, random_state=42)


### Dump the dataset

In [0]:
#store original dataset
joblib.dump(original_set_train, PROJECT_ROOT_DIR + 'Data/' + 'original_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(original_set_test, PROJECT_ROOT_DIR + 'Data/' + 'original_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(original_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'original_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(original_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'original_labels_test' + '.gz', compress=('gzip', 3))

#store ros_boruta dataset
joblib.dump(ros_boruta_set_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_set_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store ros_l1 dataset
joblib.dump(ros_l1_set_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_set_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_labels_test' + '.gz', compress=('gzip', 3))

#store ros_tr dataset
joblib.dump(ros_tr_set_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_set_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_labels_test' + '.gz', compress=('gzip', 3))

#store renn_boruta dataset
joblib.dump(renn_boruta_set_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_set_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store renn_l1 dataset
joblib.dump(renn_l1_set_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_set_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_labels_test' + '.gz', compress=('gzip', 3))

#store renn_tr dataset
joblib.dump(renn_tr_set_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_set_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_labels_test' + '.gz', compress=('gzip', 3))

#store smote_boruta dataset
joblib.dump(smote_boruta_set_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_set_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store smote_l1 dataset
joblib.dump(smote_l1_set_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_set_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_labels_test' + '.gz', compress=('gzip', 3))

#store smote_tr dataset
joblib.dump(smote_tr_set_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_set_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_labels_train, PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_labels_test, PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_labels_test' + '.gz', compress=('gzip', 3))

['/content/drive/My Drive/Data/smote_tr_labels_test.gz']

# Train the models

### Load the dataset

In [0]:
import joblib
#load the 10 datasets
#1. original dataset
original_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_set_train' + '.gz')
original_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_set_test' + '.gz')
original_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_labels_train' + '.gz')
original_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_labels_test' + '.gz')

#2. ros_boruta dataset
ros_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_set_train' + '.gz')
ros_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_set_test' + '.gz')
ros_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_labels_train' + '.gz')
ros_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_boruta_labels_test' + '.gz')

#3. ros_l1 dataset
ros_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_set_train' + '.gz')
ros_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_set_test' + '.gz')
ros_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_labels_train' + '.gz')
ros_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_l1_labels_test' + '.gz')

#4. ros_tr dataset
ros_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_set_train' + '.gz')
ros_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_set_test' + '.gz')
ros_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_labels_train' + '.gz')
ros_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'ros_tr_labels_test' + '.gz')

#5. renn_boruta dataset
renn_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_set_train' + '.gz')
renn_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_set_test' + '.gz')
renn_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_labels_train' + '.gz')
renn_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_boruta_labels_test' + '.gz')

#6. renn_l1 dataset
renn_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_set_train' + '.gz')
renn_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_set_test' + '.gz')
renn_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_labels_train' + '.gz')
renn_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_l1_labels_test' + '.gz')

#7. renn_tr dataset
renn_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_set_train' + '.gz')
renn_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_set_test' + '.gz')
renn_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_labels_train' + '.gz')
renn_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'renn_tr_labels_test' + '.gz')

#8. smote_boruta dataset
smote_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_set_train' + '.gz')
smote_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_set_test' + '.gz')
smote_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_labels_train' + '.gz')
smote_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_boruta_labels_test' + '.gz')

#9. smote_l1 dataset
smote_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_set_train' + '.gz')
smote_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_set_test' + '.gz')
smote_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_labels_train' + '.gz')
smote_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_l1_labels_test' + '.gz')

#10. smote_tr dataset
smote_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_set_train' + '.gz')
smote_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_set_test' + '.gz')
smote_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_labels_train' + '.gz')
smote_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'smote_tr_labels_test' + '.gz')

### Tree models

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
dt_clf = DecisionTreeClassifier()

In [0]:
#train the model against the original dataset
cross_val_score(rf_clf, original_set_train, original_labels_train, scoring = ftwo_scorer, cv=10)
dt_original_set_prediction = cross_val_predict(dt_clf, original_set_test, original_labels_test, cv=10)
dt_f2_original_set = fbeta_score(original_labels_test, dt_original_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against original dataset is " + str(dt_f2_original_set))

The f2 score of decision tree model trained against original dataset is 0.77953991347736


In [0]:
#train the model against the ros_boruta dataset
cross_val_score(dt_clf, ros_boruta_set_train, ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
dt_ros_boruta_set_prediction = cross_val_predict(dt_clf, ros_boruta_set_test, ros_boruta_labels_test, cv=10)
dt_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, dt_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against ros_boruta dataset is " + str(dt_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(dt_clf, ros_l1_set_train, ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
dt_ros_l1_set_prediction = cross_val_predict(dt_clf, ros_l1_set_test, ros_l1_labels_test, cv=10)
dt_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, dt_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against ros_l1 dataset is " + str(dt_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(dt_clf, ros_tr_set_train, ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
dt_ros_tr_set_prediction = cross_val_predict(dt_clf, ros_tr_set_test, ros_tr_labels_test, cv=10)
dt_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, dt_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against ros_tr dataset is " + str(dt_f2_ros_tr_set))

The f2 score of decision tree model trained against ros_boruta dataset is 0.976179554508792
The f2 score of decision tree model trained against ros_l1 dataset is 0.9815265128347616
The f2 score of decision tree model trained against ros_tr dataset is 0.9838922912787686


In [0]:
#train the model against the renn_boruta dataset
cross_val_score(dt_clf, renn_boruta_set_train, renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
dt_renn_boruta_set_prediction = cross_val_predict(dt_clf, renn_boruta_set_test, renn_boruta_labels_test, cv=10)
dt_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, dt_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against renn_boruta dataset is " + str(dt_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(dt_clf, renn_l1_set_train, renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
dt_renn_l1_set_prediction = cross_val_predict(dt_clf, renn_l1_set_test, renn_l1_labels_test, cv=10)
dt_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, dt_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against renn_l1 dataset is " + str(dt_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(dt_clf, renn_tr_set_train, renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
dt_renn_tr_set_prediction = cross_val_predict(dt_clf, renn_tr_set_test, renn_tr_labels_test, cv=10)
dt_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, dt_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against renn_tr dataset is " + str(dt_f2_renn_tr_set))

The f2 score of decision tree model trained against renn_boruta dataset is 0.8441970686534626
The f2 score of decision tree model trained against renn_l1 dataset is 0.7922872018632714
The f2 score of decision tree model trained against renn_tr dataset is 0.8383439490445859


In [0]:
#train the model against the smote_boruta dataset
cross_val_score(dt_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
dt_smote_boruta_set_prediction = cross_val_predict(dt_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
dt_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, dt_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against smote_boruta dataset is " + str(dt_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(dt_clf, smote_l1_set_train, smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
dt_smote_l1_set_prediction = cross_val_predict(dt_clf, smote_l1_set_test, smote_l1_labels_test, cv=10)
dt_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, dt_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against smote_l1 dataset is " + str(dt_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(dt_clf, smote_tr_set_train, smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
dt_smote_tr_set_prediction = cross_val_predict(dt_clf, smote_tr_set_test, smote_tr_labels_test, cv=10)
dt_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, dt_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of decision tree model trained against smote_tr dataset is " + str(dt_f2_smote_tr_set))

The f2 score of decision tree model trained against smote_boruta dataset is 0.986408562401845
The f2 score of decision tree model trained against smote_l1 dataset is 0.978409041272007
The f2 score of decision tree model trained against smote_tr dataset is 0.9824316909102746


### Linear models

In [0]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
le_clf = LinearSVC(random_state=0, tol=1e-5)

In [0]:
#train the model against the original dataset
cross_val_score(le_clf, original_set_train, original_labels_train, scoring = ftwo_scorer, cv=10)
le_original_set_prediction = cross_val_predict(le_clf, original_set_test, original_labels_test, cv=10)
le_f2_original_set = fbeta_score(original_labels_test, le_original_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against original dataset is " + str(le_f2_original_set))



The f2 score of linear model trained against original dataset is 0.6581709251324499


In [0]:
#train the model against the ros_boruta dataset
cross_val_score(le_clf, ros_boruta_set_train, ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
le_ros_boruta_set_prediction = cross_val_predict(le_clf, ros_boruta_set_test, ros_boruta_labels_test, cv=10)
le_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, le_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against ros_boruta dataset is " + str(le_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(le_clf, ros_l1_set_train, ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
le_ros_l1_set_prediction = cross_val_predict(le_clf, ros_l1_set_test, ros_l1_labels_test, cv=10)
le_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, le_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against ros_l1 dataset is " + str(le_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(le_clf, ros_tr_set_train, ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
le_ros_tr_set_prediction = cross_val_predict(le_clf, ros_tr_set_test, ros_tr_labels_test, cv=10)
le_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, le_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against ros_tr dataset is " + str(le_f2_ros_tr_set))



The f2 score of linear model trained against ros_boruta dataset is 0.852887885442009




The f2 score of linear model trained against ros_l1 dataset is 0.8272712983803792




The f2 score of linear model trained against ros_tr dataset is 0.8090063433867388


In [0]:
#train the model against the renn_boruta dataset
cross_val_score(le_clf, renn_boruta_set_train, renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
le_renn_boruta_set_prediction = cross_val_predict(le_clf, renn_boruta_set_test, renn_boruta_labels_test, cv=10)
le_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, le_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against renn_boruta dataset is " + str(le_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(le_clf, renn_l1_set_train, renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
le_renn_l1_set_prediction = cross_val_predict(le_clf, renn_l1_set_test, renn_l1_labels_test, cv=10)
le_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, le_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against renn_l1 dataset is " + str(le_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(le_clf, renn_tr_set_train, renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
le_renn_tr_set_prediction = cross_val_predict(le_clf, renn_tr_set_test, renn_tr_labels_test, cv=10)
le_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, le_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against renn_tr dataset is " + str(le_f2_renn_tr_set))

The f2 score of linear model trained against renn_boruta dataset is 0.7033056967231599
The f2 score of linear model trained against renn_l1 dataset is 0.7206764198087077




The f2 score of linear model trained against renn_tr dataset is 0.6703171257965779


In [0]:
#train the model against the smote_boruta dataset
cross_val_score(le_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
le_smote_boruta_set_prediction = cross_val_predict(le_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
le_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, le_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against smote_boruta dataset is " + str(le_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(le_clf, smote_l1_set_train, smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
le_smote_l1_set_prediction = cross_val_predict(le_clf, smote_l1_set_test, smote_l1_labels_test, cv=10)
le_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, le_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against smote_l1 dataset is " + str(le_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(le_clf, smote_tr_set_train, smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
le_smote_tr_set_prediction = cross_val_predict(le_clf, smote_tr_set_test, smote_tr_labels_test, cv=10)
le_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, le_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of linear model trained against smote_tr dataset is " + str(le_f2_smote_tr_set))



The f2 score of linear model trained against smote_boruta dataset is 0.9221918540715213




The f2 score of linear model trained against smote_l1 dataset is 0.8811728554591265




The f2 score of linear model trained against smote_tr dataset is 0.8312017755773644


### Distance-based models

In [0]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
n_neighbors = 3
knn_clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')

In [0]:
#train the model against the original dataset
cross_val_score(knn_clf, original_set_train, original_labels_train, scoring = ftwo_scorer, cv=10)
knn_original_set_prediction = cross_val_predict(knn_clf, original_set_test, original_labels_test, cv=10)
knn_f2_original_set = fbeta_score(original_labels_test, knn_original_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against original dataset is " + str(knn_f2_original_set))

The f2 score of k-nearest neighbors model trained against original dataset is 0.6809456129280018


In [0]:
#train the model against the ros_boruta dataset
cross_val_score(knn_clf, ros_boruta_set_train, ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
knn_ros_boruta_set_prediction = cross_val_predict(knn_clf, ros_boruta_set_test, ros_boruta_labels_test, cv=10)
knn_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, knn_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against ros_boruta dataset is " + str(knn_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(knn_clf, ros_l1_set_train, ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
knn_ros_l1_set_prediction = cross_val_predict(knn_clf, ros_l1_set_test, ros_l1_labels_test, cv=10)
knn_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, knn_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against ros_l1 dataset is " + str(knn_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(knn_clf, ros_tr_set_train, ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
knn_ros_tr_set_prediction = cross_val_predict(knn_clf, ros_tr_set_test, ros_tr_labels_test, cv=10)
knn_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, knn_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against ros_tr dataset is " + str(knn_f2_ros_tr_set))

The f2 score of k-nearest neighbors model trained against ros_boruta dataset is 0.968956446243028
The f2 score of k-nearest neighbors model trained against ros_l1 dataset is 0.9710471952548965
The f2 score of k-nearest neighbors model trained against ros_tr dataset is 0.9810320847578311


In [0]:
#train the model against the renn_boruta dataset
cross_val_score(knn_clf, renn_boruta_set_train, renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
knn_renn_boruta_set_prediction = cross_val_predict(knn_clf, renn_boruta_set_test, renn_boruta_labels_test, cv=10)
knn_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, knn_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against renn_boruta dataset is " + str(knn_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(knn_clf, renn_l1_set_train, renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
knn_renn_l1_set_prediction = cross_val_predict(knn_clf, renn_l1_set_test, renn_l1_labels_test, cv=10)
knn_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, knn_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against renn_l1 dataset is " + str(knn_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(knn_clf, renn_tr_set_train, renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
knn_renn_tr_set_prediction = cross_val_predict(knn_clf, renn_tr_set_test, renn_tr_labels_test, cv=10)
knn_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, knn_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against renn_tr dataset is " + str(knn_f2_renn_tr_set))

The f2 score of k-nearest neighbors model trained against renn_boruta dataset is 0.8485381511736529
The f2 score of k-nearest neighbors model trained against renn_l1 dataset is 0.8104529917739529
The f2 score of k-nearest neighbors model trained against renn_tr dataset is 0.7969813017289943


In [0]:
#train the model against the smote_boruta dataset
cross_val_score(knn_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
knn_smote_boruta_set_prediction = cross_val_predict(knn_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
knn_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, knn_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against smote_boruta dataset is " + str(knn_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(knn_clf, smote_l1_set_train, smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
knn_smote_l1_set_prediction = cross_val_predict(knn_clf, smote_l1_set_test, smote_l1_labels_test, cv=10)
knn_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, knn_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against smote_l1 dataset is " + str(knn_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(knn_clf, smote_tr_set_train, smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
knn_smote_tr_set_prediction = cross_val_predict(knn_clf, smote_tr_set_test, smote_tr_labels_test, cv=10)
knn_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, knn_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of k-nearest neighbors model trained against smote_tr dataset is " + str(knn_f2_smote_tr_set))

The f2 score of k-nearest neighbors model trained against smote_boruta dataset is 0.9895301834139694
The f2 score of k-nearest neighbors model trained against smote_l1 dataset is 0.9818161987013381
The f2 score of k-nearest neighbors model trained against smote_tr dataset is 0.9825614432862246


In [0]:
import datetime
import tracemalloc
print("Training begin at: " + str(datetime.datetime.now()))
tracemalloc.start()
cross_val_score(knn_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
snapshot = tracemalloc.take_snapshot()
print("Training end at: " + str(datetime.datetime.now()))
print("Predicting begin at: " + str(datetime.datetime.now()))
knn_smote_boruta_set_prediction = cross_val_predict(knn_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
print("Predicting end at: " + str(datetime.datetime.now()))

top_stats = snapshot.statistics('lineno')
total_size = 0
for stat in top_stats:
    total_size = total_size + stat.size
print("Training memory consumption is " + str(total_size / (1024 * 1024)) + "MB")

Training begin at: 2019-11-20 22:32:54.077642
Training end at: 2019-11-20 22:33:08.427120
Predicting begin at: 2019-11-20 22:33:08.427740
Predicting end at: 2019-11-20 22:33:09.497922
Training memory consumption is 120.6889295578003MB


### Probabilistic models

In [0]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
gnb_clf = GaussianNB()

In [0]:
#train the model against the original dataset
cross_val_score(gnb_clf, original_set_train.toarray(), original_labels_train, scoring = ftwo_scorer, cv=10)
gnb_original_set_prediction = cross_val_predict(gnb_clf, original_set_test.toarray(), original_labels_test, cv=10)
gnb_f2_original_set = fbeta_score(original_labels_test, gnb_original_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against original dataset is " + str(gnb_f2_original_set))

The f2 score of Naive Bayes model trained against original dataset is 0.23766671705907808


In [0]:
#train the model against the ros_boruta dataset
cross_val_score(gnb_clf, ros_boruta_set_train.toarray(), ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
gnb_ros_boruta_set_prediction = cross_val_predict(gnb_clf, ros_boruta_set_test.toarray(), ros_boruta_labels_test, cv=10)
gnb_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, gnb_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against ros_boruta dataset is " + str(gnb_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(gnb_clf, ros_l1_set_train.toarray(), ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
gnb_ros_l1_set_prediction = cross_val_predict(gnb_clf, ros_l1_set_test.toarray(), ros_l1_labels_test, cv=10)
gnb_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, gnb_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against ros_l1 dataset is " + str(gnb_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(gnb_clf, ros_tr_set_train.toarray(), ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
gnb_ros_tr_set_prediction = cross_val_predict(gnb_clf, ros_tr_set_test.toarray(), ros_tr_labels_test, cv=10)
gnb_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, gnb_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against ros_tr dataset is " + str(gnb_f2_ros_tr_set))

The f2 score of Naive Bayes model trained against ros_boruta dataset is 0.7014121119024469
The f2 score of Naive Bayes model trained against ros_l1 dataset is 0.7865482815380613
The f2 score of Naive Bayes model trained against ros_tr dataset is 0.6317521904610978


In [0]:
#train the model against the renn_boruta dataset
cross_val_score(gnb_clf, renn_boruta_set_train.toarray(), renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
gnb_renn_boruta_set_prediction = cross_val_predict(gnb_clf, renn_boruta_set_test.toarray(), renn_boruta_labels_test, cv=10)
gnb_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, gnb_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against renn_boruta dataset is " + str(gnb_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(gnb_clf, renn_l1_set_train.toarray(), renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
gnb_renn_l1_set_prediction = cross_val_predict(gnb_clf, renn_l1_set_test.toarray(), renn_l1_labels_test, cv=10)
gnb_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, gnb_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against renn_l1 dataset is " + str(gnb_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(gnb_clf, renn_tr_set_train.toarray(), renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
gnb_renn_tr_set_prediction = cross_val_predict(gnb_clf, renn_tr_set_test.toarray(), renn_tr_labels_test, cv=10)
gnb_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, gnb_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against renn_tr dataset is " + str(gnb_f2_renn_tr_set))

The f2 score of Naive Bayes model trained against renn_boruta dataset is 0.8060243041711006
The f2 score of Naive Bayes model trained against renn_l1 dataset is 0.5667649710554388
The f2 score of Naive Bayes model trained against renn_tr dataset is 0.4467616678051182


In [0]:
#train the model against the smote_boruta dataset
cross_val_score(gnb_clf, smote_boruta_set_train.toarray(), smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
gnb_smote_boruta_set_prediction = cross_val_predict(gnb_clf, smote_boruta_set_test.toarray(), smote_boruta_labels_test, cv=10)
gnb_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, gnb_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against smote_boruta dataset is " + str(gnb_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(gnb_clf, smote_l1_set_train.toarray(), smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
gnb_smote_l1_set_prediction = cross_val_predict(gnb_clf, smote_l1_set_test.toarray(), smote_l1_labels_test, cv=10)
gnb_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, gnb_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against smote_l1 dataset is " + str(gnb_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(gnb_clf, smote_tr_set_train.toarray(), smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
gnb_smote_tr_set_prediction = cross_val_predict(gnb_clf, smote_tr_set_test.toarray(), smote_tr_labels_test, cv=10)
gnb_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, gnb_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of Naive Bayes model trained against smote_tr dataset is " + str(gnb_f2_smote_tr_set))

The f2 score of Naive Bayes model trained against smote_boruta dataset is 0.8020823538407438
The f2 score of Naive Bayes model trained against smote_l1 dataset is 0.8609279533087664
The f2 score of Naive Bayes model trained against smote_tr dataset is 0.7185329679524726


### Ensemble models

####Bagging

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)

In [0]:
#train the model against the original dataset
cross_val_score(rf_clf, original_set_train, original_labels_train, scoring = ftwo_scorer, cv=10)
rf_original_set_prediction = cross_val_predict(rf_clf, original_set_test, original_labels_test, cv=10)
rf_f2_original_set = fbeta_score(original_labels_test, rf_original_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against original dataset is " + str(rf_f2_original_set))

The f2 score of randon forest model trained against original dataset is 0.7599427953440065


In [0]:
#train the model against the ros_boruta dataset
cross_val_score(rf_clf, ros_boruta_set_train, ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
rf_ros_boruta_set_prediction = cross_val_predict(rf_clf, ros_boruta_set_test, ros_boruta_labels_test, cv=10)
rf_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, rf_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against ros_boruta dataset is " + str(rf_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(rf_clf, ros_l1_set_train, ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
rf_ros_l1_set_prediction = cross_val_predict(rf_clf, ros_l1_set_test, ros_l1_labels_test, cv=10)
rf_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, rf_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against ros_l1 dataset is " + str(rf_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(rf_clf, ros_tr_set_train, ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
rf_ros_tr_set_prediction = cross_val_predict(rf_clf, ros_tr_set_test, ros_tr_labels_test, cv=10)
rf_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, rf_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against ros_tr dataset is " + str(rf_f2_ros_tr_set))

The f2 score of randon forest model trained against ros_boruta dataset is 0.976179554508792
The f2 score of randon forest model trained against ros_l1 dataset is 0.9798965787424994
The f2 score of randon forest model trained against ros_tr dataset is 0.9912748662831834


In [0]:
#train the model against the renn_boruta dataset
cross_val_score(rf_clf, renn_boruta_set_train, renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
rf_renn_boruta_set_prediction = cross_val_predict(rf_clf, renn_boruta_set_test, renn_boruta_labels_test, cv=10)
rf_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, rf_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against renn_boruta dataset is " + str(rf_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(rf_clf, renn_l1_set_train, renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
rf_renn_l1_set_prediction = cross_val_predict(rf_clf, renn_l1_set_test, renn_l1_labels_test, cv=10)
rf_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, rf_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against renn_l1 dataset is " + str(rf_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(rf_clf, renn_tr_set_train, renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
rf_renn_tr_set_prediction = cross_val_predict(rf_clf, renn_tr_set_test, renn_tr_labels_test, cv=10)
rf_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, rf_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against renn_tr dataset is " + str(rf_f2_renn_tr_set))

The f2 score of randon forest model trained against renn_boruta dataset is 0.8421131792491612
The f2 score of randon forest model trained against renn_l1 dataset is 0.8192335206949412
The f2 score of randon forest model trained against renn_tr dataset is 0.8369671432678313


In [0]:
#train the model against the smote_boruta dataset
cross_val_score(rf_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
rf_smote_boruta_set_prediction = cross_val_predict(rf_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
rf_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, rf_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against smote_boruta dataset is " + str(rf_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(rf_clf, smote_l1_set_train, smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
rf_smote_l1_set_prediction = cross_val_predict(rf_clf, smote_l1_set_test, smote_l1_labels_test, cv=10)
rf_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, rf_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against smote_l1 dataset is " + str(rf_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(rf_clf, smote_tr_set_train, smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
rf_smote_tr_set_prediction = cross_val_predict(rf_clf, smote_tr_set_test, smote_tr_labels_test, cv=10)
rf_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, rf_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of randon forest model trained against smote_tr dataset is " + str(rf_f2_smote_tr_set))

The f2 score of randon forest model trained against smote_boruta dataset is 0.9890960355516127
The f2 score of randon forest model trained against smote_l1 dataset is 0.985866061012983
The f2 score of randon forest model trained against smote_tr dataset is 0.9925261713409137


#### Boosting

In [0]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
ad_clf = AdaBoostClassifier(n_estimators=100)

In [9]:
#train the model against the original dataset
cross_val_score(ad_clf, original_set_train, original_labels_train, scoring = ftwo_scorer, cv=10)
ad_original_set_prediction = cross_val_predict(ad_clf, original_set_test, original_labels_test, cv=10)
ad_f2_original_set = fbeta_score(original_labels_test, ad_original_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against original dataset is " + str(ad_f2_original_set))

The f2 score of adaboost model trained against original dataset is 0.7943356991094435


In [10]:
#train the model against the ros_boruta dataset
cross_val_score(ad_clf, ros_boruta_set_train, ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
ad_ros_boruta_set_prediction = cross_val_predict(ad_clf, ros_boruta_set_test, ros_boruta_labels_test, cv=10)
ad_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, ad_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against ros_boruta dataset is " + str(ad_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(ad_clf, ros_l1_set_train, ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
ad_ros_l1_set_prediction = cross_val_predict(ad_clf, ros_l1_set_test, ros_l1_labels_test, cv=10)
ad_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, ad_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against ros_l1 dataset is " + str(ad_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(ad_clf, ros_tr_set_train, ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
ad_ros_tr_set_prediction = cross_val_predict(ad_clf, ros_tr_set_test, ros_tr_labels_test, cv=10)
ad_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, ad_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against ros_tr dataset is " + str(ad_f2_ros_tr_set))

The f2 score of adaboost model trained against ros_boruta dataset is 0.8964252476083332
The f2 score of adaboost model trained against ros_l1 dataset is 0.8854252756493926
The f2 score of adaboost model trained against ros_tr dataset is 0.9003622193190122


In [11]:
#train the model against the renn_boruta dataset
cross_val_score(ad_clf, renn_boruta_set_train, renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
ad_renn_boruta_set_prediction = cross_val_predict(ad_clf, renn_boruta_set_test, renn_boruta_labels_test, cv=10)
ad_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, ad_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against renn_boruta dataset is " + str(ad_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(ad_clf, renn_l1_set_train, renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
ad_renn_l1_set_prediction = cross_val_predict(ad_clf, renn_l1_set_test, renn_l1_labels_test, cv=10)
ad_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, ad_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against renn_l1 dataset is " + str(ad_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(ad_clf, renn_tr_set_train, renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
ad_renn_tr_set_prediction = cross_val_predict(ad_clf, renn_tr_set_test, renn_tr_labels_test, cv=10)
ad_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, ad_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against renn_tr dataset is " + str(ad_f2_renn_tr_set))

The f2 score of adaboost model trained against renn_boruta dataset is 0.8289773823343272
The f2 score of adaboost model trained against renn_l1 dataset is 0.8279442911068181
The f2 score of adaboost model trained against renn_tr dataset is 0.830957032364469


In [12]:
#train the model against the smote_boruta dataset
cross_val_score(ad_clf, smote_boruta_set_train, smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
ad_smote_boruta_set_prediction = cross_val_predict(ad_clf, smote_boruta_set_test, smote_boruta_labels_test, cv=10)
ad_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, ad_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against smote_boruta dataset is " + str(ad_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(ad_clf, smote_l1_set_train, smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
ad_smote_l1_set_prediction = cross_val_predict(ad_clf, smote_l1_set_test, smote_l1_labels_test, cv=10)
ad_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, ad_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against smote_l1 dataset is " + str(ad_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(ad_clf, smote_tr_set_train, smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
ad_smote_tr_set_prediction = cross_val_predict(ad_clf, smote_tr_set_test, smote_tr_labels_test, cv=10)
ad_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, ad_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of adaboost model trained against smote_tr dataset is " + str(ad_f2_smote_tr_set))

The f2 score of adaboost model trained against smote_boruta dataset is 0.9733680367125892
The f2 score of adaboost model trained against smote_l1 dataset is 0.9617912011900163
The f2 score of adaboost model trained against smote_tr dataset is 0.9689880444294872


#### Hybrid

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
dt_clf = DecisionTreeClassifier()
n_neighbors = 3
knn_clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
gnb_clf = GaussianNB()
vt_clf = VotingClassifier(estimators=[('dt', dt_clf), ('knn', knn_clf), ('gnb', gnb_clf)], voting='hard')

In [22]:
#train the model against the original dataset
cross_val_score(vt_clf, original_set_train.toarray(), original_labels_train, scoring = ftwo_scorer, cv=10)
vt_original_set_prediction = cross_val_predict(vt_clf, original_set_test.toarray(), original_labels_test, cv=10)
vt_f2_original_set = fbeta_score(original_labels_test, vt_original_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against original dataset is " + str(vt_f2_original_set))

The f2 score of hard voting model trained against original dataset is 0.703536664503569


In [24]:
#train the model against the ros_boruta dataset
cross_val_score(vt_clf, ros_boruta_set_train.toarray(), ros_boruta_labels_train, scoring = ftwo_scorer, cv=10)
vt_ros_boruta_set_prediction = cross_val_predict(vt_clf, ros_boruta_set_test.toarray(), ros_boruta_labels_test, cv=10)
vt_f2_ros_boruta_set = fbeta_score(ros_boruta_labels_test, vt_ros_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against ros_boruta dataset is " + str(vt_f2_ros_boruta_set))

#train the model against the ros_l1 dataset
cross_val_score(vt_clf, ros_l1_set_train.toarray(), ros_l1_labels_train, scoring = ftwo_scorer, cv=10)
vt_ros_l1_set_prediction = cross_val_predict(vt_clf, ros_l1_set_test.toarray(), ros_l1_labels_test, cv=10)
vt_f2_ros_l1_set = fbeta_score(ros_l1_labels_test, vt_ros_l1_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against ros_l1 dataset is " + str(vt_f2_ros_l1_set))

#train the model against the ros_tr dataset
cross_val_score(ad_clf, ros_tr_set_train.toarray(), ros_tr_labels_train, scoring = ftwo_scorer, cv=10)
ad_ros_tr_set_prediction = cross_val_predict(ad_clf, ros_tr_set_test.toarray(), ros_tr_labels_test, cv=10)
ad_f2_ros_tr_set = fbeta_score(ros_tr_labels_test, ad_ros_tr_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against ros_tr dataset is " + str(ad_f2_ros_tr_set))

The f2 score of hard voting model trained against ros_boruta dataset is 0.9781882483259291
The f2 score of hard voting model trained against ros_l1 dataset is 0.9822946761265869
The f2 score of hard voting model trained against ros_tr dataset is 0.9003622193190122


In [25]:
#train the model against the renn_boruta dataset
cross_val_score(vt_clf, renn_boruta_set_train.toarray(), renn_boruta_labels_train, scoring = ftwo_scorer, cv=10)
vt_renn_boruta_set_prediction = cross_val_predict(vt_clf, renn_boruta_set_test.toarray(), renn_boruta_labels_test, cv=10)
vt_f2_renn_boruta_set = fbeta_score(renn_boruta_labels_test, vt_renn_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against renn_boruta dataset is " + str(vt_f2_renn_boruta_set))

#train the model against the renn_l1 dataset
cross_val_score(vt_clf, renn_l1_set_train.toarray(), renn_l1_labels_train, scoring = ftwo_scorer, cv=10)
vt_renn_l1_set_prediction = cross_val_predict(vt_clf, renn_l1_set_test.toarray(), renn_l1_labels_test, cv=10)
vt_f2_renn_l1_set = fbeta_score(renn_l1_labels_test, vt_renn_l1_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against renn_l1 dataset is " + str(vt_f2_renn_l1_set))

#train the model against the renn_tr dataset
cross_val_score(vt_clf, renn_tr_set_train.toarray(), renn_tr_labels_train, scoring = ftwo_scorer, cv=10)
vt_renn_tr_set_prediction = cross_val_predict(vt_clf, renn_tr_set_test.toarray(), renn_tr_labels_test, cv=10)
vt_f2_renn_tr_set = fbeta_score(renn_tr_labels_test, vt_renn_tr_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against renn_tr dataset is " + str(vt_f2_renn_tr_set))

The f2 score of hard voting model trained against renn_boruta dataset is 0.8498979427127291
The f2 score of hard voting model trained against renn_l1 dataset is 0.7986750930652609
The f2 score of hard voting model trained against renn_tr dataset is 0.8354611330013411


In [26]:
#train the model against the smote_boruta dataset
cross_val_score(vt_clf, smote_boruta_set_train.toarray(), smote_boruta_labels_train, scoring = ftwo_scorer, cv=10)
vt_smote_boruta_set_prediction = cross_val_predict(vt_clf, smote_boruta_set_test.toarray(), smote_boruta_labels_test, cv=10)
vt_f2_smote_boruta_set = fbeta_score(smote_boruta_labels_test, vt_smote_boruta_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against smote_boruta dataset is " + str(vt_f2_smote_boruta_set))

#train the model against the smote_l1 dataset
cross_val_score(vt_clf, smote_l1_set_train.toarray(), smote_l1_labels_train, scoring = ftwo_scorer, cv=10)
vt_smote_l1_set_prediction = cross_val_predict(vt_clf, smote_l1_set_test.toarray(), smote_l1_labels_test, cv=10)
vt_f2_smote_l1_set = fbeta_score(smote_l1_labels_test, vt_smote_l1_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against smote_l1 dataset is " + str(vt_f2_smote_l1_set))

#train the model against the smote_tr dataset
cross_val_score(vt_clf, smote_tr_set_train.toarray(), smote_tr_labels_train, scoring = ftwo_scorer, cv=10)
vt_smote_tr_set_prediction = cross_val_predict(vt_clf, smote_tr_set_test.toarray(), smote_tr_labels_test, cv=10)
vt_f2_smote_tr_set = fbeta_score(smote_tr_labels_test, vt_smote_tr_set_prediction, average='macro', beta=2)
print("The f2 score of hard voting model trained against smote_tr dataset is " + str(vt_f2_smote_tr_set))

The f2 score of hard voting model trained against smote_boruta dataset is 0.9886774955091553
The f2 score of hard voting model trained against smote_l1 dataset is 0.9786848461628974
The f2 score of hard voting model trained against smote_tr dataset is 0.9822599570672526
