<a href="https://colab.research.google.com/github/EmperoR1127/ml_project/blob/emperor/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Env setup

### Import packages

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from scipy.io import arff
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Project root directory
PROJECT_ROOT_DIR = "/content/drive/My Drive/"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "Images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

### Load the dataset

In [20]:
#load the dataset
path = PROJECT_ROOT_DIR + "Data/H-1B_Disclosure_RAW_Data.csv"
df = pd.read_csv(path, encoding='utf-8')
processed_data = df.copy()

  interactivity=interactivity, compiler=compiler, result=result)


#Feature Engineering

###Drop correlated columns and create new columns

In [0]:
processed_data = processed_data.drop(["CASE_NUMBER", "VISA_CLASS", 
                                        "EMPLOYER_NAME", "EMPLOYER_STATE","EMPLOYER_POSTAL_CODE", 
                                        "EMPLOYER_CITY", "EMPLOYER_BUSINESS_DBA", 
                                        "EMPLOYER_COUNTRY", "EMPLOYER_PROVINCE", "EMPLOYER_ADDRESS", 
                                        "EMPLOYER_PHONE", "EMPLOYER_PHONE_EXT", 
                                        "AGENT_ATTORNEY_NAME", "AGENT_ATTORNEY_CITY", "AGENT_ATTORNEY_STATE",
                                        "JOB_TITLE", "SOC_NAME",
                                        "PW_SOURCE", "PW_SOURCE_YEAR", "PW_SOURCE_OTHER", "WAGE_RATE_OF_PAY_FROM",
                                        "WAGE_RATE_OF_PAY_TO", "WAGE_UNIT_OF_PAY",
                                        "WORKSITE_CITY", "WORKSITE_COUNTY", "WORKSITE_POSTAL_CODE", 
                                        "ORIGINAL_CERT_DATE", "PUBLIC_DISCLOSURE_LOCATION"], axis=1)
#format EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['CASE_SUBMITTED'] = pd.to_datetime(processed_data['CASE_SUBMITTED'],infer_datetime_format=True,errors='coerce')
processed_data['DECISION_DATE'] = pd.to_datetime(processed_data['DECISION_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_START_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_START_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_END_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_END_DATE'],infer_datetime_format=True,errors='coerce')
#drop NaT rows because we can't "guess" the specific date
processed_data = processed_data[processed_data.CASE_SUBMITTED != 'NaT']
processed_data = processed_data[processed_data.DECISION_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_START_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_END_DATE != 'NaT']
#add one column as EMP_PERIOD, and drop EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['EMP_PERIOD'] = processed_data['EMPLOYMENT_END_DATE'] - processed_data['EMPLOYMENT_START_DATE']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD']/np.timedelta64(1,'Y')
#train_set = train_set[train_set.EMP_PERIOD != '-']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD'].astype(float)
#add one column as PROCESS_TIME, indicating processing time of visa application
processed_data['PROCESS_TIME'] = processed_data['DECISION_DATE'] - processed_data['CASE_SUBMITTED']
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].map(lambda x: str(x)[:1])
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].astype(float)
processed_data = processed_data.drop(["EMPLOYMENT_START_DATE", "EMPLOYMENT_END_DATE"], axis=1)
processed_data = processed_data.drop(["CASE_SUBMITTED", "DECISION_DATE"], axis=1)

#concatenate the first 2 digit of column SOC_CODE and NAIC_CODE
processed_data['SOC_CODE'] = processed_data['SOC_CODE'].map(lambda x: str(x)[:2])
processed_data['NAICS_CODE'] = processed_data['NAICS_CODE'].map(lambda x: str(x)[:2])
#remove impurity in the column
processed_data = processed_data[processed_data.PW_UNIT_OF_PAY != 'N']
processed_data = processed_data[processed_data.PREVAILING_WAGE != 'N']
#according to google, there are 2080 working hours per year
pw_unit_column = {"Year":1, "Hour":2080, "Month":12, "Week":52, "Bi-Weekly":26}
processed_data['PW_UNIT_OF_PAY'] = processed_data['PW_UNIT_OF_PAY'].replace(pw_unit_column)
#remove ',' in the column value
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('str')
processed_data['PREVAILING_WAGE'] = processed_data.PREVAILING_WAGE.str.replace(',','')
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('float')
#add one column as ANNUAL_SALARY
processed_data['ANNUAL_SALARY'] = processed_data['PREVAILING_WAGE'] * processed_data['PW_UNIT_OF_PAY']
processed_data = processed_data.drop(["PREVAILING_WAGE", "PW_UNIT_OF_PAY"], axis=1)


### Deal with noise, missing values, numerical and categorical data

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
original_set = processed_data.drop(["CASE_STATUS"], axis=1)
original_labels = processed_data[["CASE_STATUS"]].copy()
original_set_num = original_set.drop(["AGENT_REPRESENTING_EMPLOYER", "SOC_CODE", "NAICS_CODE",
                                "FULL_TIME_POSITION", "PW_WAGE_LEVEL", "H1B_DEPENDENT", "WILLFUL_VIOLATOR",
                                "SUPPORT_H1B", "LABOR_CON_AGREE", "WORKSITE_STATE"], axis=1)
original_set_cat = original_set.drop(["TOTAL_WORKERS","NEW_EMPLOYMENT","CONTINUED_EMPLOYMENT",
                                "CHANGE_PREVIOUS_EMPLOYMENT", "NEW_CONCURRENT_EMP", "CHANGE_EMPLOYER",
                                "AMENDED_PETITION", "EMP_PERIOD", "PROCESS_TIME",
                                "ANNUAL_SALARY"], axis=1)
#build the pipeline
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler()),])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),('cat', OneHotEncoder()),])
full_pipeline = ColumnTransformer([("num", num_pipeline, list(original_set_num)),("cat", cat_pipeline, list(original_set_cat)),])

#prepare the data
original_set = full_pipeline.fit_transform(original_set)

#prepare the target
encoder = LabelEncoder()
original_labels = encoder.fit_transform(original_labels)

#get columns after encoding
def get_feature_names(columnTransformer):
    output_features = []
    for name, pipe, features in columnTransformer.transformers_:
        if name!='remainder':
            for i in pipe:
                trans_features = []
                if hasattr(i,'categories_'):
                    trans_features.extend(i.get_feature_names(features))
                else:
                    trans_features = features
            output_features.extend(trans_features)
    return output_features
column_names = get_feature_names(full_pipeline)

  y = column_or_1d(y, warn=True)


In [23]:
original_set.shape

(20327, 122)

### Feature selection

In [0]:
#get feature names after feature selection
def get_feature_names(X, col = column_names):
  try:
    mask = X.get_support() #list of booleans
  except AttributeError:
    mask = X.support_  #Boruta has different attributes from scikit-learn API
  new_features = [] # The list of your K best features
  for bool, feature in zip(mask, col):
    if bool:
      new_features.append(feature)
  return new_features

Boruta feature selection method

In [25]:
!pip install Boruta

Collecting Boruta
[?25l  Downloading https://files.pythonhosted.org/packages/b2/11/583f4eac99d802c79af9217e1eff56027742a69e6c866b295cce6a5a8fc2/Boruta-0.3-py3-none-any.whl (56kB)
[K     |█████▉                          | 10kB 16.1MB/s eta 0:00:01[K     |███████████▋                    | 20kB 1.7MB/s eta 0:00:01[K     |█████████████████▍              | 30kB 2.5MB/s eta 0:00:01[K     |███████████████████████▏        | 40kB 1.7MB/s eta 0:00:01[K     |█████████████████████████████   | 51kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.9MB/s 
Installing collected packages: Boruta
Successfully installed Boruta-0.3


In [26]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
#Boruta feature selection
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
feat_selector.fit(original_set.toarray(), original_labels)
boruta_set = feat_selector.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(boruta_set.shape[1])
      + " features after applying Boruta feature selection technique")
print(get_feature_names(feat_selector))

Dataset with 122 features is reduced to 5 features after applying Boruta feature selection technique
['PROCESS_TIME', 'ANNUAL_SALARY', 'SOC_CODE_15', 'H1B_DEPENDENT_N', 'H1B_DEPENDENT_Y']


L1-based and tree-based feature selection method

In [27]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
#L1-based feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter = 2000).fit(original_set, original_labels)
l_model = SelectFromModel(lsvc, prefit=True)
l1_set = l_model.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(l1_set.shape[1])
      + " features after applying L1-based feature selection technique")
print(get_feature_names(l_model))

#tree-based feature selection
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(original_set, original_labels)
tb_model = SelectFromModel(clf, prefit=True)
tr_set = tb_model.transform(original_set)
print("Dataset with " + str(original_set.shape[1]) + " features is reduced to " + str(tr_set.shape[1])
      + " features after applying tree-based feature selection technique")
print(get_feature_names(tb_model))



Dataset with 122 features is reduced to 9 features after applying L1-based feature selection technique
['PROCESS_TIME', 'ANNUAL_SALARY', 'AGENT_REPRESENTING_EMPLOYER_Y', 'SOC_CODE_15', 'NAICS_CODE_54', 'FULL_TIME_POSITION_Y', 'H1B_DEPENDENT_Y', 'WILLFUL_VIOLATOR_N', 'SUPPORT_H1B_Y']
Dataset with 122 features is reduced to 16 features after applying tree-based feature selection technique
['TOTAL_WORKERS', 'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT', 'CHANGE_PREVIOUS_EMPLOYMENT', 'CHANGE_EMPLOYER', 'AMENDED_PETITION', 'EMP_PERIOD', 'PROCESS_TIME', 'ANNUAL_SALARY', 'AGENT_REPRESENTING_EMPLOYER_N', 'AGENT_REPRESENTING_EMPLOYER_Y', 'PW_WAGE_LEVEL_Level I', 'PW_WAGE_LEVEL_Level II', 'PW_WAGE_LEVEL_Level III', 'WORKSITE_STATE_CA', 'WORKSITE_STATE_NY']


### Deal with class imbalance

In [0]:
import joblib
from collections import Counter
#load the datasets
original_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_set.gz')
boruta_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'boruta_set.gz')
l1_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'l1_set.gz')
tr_set = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'tr_set.gz')
#load the labels
original_labels = joblib.load(PROJECT_ROOT_DIR + 'Data/' + 'original_labels.gz')

Rebalance dataset with oversampling technique

In [29]:
from imblearn.over_sampling import RandomOverSampler

#rebalance the dataset using oversampling (random oversampling)
ros = RandomOverSampler(random_state=42)
ros_boruta_set, ros_boruta_labels = ros.fit_resample(boruta_set, original_labels)
print("Class distribution of oversampling with train_set_boruta " + str(sorted(Counter(ros_boruta_labels).items())))

ros_l1_set, ros_l1_labels = ros.fit_resample(l1_set, original_labels)
print("Class distribution of oversampling with train_set_l1 " + str(sorted(Counter(ros_l1_labels).items())))

ros_tr_set, ros_tr_labels = ros.fit_resample(tr_set, original_labels)
print("Class distribution of oversampling with train_set_tr " + str(sorted(Counter(ros_tr_labels).items())))

Class distribution of oversampling with train_set_boruta [(0, 20058), (1, 20058)]
Class distribution of oversampling with train_set_l1 [(0, 20058), (1, 20058)]
Class distribution of oversampling with train_set_tr [(0, 20058), (1, 20058)]




Rebalance dataset with under-sampling technique

In [30]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#rebalance the dataset using undersampling (nearest neightbours)
renn = RepeatedEditedNearestNeighbours()
renn_boruta_set, renn_boruta_labels = renn.fit_resample(boruta_set, original_labels)
print("Class distribution of undersampling with boruta_set " + str(sorted(Counter(renn_boruta_labels).items())))

renn_l1_set, renn_l1_labels = renn.fit_resample(l1_set, original_labels)
print("Class distribution of undersampling with l1_set " + str(sorted(Counter(renn_l1_labels).items())))

renn_tr_set, renn_tr_labels = renn.fit_resample(tr_set, original_labels)
print("Class distribution of undersampling with tr_set " + str(sorted(Counter(renn_tr_labels).items())))

Class distribution of undersampling with boruta_set [(0, 19484), (1, 269)]
Class distribution of undersampling with l1_set [(0, 19548), (1, 269)]
Class distribution of undersampling with tr_set [(0, 19606), (1, 269)]


Rebalance dataset with balanced sampling technique

In [31]:
from imblearn.combine import SMOTEENN
#rebalance the dataset using balanced sampling (SMOTEENN)
smote_enn = SMOTEENN(random_state=0)
smote_boruta_set, smote_boruta_labels = smote_enn.fit_resample(boruta_set, original_labels)
print("Class distribution of balanced sampling with boruta_set " + str(sorted(Counter(smote_boruta_labels).items())))

smote_l1_set, smote_l1_labels = smote_enn.fit_resample(l1_set, original_labels)
print("Class distribution of balanced sampling with l1_set " + str(sorted(Counter(smote_l1_labels).items())))

smote_tr_set, smote_tr_labels = smote_enn.fit_resample(tr_set, original_labels)
print("Class distribution of balanced sampling with tr_set " + str(sorted(Counter(smote_tr_labels).items())))

Class distribution of balanced sampling with boruta_set [(0, 16097), (1, 15546)]
Class distribution of balanced sampling with l1_set [(0, 16067), (1, 16140)]
Class distribution of balanced sampling with tr_set [(0, 19055), (1, 19077)]


### Split train and test set

In [0]:
#split the dataset into train and test set
from sklearn.model_selection import train_test_split

#1. split original_set and original_labels
original_set_train, original_set_test, original_labels_train, original_labels_test = train_test_split(original_set,original_labels, test_size=0.2, random_state=42)

#2. split ros_boruta_set and ros_boruta_labels
ros_boruta_set_train, ros_boruta_set_test, ros_boruta_labels_train, ros_boruta_labels_test = train_test_split(ros_boruta_set,ros_boruta_labels, test_size=0.2, random_state=42)
#3. split ros_l1_set and ros_l1_labels
ros_l1_set_train, ros_l1_set_test, ros_l1_labels_train, ros_l1_labels_test = train_test_split(ros_l1_set,ros_l1_labels, test_size=0.2, random_state=42)
#4. split ros_tr_set and ros_tr_labels
ros_tr_set_train, ros_tr_set_test, ros_tr_labels_train, ros_tr_labels_test = train_test_split(ros_tr_set,ros_tr_labels, test_size=0.2, random_state=42)

#5. split renn_boruta_set and renn_boruta_labels
renn_boruta_set_train, renn_boruta_set_test, renn_boruta_labels_train, renn_boruta_labels_test = train_test_split(renn_boruta_set,renn_boruta_labels, test_size=0.2, random_state=42)
#6. split renn_l1_set and renn_l1_labels
renn_l1_set_train, renn_l1_set_test, renn_l1_labels_train, renn_l1_labels_test = train_test_split(renn_l1_set,renn_l1_labels, test_size=0.2, random_state=42)
#7. split renn_tr_set and renn_tr_labels
renn_tr_set_train, renn_tr_set_test, renn_tr_labels_train, renn_tr_labels_test = train_test_split(renn_tr_set,renn_tr_labels, test_size=0.2, random_state=42)

#8. split smote_boruta_set and smote_boruta_labels
smote_boruta_set_train, smote_boruta_set_test, smote_boruta_labels_train, smote_boruta_labels_test = train_test_split(smote_boruta_set,smote_boruta_labels, test_size=0.2, random_state=42)
#9. split smote_l1_set and smote_l1_labels
smote_l1_set_train, smote_l1_set_test, smote_l1_labels_train, smote_l1_labels_test = train_test_split(smote_l1_set,smote_l1_labels, test_size=0.2, random_state=42)
#10. split smote_tr_set and smote_tr_labels
smote_tr_set_train, smote_tr_set_test, smote_tr_labels_train, smote_tr_labels_test = train_test_split(smote_tr_set,smote_tr_labels, test_size=0.2, random_state=42)


### Dump the dataset

In [33]:
#store original dataset
joblib.dump(original_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(original_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(original_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(original_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_labels_test' + '.gz', compress=('gzip', 3))

#store ros_boruta dataset
joblib.dump(ros_boruta_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_boruta_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store ros_l1 dataset
joblib.dump(ros_l1_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_l1_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_labels_test' + '.gz', compress=('gzip', 3))

#store ros_tr dataset
joblib.dump(ros_tr_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(ros_tr_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_labels_test' + '.gz', compress=('gzip', 3))

#store renn_boruta dataset
joblib.dump(renn_boruta_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_boruta_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store renn_l1 dataset
joblib.dump(renn_l1_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_l1_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_labels_test' + '.gz', compress=('gzip', 3))

#store renn_tr dataset
joblib.dump(renn_tr_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(renn_tr_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_labels_test' + '.gz', compress=('gzip', 3))

#store smote_boruta dataset
joblib.dump(smote_boruta_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_boruta_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_labels_test' + '.gz', compress=('gzip', 3))

#store smote_l1 dataset
joblib.dump(smote_l1_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_l1_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_labels_test' + '.gz', compress=('gzip', 3))

#store smote_tr dataset
joblib.dump(smote_tr_set_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_set_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_set_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_set_test' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_labels_train, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_labels_train' + '.gz', compress=('gzip', 3))
joblib.dump(smote_tr_labels_test, PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_labels_test' + '.gz', compress=('gzip', 3))

['/content/drive/My Drive/ml_project_dataset/smote_tr_labels_test.gz']

# Train the models

### Load the dataset

In [0]:
import joblib
#load the 10 datasets
#1. original dataset
original_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_set_train' + '.gz')
original_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_set_test' + '.gz')
original_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_labels_train' + '.gz')
original_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'original_labels_test' + '.gz')

#2. ros_boruta dataset
ros_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_set_train' + '.gz')
ros_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_set_test' + '.gz')
ros_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_labels_train' + '.gz')
ros_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_boruta_labels_test' + '.gz')

#3. ros_l1 dataset
ros_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_set_train' + '.gz')
ros_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_set_test' + '.gz')
ros_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_labels_train' + '.gz')
ros_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_l1_labels_test' + '.gz')

#4. ros_tr dataset
ros_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_set_train' + '.gz')
ros_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_set_test' + '.gz')
ros_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_labels_train' + '.gz')
ros_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'ros_tr_labels_test' + '.gz')

#5. renn_boruta dataset
renn_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_set_train' + '.gz')
renn_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_set_test' + '.gz')
renn_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_labels_train' + '.gz')
renn_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_boruta_labels_test' + '.gz')

#6. renn_l1 dataset
renn_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_set_train' + '.gz')
renn_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_set_test' + '.gz')
renn_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_labels_train' + '.gz')
renn_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_l1_labels_test' + '.gz')

#7. renn_tr dataset
renn_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_set_train' + '.gz')
renn_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_set_test' + '.gz')
renn_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_labels_train' + '.gz')
renn_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'renn_tr_labels_test' + '.gz')

#8. smote_boruta dataset
smote_boruta_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_set_train' + '.gz')
smote_boruta_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_set_test' + '.gz')
smote_boruta_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_labels_train' + '.gz')
smote_boruta_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_boruta_labels_test' + '.gz')

#9. smote_l1 dataset
smote_l1_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_set_train' + '.gz')
smote_l1_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_set_test' + '.gz')
smote_l1_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_labels_train' + '.gz')
smote_l1_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_l1_labels_test' + '.gz')

#10. smote_tr dataset
smote_tr_set_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_set_train' + '.gz')
smote_tr_set_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_set_test' + '.gz')
smote_tr_labels_train = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_labels_train' + '.gz')
smote_tr_labels_test = joblib.load(PROJECT_ROOT_DIR + 'ml_project_dataset/' + 'smote_tr_labels_test' + '.gz')

#define the dateset list
train_set_list = [original_set_train,ros_boruta_set_train,ros_l1_set_train,ros_tr_set_train, \
                 renn_boruta_set_train,renn_l1_set_train,renn_tr_set_train,smote_boruta_set_train,\
                 smote_l1_set_train,smote_tr_set_train]
train_labels_list = [original_labels_train,ros_boruta_labels_train,ros_l1_labels_train, \
                    ros_tr_labels_train,renn_boruta_labels_train,renn_l1_labels_train \
                    ,renn_tr_labels_train,smote_boruta_labels_train,smote_l1_labels_train,smote_tr_labels_train]
test_set_list = [original_set_test,ros_boruta_set_test,ros_l1_set_test,ros_tr_set_test,renn_boruta_set_test, \
                renn_l1_set_test,renn_tr_set_test,smote_boruta_set_test,smote_l1_set_test,smote_tr_set_test]
test_labels_list = [original_labels_test,ros_boruta_labels_test,ros_l1_labels_test,ros_tr_labels_test, \
                  renn_boruta_labels_test,renn_l1_labels_test,renn_tr_labels_test,smote_boruta_labels_test, \
                  smote_l1_labels_test,smote_tr_labels_test]
dataset_name_list = ["original","ros_boruta","ros_l1","ros_tr","renn_boruta", \
                     "renn_l1","renn_tr","smote_boruta","smote_l1","smote_tr"]

### Tree models

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
dt_clf = DecisionTreeClassifier()

In [42]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(dt_clf, train_set_list[i], train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(dt_clf, test_set_list[i], test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.7769967214360824
The confusion matrix of model trained against original is [[3993   26]
 [  20   27]]
The accuracy of model trained against original is 0.9886866699458927
The precision score of model trained against original is 0.5094339622641509
The recall score of model trained against original is 0.574468085106383
The auc score of model trained against original is 0.7839994070717284
The log loss of model trained against original is 0.3907536900391245
Time duration of model trained against original is 1.6076838970184326
Time duration of model test against original is 0.21770739555358887
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.97592916514381
The confusion matrix of model trained against ros_boruta is [[3874  151]
 [  42 3957]]
The accuracy of model trained against ros_boruta is 0.9759471585244267
The precision score of model trained against ros_b

### Linear models

In [0]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
le_clf = LinearSVC(random_state=0, tol=1e-5)

In [43]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(le_clf, train_set_list[i], train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(le_clf, test_set_list[i], test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")



The f2 score of model trained against original is 0.6581709251324499
The confusion matrix of model trained against original is [[4016    3]
 [  34   13]]
The accuracy of model trained against original is 0.9909001475651746
The precision score of model trained against original is 0.8125
The recall score of model trained against original is 0.2765957446808511
The auc score of model trained against original is 0.6379246451694875
The log loss of model trained against original is 0.3142983584367904
Time duration of model trained against original is 4.0134289264678955
Time duration of model test against original is 0.4033355712890625
--------------------------------------------------------------------------




The f2 score of model trained against ros_boruta is 0.852887885442009
The confusion matrix of model trained against ros_boruta is [[3581  444]
 [ 733 3266]]
The accuracy of model trained against ros_boruta is 0.8533150548354935
The precision score of model trained against ros_boruta is 0.8803234501347709
The recall score of model trained against ros_boruta is 0.816704176044011
The auc score of model trained against ros_boruta is 0.8531968085188999
The log loss of model trained against ros_boruta is 5.066362766459234
Time duration of model trained against ros_boruta is 17.090715885162354
Time duration of model test against ros_boruta is 3.1135120391845703
--------------------------------------------------------------------------




The f2 score of model trained against ros_l1 is 0.8272712983803792
The confusion matrix of model trained against ros_l1 is [[3309  716]
 [ 670 3329]]
The accuracy of model trained against ros_l1 is 0.8272681954137587
The precision score of model trained against ros_l1 is 0.822991347342398
The recall score of model trained against ros_l1 is 0.8324581145286322
The auc score of model trained against ros_l1 is 0.8272849578854341
The log loss of model trained against ros_l1 is 5.966016524813837
Time duration of model trained against ros_l1 is 21.98363471031189
Time duration of model test against ros_l1 is 4.407402038574219
--------------------------------------------------------------------------




The f2 score of model trained against ros_tr is 0.8090063433867388
The confusion matrix of model trained against ros_tr is [[3349  676]
 [ 855 3144]]
The accuracy of model trained against ros_tr is 0.80919740777667
The precision score of model trained against ros_tr is 0.8230366492146597
The recall score of model trained against ros_tr is 0.7861965491372843
The auc score of model trained against ros_tr is 0.809123119289139
The log loss of model trained against ros_tr is 6.590155432262097
Time duration of model trained against ros_tr is 28.669178247451782
Time duration of model test against ros_tr is 5.524988889694214
--------------------------------------------------------------------------
The f2 score of model trained against renn_boruta is 0.7033056967231599
The confusion matrix of model trained against renn_boruta is [[3892    0]
 [  38   21]]
The accuracy of model trained against renn_boruta is 0.9903821817261452
The precision score of model trained against renn_boruta is 1.0
The 



The f2 score of model trained against renn_tr is 0.6703171257965779
The confusion matrix of model trained against renn_tr is [[3924    0]
 [  36   15]]
The accuracy of model trained against renn_tr is 0.9909433962264151
The precision score of model trained against renn_tr is 1.0
The recall score of model trained against renn_tr is 0.29411764705882354
The auc score of model trained against renn_tr is 0.6470588235294118
The log loss of model trained against renn_tr is 0.31280401263315427
Time duration of model trained against renn_tr is 3.3768258094787598
Time duration of model test against renn_tr is 0.52420973777771
--------------------------------------------------------------------------




The f2 score of model trained against smote_boruta is 0.9221918540715213
The confusion matrix of model trained against smote_boruta is [[3104  210]
 [ 279 2736]]
The accuracy of model trained against smote_boruta is 0.9227366092589666
The precision score of model trained against smote_boruta is 0.9287169042769857
The recall score of model trained against smote_boruta is 0.9074626865671642
The auc score of model trained against smote_boruta is 0.9220475774416992
The log loss of model trained against smote_boruta is 2.668609507437458
Time duration of model trained against smote_boruta is 6.212921380996704
Time duration of model test against smote_boruta is 1.5230612754821777
--------------------------------------------------------------------------




The f2 score of model trained against smote_l1 is 0.8811728554591265
The confusion matrix of model trained against smote_l1 is [[2891  347]
 [ 418 2786]]
The accuracy of model trained against smote_l1 is 0.8812480596088171
The precision score of model trained against smote_l1 is 0.8892435365464411
The recall score of model trained against smote_l1 is 0.869538077403246
The auc score of model trained against smote_l1 is 0.8811865803940256
The log loss of model trained against smote_l1 is 4.10158978615569
Time duration of model trained against smote_l1 is 11.057973384857178
Time duration of model test against smote_l1 is 2.2915594577789307
--------------------------------------------------------------------------




The f2 score of model trained against smote_tr is 0.8312017755773644
The confusion matrix of model trained against smote_tr is [[3222  605]
 [ 682 3118]]
The accuracy of model trained against smote_tr is 0.831257375114724
The precision score of model trained against smote_tr is 0.8374966424926135
The recall score of model trained against smote_tr is 0.8205263157894737
The auc score of model trained against smote_tr is 0.831219520580914
The log loss of model trained against smote_tr is 5.8282272160345245
Time duration of model trained against smote_tr is 28.62503147125244
Time duration of model test against smote_tr is 5.096831560134888
--------------------------------------------------------------------------


### Distance-based models

In [0]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
n_neighbors = 3
knn_clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')

In [45]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(knn_clf, train_set_list[i], train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(knn_clf, test_set_list[i], test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.6809456129280018
The confusion matrix of model trained against original is [[4016    3]
 [  32   15]]
The accuracy of model trained against original is 0.9913920314805705
The precision score of model trained against original is 0.8333333333333334
The recall score of model trained against original is 0.3191489361702128
The auc score of model trained against original is 0.6592012409141683
The log loss of model trained against original is 0.2973092898706759
Time duration of model trained against original is 13.994706392288208
Time duration of model test against original is 0.9218149185180664
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.968956446243028
The confusion matrix of model trained against ros_boruta is [[3852  173]
 [  76 3923]]
The accuracy of model trained against ros_boruta is 0.9689680957128615
The precision score of model trained against ros_

### Probabilistic models

In [0]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
gnb_clf = GaussianNB()

In [48]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(gnb_clf, train_set_list[i].toarray(), train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(gnb_clf, test_set_list[i].toarray(), test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.23766671705907808
The confusion matrix of model trained against original is [[1499 2520]
 [  20   27]]
The accuracy of model trained against original is 0.3753074274471225
The precision score of model trained against original is 0.01060070671378092
The recall score of model trained against original is 0.574468085106383
The auc score of model trained against original is 0.4737232189652344
The log loss of model trained against original is 21.576612648449885
Time duration of model trained against original is 0.48027515411376953
Time duration of model test against original is 0.08147215843200684
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.7014121119024469
The confusion matrix of model trained against ros_boruta is [[4007   18]
 [2201 1798]]
The accuracy of model trained against ros_boruta is 0.7234546360917248
The precision score of model trained against 

### Ensemble models

####Bagging

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)

In [52]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(rf_clf, train_set_list[i], train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(rf_clf, test_set_list[i], test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.7599427953440065
The confusion matrix of model trained against original is [[4018    1]
 [  25   22]]
The accuracy of model trained against original is 0.9936055090998525
The precision score of model trained against original is 0.9565217391304348
The recall score of model trained against original is 0.46808510638297873
The auc score of model trained against original is 0.73391814413451
The log loss of model trained against original is 0.2208580880140462
Time duration of model trained against original is 32.48704671859741
Time duration of model test against original is 5.140089750289917
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.976179554508792
The confusion matrix of model trained against ros_boruta is [[3876  149]
 [  42 3957]]
The accuracy of model trained against ros_boruta is 0.9761964107676969
The precision score of model trained against ros_bor

#### Boosting

In [0]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
ad_clf = AdaBoostClassifier(n_estimators=100)

In [54]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(ad_clf, train_set_list[i], train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(ad_clf, test_set_list[i], test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.7842444751265181
The confusion matrix of model trained against original is [[4013    6]
 [  22   25]]
The accuracy of model trained against original is 0.9931136251844564
The precision score of model trained against original is 0.8064516129032258
The recall score of model trained against original is 0.5319148936170213
The auc score of model trained against original is 0.7652109924666346
The log loss of model trained against original is 0.2378481398529474
Time duration of model trained against original is 27.47443175315857
Time duration of model test against original is 5.423798084259033
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.8964252476083332
The confusion matrix of model trained against ros_boruta is [[3686  339]
 [ 491 3508]]
The accuracy of model trained against ros_boruta is 0.8965603190428714
The precision score of model trained against ros_b

#### Hybrid

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,roc_auc_score, log_loss,roc_curve
import time

#apply ftwo score to evaluate models
ftwo_scorer = make_scorer(fbeta_score, beta=2)
dt_clf = DecisionTreeClassifier()
n_neighbors = 3
knn_clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
gnb_clf = GaussianNB()
vt_clf = VotingClassifier(estimators=[('dt', dt_clf), ('knn', knn_clf), ('gnb', gnb_clf)], voting='hard')

In [56]:
for i in range(10):
  #train the model against 10 dataset
  train_start_time = time.time()
  cross_val_score(vt_clf, train_set_list[i].toarray(), train_labels_list[i], scoring = ftwo_scorer, cv=10)
  train_end_time = time.time()
  train_duration = train_end_time - train_start_time
  test_start_time = time.time()
  prediction = cross_val_predict(vt_clf, test_set_list[i].toarray(), test_labels_list[i], cv=10)
  test_end_time = time.time()
  test_duration = test_end_time - test_start_time

  f2_score = fbeta_score(test_labels_list[i], prediction, average='macro', beta=2)
  cfm = confusion_matrix(test_labels_list[i], prediction)
  accuracy = accuracy_score(test_labels_list[i], prediction)
  precision = precision_score(test_labels_list[i], prediction)
  recall = recall_score(test_labels_list[i], prediction)
  roc_auc = roc_auc_score(test_labels_list[i], prediction)
  log_score = log_loss(test_labels_list[i], prediction)

  print("The f2 score of model trained against " + dataset_name_list[i] + " is " + str(f2_score))
  print("The confusion matrix of model trained against " + dataset_name_list[i] + " is " + str(cfm))
  print("The accuracy of model trained against " + dataset_name_list[i] + " is " + str(accuracy))
  print("The precision score of model trained against " + dataset_name_list[i] + " is " + str(precision))
  print("The recall score of model trained against " + dataset_name_list[i] + " is " + str(recall))
  print("The auc score of model trained against " + dataset_name_list[i] + " is " + str(roc_auc))
  print("The log loss of model trained against " + dataset_name_list[i] + " is " + str(log_score))
  print("Time duration of model trained against " + dataset_name_list[i] + " is " + str(train_duration))
  print("Time duration of model test against " + dataset_name_list[i] + " is " + str(test_duration))
  print("--------------------------------------------------------------------------")

The f2 score of model trained against original is 0.6986442059790463
The confusion matrix of model trained against original is [[3991   28]
 [  28   19]]
The accuracy of model trained against original is 0.9862272503689129
The precision score of model trained against original is 0.40425531914893614
The recall score of model trained against original is 0.40425531914893614
The auc score of model trained against original is 0.6986442059790463
The log loss of model trained against original is 0.47569942617881145
Time duration of model trained against original is 42.096699237823486
Time duration of model test against original is 3.502889394760132
--------------------------------------------------------------------------
The f2 score of model trained against ros_boruta is 0.9780633155944145
The confusion matrix of model trained against ros_boruta is [[3900  125]
 [  51 3948]]
The accuracy of model trained against ros_boruta is 0.9780658025922233
The precision score of model trained against r