<a href="https://colab.research.google.com/github/EmperoR1127/ml_project/blob/emperor/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from scipy.io import arff
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "/content/drive/My Drive/"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "Images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [0]:
#load the dataset
path = "/content/drive/My Drive/Data/H-1B_Disclosure_RAW_Data.csv"
df = pd.read_csv(path, encoding='utf-8')
processed_data = df.copy()

#Feature Engineering

#####Drop correlated columns and create new columns

In [0]:
processed_data = processed_data.drop(["CASE_NUMBER", "VISA_CLASS", 
                                        "EMPLOYER_NAME", "EMPLOYER_STATE","EMPLOYER_POSTAL_CODE", 
                                        "EMPLOYER_CITY", "EMPLOYER_BUSINESS_DBA", 
                                        "EMPLOYER_COUNTRY", "EMPLOYER_PROVINCE", "EMPLOYER_ADDRESS", 
                                        "EMPLOYER_PHONE", "EMPLOYER_PHONE_EXT", 
                                        "AGENT_ATTORNEY_NAME", "AGENT_ATTORNEY_CITY", "AGENT_ATTORNEY_STATE",
                                        "JOB_TITLE", "SOC_NAME",
                                        "PW_SOURCE", "PW_SOURCE_YEAR", "PW_SOURCE_OTHER", "WAGE_RATE_OF_PAY_FROM",
                                        "WAGE_RATE_OF_PAY_TO", "WAGE_UNIT_OF_PAY",
                                        "WORKSITE_CITY", "WORKSITE_COUNTY", "WORKSITE_POSTAL_CODE", 
                                        "ORIGINAL_CERT_DATE", "PUBLIC_DISCLOSURE_LOCATION"], axis=1)
#format EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['CASE_SUBMITTED'] = pd.to_datetime(processed_data['CASE_SUBMITTED'],infer_datetime_format=True,errors='coerce')
processed_data['DECISION_DATE'] = pd.to_datetime(processed_data['DECISION_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_START_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_START_DATE'],infer_datetime_format=True,errors='coerce')
processed_data['EMPLOYMENT_END_DATE'] = pd.to_datetime(processed_data['EMPLOYMENT_END_DATE'],infer_datetime_format=True,errors='coerce')
#drop NaT rows because we can't "guess" the specific date
processed_data = processed_data[processed_data.CASE_SUBMITTED != 'NaT']
processed_data = processed_data[processed_data.DECISION_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_START_DATE != 'NaT']
processed_data = processed_data[processed_data.EMPLOYMENT_END_DATE != 'NaT']
#add one column as EMP_PERIOD, and drop EMPLOYMENT_START_DATE and EMPLOYMENT_END_DATE
processed_data['EMP_PERIOD'] = processed_data['EMPLOYMENT_END_DATE'] - processed_data['EMPLOYMENT_START_DATE']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD']/np.timedelta64(1,'Y')
#train_set = train_set[train_set.EMP_PERIOD != '-']
processed_data['EMP_PERIOD'] = processed_data['EMP_PERIOD'].astype(float)
#add one column as PROCESS_TIME, indicating processing time of visa application
processed_data['PROCESS_TIME'] = processed_data['DECISION_DATE'] - processed_data['CASE_SUBMITTED']
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].map(lambda x: str(x)[:1])
processed_data['PROCESS_TIME'] = processed_data['PROCESS_TIME'].astype(float)
processed_data = processed_data.drop(["EMPLOYMENT_START_DATE", "EMPLOYMENT_END_DATE"], axis=1)
processed_data = processed_data.drop(["CASE_SUBMITTED", "DECISION_DATE"], axis=1)

#concatenate the first 2 digit of column SOC_CODE and NAIC_CODE
processed_data['SOC_CODE'] = processed_data['SOC_CODE'].map(lambda x: str(x)[:2])
processed_data['NAICS_CODE'] = processed_data['NAICS_CODE'].map(lambda x: str(x)[:2])
#remove impurity in the column
processed_data = processed_data[processed_data.PW_UNIT_OF_PAY != 'N']
processed_data = processed_data[processed_data.PREVAILING_WAGE != 'N']
#according to google, there are 2080 working hours per year
pw_unit_column = {"Year":1, "Hour":2080, "Month":12, "Week":52, "Bi-Weekly":26}
processed_data['PW_UNIT_OF_PAY'] = processed_data['PW_UNIT_OF_PAY'].replace(pw_unit_column)
#remove ',' in the column value
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('str')
processed_data['PREVAILING_WAGE'] = processed_data.PREVAILING_WAGE.str.replace(',','')
processed_data['PREVAILING_WAGE'] = processed_data['PREVAILING_WAGE'].astype('float')
#add one column as ANNUAL_SALARY
processed_data['ANNUAL_SALARY'] = processed_data['PREVAILING_WAGE'] * processed_data['PW_UNIT_OF_PAY']
processed_data = processed_data.drop(["PREVAILING_WAGE", "PW_UNIT_OF_PAY"], axis=1)
counts_soc = processed_data.groupby("SOC_CODE")["SOC_CODE"].transform(len)
counts_naics = processed_data.groupby("NAICS_CODE")["NAICS_CODE"].transform(len)
counts_worksite_state = processed_data.groupby("WORKSITE_STATE")["WORKSITE_STATE"].transform(len)
mask = (counts_soc > 10000) & (counts_naics > 10000)  & (counts_worksite_state > 20000)
processed_data = processed_data[mask]


### Deal with noise, missing values, numerical and categorical data

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
train_set = processed_data.drop(["CASE_STATUS"], axis=1)
train_labels = processed_data[["CASE_STATUS"]].copy()
train_set_num = train_set.drop(["AGENT_REPRESENTING_EMPLOYER", "SOC_CODE", "NAICS_CODE",
                                "FULL_TIME_POSITION", "PW_WAGE_LEVEL", "H1B_DEPENDENT", "WILLFUL_VIOLATOR",
                                "SUPPORT_H1B", "LABOR_CON_AGREE", "WORKSITE_STATE"], axis=1)
train_set_cat = train_set.drop(["TOTAL_WORKERS","NEW_EMPLOYMENT","CONTINUED_EMPLOYMENT",
                                "CHANGE_PREVIOUS_EMPLOYMENT", "NEW_CONCURRENT_EMP", "CHANGE_EMPLOYER",
                                "AMENDED_PETITION", "EMP_PERIOD", "PROCESS_TIME",
                                "ANNUAL_SALARY"], axis=1)
#build the pipeline
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler()),])
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),('cat', OneHotEncoder()),])
full_pipeline = ColumnTransformer([("num", num_pipeline, list(train_set_num)),("cat", cat_pipeline, list(train_set_cat)),])

#prepare the data
train_set = full_pipeline.fit_transform(train_set)

#prepare the target
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_labels)

  y = column_or_1d(y, warn=True)


### Feature selection

Boruta feature selection method

In [0]:
#!pip install Boruta
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
#Boruta feature selection
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(train_set, train_labels)
train_set_boruta = feat_selector.transform(train_set)
print("Dataset with " + str(train_set.shape[1]) + " features is reduced to " + str(train_set_boruta.shape[1])
      + " features after applying Boruta feature selection technique")

L1-based and tree-based feature selection method

In [0]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
#L1-based feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter = 2000).fit(train_set, train_labels)
l_model = SelectFromModel(lsvc, prefit=True)
train_set_l1 = l_model.transform(train_set)
print("Dataset with " + str(train_set.shape[1]) + " features is reduced to " + str(train_set_l1.shape[1])
      + " features after applying L1-based feature selection technique")

#tree-based feature selection
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(train_set, train_labels)
tb_model = SelectFromModel(clf, prefit=True)
train_set_tr = tb_model.transform(train_set)
print("Dataset with " + str(train_set.shape[1]) + " features is reduced to " + str(train_set_tr.shape[1])
      + " features after applying tree-based feature selection technique")



Dataset with 50 features is reduced to 25 features after applying L1-based feature selection technique
Dataset with 50 features is reduced to 4 features after applying tree-based feature selection technique


In [0]:
#store dataset in files using dump
import joblib
joblib.dump(train_set, PROJECT_ROOT_DIR + 'Data/' + 'train_set' + '.gz', compress=('gzip', 3))
joblib.dump(train_set_boruta, PROJECT_ROOT_DIR + 'Data/' + 'train_set_boruta' + '.gz', compress=('gzip', 3))
joblib.dump(train_set_l1, PROJECT_ROOT_DIR + 'Data/' + 'train_set_l1' + '.gz', compress=('gzip', 3))
joblib.dump(train_set_tr, PROJECT_ROOT_DIR + 'Data/' + 'train_set_tr' + '.gz', compress=('gzip', 3))

#store target in files using dump
joblib.dump(train_labels, PROJECT_ROOT_DIR + 'Data/' + 'train_labels' + '.gz', compress=('gzip', 3))

### Deal with class imbalance

In [0]:
import joblib
from collections import Counter
#load the datasets
train_set = joblib.load('/content/drive/My Drive/Data/train_set.gz')
train_set_boruta = joblib.load('/content/drive/My Drive/Data/train_set_boruta.gz')
train_set_l1 = joblib.load('/content/drive/My Drive/Data/train_set_l1.gz')
train_set_tr = joblib.load('/content/drive/My Drive/Data/train_set_tr.gz')
#load the labels
train_labels = joblib.load('/content/drive/My Drive/Data/train_labels.gz')

Rebalance dataset with oversampling technique

In [0]:
from imblearn.over_sampling import RandomOverSampler

#rebalance the dataset using oversampling (random oversampling)
ros = RandomOverSampler(random_state=42)
ros_train_set_boruta, ros_train_labels_boruta = ros.fit_resample(train_set_boruta, train_labels)
print("Class distribution of oversampling with train_set_boruta " + str(sorted(Counter(ros_train_labels_boruta).items())))

ros_train_set_l1, ros_train_labels_l1 = ros.fit_resample(train_set_l1, train_labels)
print("Class distribution of oversampling with train_set_l1 " + str(sorted(Counter(ros_train_labels_l1).items())))

ros_train_set_tr, ros_train_labels_tr = ros.fit_resample(train_set_tr, train_labels)
print("Class distribution of oversampling with train_set_tr " + str(sorted(Counter(ros_train_labels_tr).items())))




Class distribution of oversampling with train_set_boruta [(0, 347594), (1, 347594)]
Class distribution of oversampling with train_set_l1 [(0, 347594), (1, 347594)]
Class distribution of oversampling with train_set_tr [(0, 347594), (1, 347594)]


In [0]:
#store ros_train_set_boruta and ros_train_labels_boruta
joblib.dump(ros_train_set_boruta, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_set_boruta' + '.gz', compress=('gzip', 3))
joblib.dump(ros_train_labels_boruta, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_labels_boruta' + '.gz', compress=('gzip', 3))
#store ros_train_set_l1 and ros_train_labels_l1
joblib.dump(ros_train_set_l1, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_set_l1' + '.gz', compress=('gzip', 3))
joblib.dump(ros_train_labels_l1, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_labels_l1' + '.gz', compress=('gzip', 3))
#store ros_train_set_tr and ros_train_labels_tr
joblib.dump(ros_train_set_tr, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_set_tr' + '.gz', compress=('gzip', 3))
joblib.dump(ros_train_labels_tr, PROJECT_ROOT_DIR + 'Data/' + 'ros_train_labels_tr' + '.gz', compress=('gzip', 3))

['/content/drive/My Drive/Data/ros_train_labels_tr.gz']

Rebalance dataset with under-sampling technique

In [0]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#rebalance the dataset using undersampling (nearest neightbours)
renn = RepeatedEditedNearestNeighbours()
renn_train_set_boruta, renn_train_labels_boruta = renn.fit_resample(train_set_boruta, train_labels)
print("Class distribution of undersampling with train_set_boruta " + str(sorted(Counter(renn_train_labels_boruta).items())))

In [0]:
#store renn_train_set_boruta and renn_train_labels_boruta
joblib.dump(renn_train_set_boruta, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_set_boruta' + '.gz', compress=('gzip', 3))
joblib.dump(renn_train_labels_boruta, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_labels_boruta' + '.gz', compress=('gzip', 3))

In [8]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours()
renn_train_set_l1, renn_train_labels_l1 = renn.fit_resample(train_set_l1, train_labels)
print("Class distribution of undersampling with train_set_l1 " + str(sorted(Counter(renn_train_labels_l1).items())))

Class distribution of undersampling with train_set_l1 [(0, 341495), (1, 4044)]


In [9]:
#store renn_train_set_l1 and renn_train_labels_l1
joblib.dump(renn_train_set_l1, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_set_l1' + '.gz', compress=('gzip', 3))
joblib.dump(renn_train_labels_l1, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_labels_l1' + '.gz', compress=('gzip', 3))

['/content/drive/My Drive/Data/renn_train_labels_l1.gz']

In [0]:
renn_train_set_tr, renn_train_labels_tr = renn.fit_resample(train_set_tr, train_labels)
print("Class distribution of undersampling with train_set_tr " + str(sorted(Counter(renn_train_labels_tr).items())))

In [0]:
#store renn_train_set_tr and renn_train_labels_tr
joblib.dump(renn_train_set_tr, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_set_tr' + '.gz', compress=('gzip', 3))
joblib.dump(renn_train_labels_tr, PROJECT_ROOT_DIR + 'Data/' + 'renn_train_labels_tr' + '.gz', compress=('gzip', 3))

Rebalance dataset with balanced sampling technique

In [0]:
from imblearn.combine import SMOTEENN
#rebalance the dataset using balanced sampling (SMOTEENN)
smote_enn = SMOTEENN(random_state=0)
smote_train_set_boruta, smote_train_labels_boruta = smote_enn.fit_resample(train_set_boruta, train_labels)
print("Class distribution of balanced sampling with train_set_boruta " + str(sorted(Counter(smote_train_labels_boruta).items())))

Class distribution of balanced sampling with train_set_boruta [(0, 324048), (1, 314207)]


In [0]:
smote_train_set_l1, smote_train_labels_l1 = smote_enn.fit_resample(train_set_l1, train_labels)
print("Class distribution of balanced sampling with train_set_l1 " + str(sorted(Counter(smote_train_labels_l1).items())))

Class distribution of balanced sampling with train_set_l1 [(0, 334212), (1, 324659)]


In [0]:
smote_train_set_tr, smote_train_labels_tr = smote_enn.fit_resample(train_set_tr, train_labels)
print("Class distribution of balanced sampling with smote_train_set_tr " + str(sorted(Counter(smote_train_labels_tr).items())))

Class distribution of balanced sampling with train_set_boruta [(0, 315284), (1, 297780)]


In [0]:
#store ros_train_set_tr and ros_train_labels_tr
joblib.dump(smote_train_set_boruta, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_set_boruta' + '.gz', compress=('gzip', 3))
joblib.dump(smote_train_labels_boruta, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_labels_boruta' + '.gz', compress=('gzip', 3))

#store smote_train_set_l1 and smote_train_labels_l1
joblib.dump(smote_train_set_l1, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_set_l1' + '.gz', compress=('gzip', 3))
joblib.dump(smote_train_labels_l1, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_labels_l1' + '.gz', compress=('gzip', 3))

#store smote_train_set_tr and smote_train_labels_tr
joblib.dump(smote_train_set_tr, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_set_tr' + '.gz', compress=('gzip', 3))
joblib.dump(smote_train_labels_tr, PROJECT_ROOT_DIR + 'Data/' + 'smote_train_labels_tr' + '.gz', compress=('gzip', 3))

['/content/drive/My Drive/Data/smote_train_labels_tr.gz']

# Train the models

In [0]:
#load the 10 datasets
#1. original dataset
train_set = joblib.load('/content/drive/My Drive/Data/train_set.gz')
train_labels = joblib.load('/content/drive/My Drive/Data/train_labels.gz')
#2. oversampling dataset with boruta feature selection technique
ros_train_set_boruta = joblib.load('/content/drive/My Drive/Data/ros_train_set_boruta.gz')
ros_train_labels_boruta = joblib.load('/content/drive/My Drive/Data/ros_train_labels_boruta.gz')
#3. oversampling dataset with L1-based feature selection technique
ros_train_set_l1 = joblib.load('/content/drive/My Drive/Data/ros_train_set_l1.gz')
ros_train_labels_l1 = joblib.load('/content/drive/My Drive/Data/ros_train_labels_l1.gz')
#4. oversampling dataset with tree-based feature selection technique
ros_train_set_tr = joblib.load('/content/drive/My Drive/Data/ros_train_set_tr.gz')
ros_train_labels_tr = joblib.load('/content/drive/My Drive/Data/ros_train_labels_tr.gz')
#5. under-sampling dataset with boruta feature selection technique
renn_train_set_boruta = joblib.load('/content/drive/My Drive/Data/renn_train_set_boruta.gz')
renn_train_labels_boruta = joblib.load('/content/drive/My Drive/Data/renn_train_labels_boruta.gz')
#6. under-sampling dataset with L1-based feature selection technique
renn_train_set_l1 = joblib.load('/content/drive/My Drive/Data/renn_train_set_l1.gz')
renn_train_labels_l1 = joblib.load('/content/drive/My Drive/Data/renn_train_labels_l1.gz')
#7. under-sampling dataset with tree-based feature selection technique
renn_train_set_tr = joblib.load('/content/drive/My Drive/Data/renn_train_set_tr.gz')
renn_train_labels_tr = joblib.load('/content/drive/My Drive/Data/renn_train_labels_tr.gz')
#8. balanced sampling dataset with boruta feature selection technique
smote_train_set_boruta = joblib.load('/content/drive/My Drive/Data/smote_train_set_boruta.gz')
smote_train_labels_boruta = joblib.load('/content/drive/My Drive/Data/smote_train_labels_boruta.gz')
#9. balanced sampling dataset with L1-based feature selection technique
smote_train_set_l1 = joblib.load('/content/drive/My Drive/Data/smote_train_set_l1.gz') 
smote_train_labels_l1 = joblib.load('/content/drive/My Drive/Data/smote_train_labels_l1.gz')
#10. balanced sampling dataset with tree-based feature selection technique
smote_train_set_tr = joblib.load('/content/drive/My Drive/Data/smote_train_set_tr.gz')
smote_train_labels_tr = joblib.load('/content/drive/My Drive/Data/smote_train_labels_tr.gz')

### Tree models

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, accuracy_score
dt_clf = DecisionTreeClassifier()

In [15]:
#train the model against the original dataset
dt_or_prediction = cross_val_predict(dt_clf.fit(train_set, train_labels), 
                                           train_set, train_labels, cv=10)
#calculate the presion and recall of original dataset
dt_or_precision_score = precision_score(train_labels, dt_or_prediction)
dt_or_recall_score = recall_score(train_labels, dt_or_prediction)
dt_or_accuracy_score = accuracy_score(train_labels, dt_or_prediction)
print("precision of decision tree model on original dataset is + %f" % dt_or_precision_score)
print("recall of decision tree model on original dataset is + %f" % dt_or_recall_score)
print("accuracy of decision tree model on original dataset is + %f" % dt_or_accuracy_score)

precision of decision tree model on original dataset is + 0.388561
recall of decision tree model on original dataset is + 0.618200
accuracy of decision tree model on original dataset is + 0.984421


In [17]:
#train the model against ros_train_set_boruta, ros_train_labels_boruta
dt_ros_boruta_prediction = cross_val_predict(dt_clf.fit(ros_train_set_boruta, ros_train_labels_boruta), 
                                                       ros_train_set_boruta, ros_train_labels_boruta, cv=10)

#calculate the presion and recall of dt_ros_boruta_prediction
dt_ros_boruta_precision_score = precision_score(ros_train_labels_boruta, dt_ros_boruta_prediction)
dt_ros_boruta_recall_score = recall_score(ros_train_labels_boruta, dt_ros_boruta_prediction)
dt_ros_boruta_accuracy_score = accuracy_score(ros_train_labels_boruta, dt_ros_boruta_prediction)
print("precision of decision tree model on oversampling and boruta dataset is + %f" % dt_ros_boruta_precision_score)
print("recall of decision tree model on oversampling and boruta dataset is + %f" % dt_ros_boruta_recall_score)
print("accuracy of decision tree model on oversampling and boruta dataset is + %f" % dt_ros_boruta_accuracy_score)

precision of decision tree model on oversampling and boruta dataset is + 0.978310
recall of decision tree model on oversampling and boruta dataset is + 0.995662
accuracy of decision tree model on oversampling and boruta dataset is + 0.986794


In [18]:
#train the model against ros_train_set_l1, ros_train_labels_l1
dt_ros_l1_prediction = cross_val_predict(dt_clf.fit(ros_train_set_l1, ros_train_labels_l1), 
                                                   ros_train_set_l1, ros_train_labels_l1, cv=10)

#calculate the presion and recall of dt_ros_l1_prediction
dt_ros_l1_precision_score = precision_score(ros_train_labels_l1, dt_ros_l1_prediction)
dt_ros_l1_recall_score = recall_score(ros_train_labels_l1, dt_ros_l1_prediction)
dt_ros_l1_accuracy_score = accuracy_score(ros_train_labels_l1, dt_ros_l1_prediction)
print("precision of decision tree model on oversampling and L1-based dataset is + %f" % dt_ros_l1_precision_score)
print("recall of decision tree model on oversampling and L1-based dataset is + %f" % dt_ros_l1_recall_score)
print("accuracy of decision tree model on oversampling and L1-based dataset is + %f" % dt_ros_l1_accuracy_score)

precision of decision tree model on oversampling and L1-based dataset is + 0.981379
recall of decision tree model on oversampling and L1-based dataset is + 0.996933
accuracy of decision tree model on oversampling and L1-based dataset is + 0.989009


In [19]:
#train the against ros_train_set_tr, ros_train_labels_tr
dt_ros_tr_prediction = cross_val_predict(dt_clf.fit(ros_train_set_tr, 
                                                             ros_train_labels_tr), 
                                                  ros_train_set_tr, ros_train_labels_tr, cv=10)

#calculate the presion and recall of dt_ros_tr_prediction
dt_ros_tr_precision_score = precision_score(ros_train_labels_tr, dt_ros_tr_prediction)
dt_ros_tr_recall_score = recall_score(ros_train_labels_tr, dt_ros_tr_prediction)
dt_ros_tr_accuracy_score = accuracy_score(ros_train_labels_tr, dt_ros_tr_prediction)
print("precision of decision tree model on oversampling and tree-based dataset is + %f" % dt_ros_tr_precision_score)
print("recall of decision tree model on oversampling and tree-based dataset is + %f" % dt_ros_tr_recall_score)
print("accuracy of decision tree model on oversampling and tree-based dataset is + %f" % dt_ros_tr_accuracy_score)

precision of decision tree model on oversampling and tree-based dataset is + 0.961970
recall of decision tree model on oversampling and tree-based dataset is + 0.980811
accuracy of decision tree model on oversampling and tree-based dataset is + 0.971018


In [21]:
#train the model against renn_train_set_boruta, renn_train_labels_boruta
dt_renn_boruta_prediction = cross_val_predict(dt_clf.fit(renn_train_set_boruta, renn_train_labels_boruta), 
                                                       renn_train_set_boruta, renn_train_labels_boruta, cv=10)

#calculate the presion and recall of dt_renn_boruta_prediction
dt_renn_boruta_precision_score = precision_score(renn_train_labels_boruta, dt_renn_boruta_prediction)
dt_renn_boruta_recall_score = recall_score(renn_train_labels_boruta, dt_renn_boruta_prediction)
dt_renn_boruta_accuracy_score = accuracy_score(renn_train_labels_boruta, dt_renn_boruta_prediction)
print("precision of decision tree model on under-sampling and boruta dataset is + %f" % dt_renn_boruta_precision_score)
print("recall of decision tree model on under-sampling and boruta dataset is + %f" % dt_renn_boruta_recall_score)
print("accuracy of decision tree model on under-sampling and boruta dataset is + %f" % dt_renn_boruta_accuracy_score)

precision of decision tree model on under-sampling and boruta dataset is + 0.501092
recall of decision tree model on under-sampling and boruta dataset is + 0.680514
accuracy of decision tree model on under-sampling and boruta dataset is + 0.988314


In [22]:
#train the model against renn_train_set_l1, renn_train_labels_l1
dt_renn_l1_prediction = cross_val_predict(dt_clf.fit(renn_train_set_l1, renn_train_labels_l1), 
                                                   renn_train_set_l1, renn_train_labels_l1, cv=10)

#calculate the presion and recall of dt_renn_l1_prediction
dt_renn_l1_precision_score = precision_score(renn_train_labels_l1, dt_renn_l1_prediction)
dt_renn_l1_recall_score = recall_score(renn_train_labels_l1, dt_renn_l1_prediction)
dt_renn_l1_accuracy_score = accuracy_score(renn_train_labels_l1, dt_renn_l1_prediction)
print("precision of decision tree model on under-sampling and L1-based dataset is + %f" % dt_renn_l1_precision_score)
print("recall of decision tree model on under-sampling and L1-based dataset is + %f" % dt_renn_l1_recall_score)
print("accuracy of decision tree model on under-sampling and L1-based dataset is + %f" % dt_renn_l1_accuracy_score)

precision of decision tree model on under-sampling and L1-based dataset is + 0.448218
recall of decision tree model on under-sampling and L1-based dataset is + 0.662463
accuracy of decision tree model on under-sampling and L1-based dataset is + 0.986505


In [25]:
#train the against renn_train_set_tr, ros_train_labels_tr
dt_renn_tr_prediction = cross_val_predict(dt_clf.fit(renn_train_set_tr, 
                                                             renn_train_labels_tr), 
                                                  renn_train_set_tr, renn_train_labels_tr, cv=10)

#calculate the presion and recall of dt_renn_tr_prediction
dt_renn_tr_precision_score = precision_score(renn_train_labels_tr, dt_renn_tr_prediction)
dt_renn_tr_recall_score = recall_score(renn_train_labels_tr, dt_renn_tr_prediction)
dt_renn_tr_accuracy_score = accuracy_score(renn_train_labels_tr, dt_renn_tr_prediction)
print("precision of decision tree model on under-sampling and tree-based dataset is + %f" % dt_renn_tr_precision_score)
print("recall of decision tree model on under-sampling and tree-based dataset is + %f" % dt_renn_tr_recall_score)
print("accuracy of decision tree model on under-sampling and tree-based dataset is + %f" % dt_renn_tr_accuracy_score)

precision of decision tree model on under-sampling and tree-based dataset is + 0.541247
recall of decision tree model on under-sampling and tree-based dataset is + 0.665183
accuracy of decision tree model on under-sampling and tree-based dataset is + 0.989432


In [26]:
#train the model against smote_train_set_boruta, smote_train_labels_boruta
dt_smote_boruta_prediction = cross_val_predict(dt_clf.fit(smote_train_set_boruta, smote_train_labels_boruta), 
                                                       smote_train_set_boruta, smote_train_labels_boruta, cv=10)

#calculate the presion and recall of dt_smote_boruta_prediction
dt_smote_boruta_precision_score = precision_score(smote_train_labels_boruta, dt_smote_boruta_prediction)
dt_smote_boruta_recall_score = recall_score(smote_train_labels_boruta, dt_smote_boruta_prediction)
dt_smote_boruta_accuracy_score = accuracy_score(smote_train_labels_boruta, dt_smote_boruta_prediction)
print("precision of decision tree model on balanced sampling and boruta dataset is + %f" % dt_smote_boruta_precision_score)
print("recall of decision tree model on balanced and boruta dataset is + %f" % dt_smote_boruta_recall_score)
print("accuracy of decision tree model on balanced and boruta dataset is + %f" % dt_smote_boruta_accuracy_score)

precision of decision tree model on balanced sampling and boruta dataset is + 0.992603
recall of decision tree model on balanced and boruta dataset is + 0.996760
accuracy of decision tree model on balanced and boruta dataset is + 0.994748


In [27]:
#train the model against smote_train_set_l1, smote_train_labels_l1
dt_smote_l1_prediction = cross_val_predict(dt_clf.fit(smote_train_set_l1, smote_train_labels_l1), 
                                                   smote_train_set_l1, smote_train_labels_l1, cv=10)

#calculate the presion and recall of dt_smote_l1_prediction
dt_smote_l1_precision_score = precision_score(smote_train_labels_l1, dt_smote_l1_prediction)
dt_smote_l1_recall_score = recall_score(smote_train_labels_l1, dt_smote_l1_prediction)
dt_smote_l1_accuracy_score = accuracy_score(smote_train_labels_l1, dt_smote_l1_prediction)
print("precision of decision tree model on balanced sampling and L1-based dataset is + %f" % dt_smote_l1_precision_score)
print("recall of decision tree model on balanced sampling and L1-based dataset is + %f" % dt_smote_l1_recall_score)
print("accuracy of decision tree model on balanced sampling and L1-based dataset is + %f" % dt_smote_l1_accuracy_score)

precision of decision tree model on balanced sampling and L1-based dataset is + 0.990828
recall of decision tree model on balanced sampling and L1-based dataset is + 0.996615
accuracy of decision tree model on balanced sampling and L1-based dataset is + 0.993786


In [28]:
#train the against smote_train_set_tr, smote_train_labels_tr
dt_smote_tr_prediction = cross_val_predict(dt_clf.fit(smote_train_set_tr, 
                                                             smote_train_labels_tr), 
                                                  smote_train_set_tr, smote_train_labels_tr, cv=10)

#calculate the presion and recall of dt_smote_tr_prediction
dt_smote_tr_precision_score = precision_score(smote_train_labels_tr, dt_smote_tr_prediction)
dt_smote_tr_recall_score = recall_score(smote_train_labels_tr, dt_smote_tr_prediction)
dt_smote_tr_accuracy_score = accuracy_score(smote_train_labels_tr, dt_smote_tr_prediction)
print("precision of decision tree model on balanced sampling and tree-based dataset is + %f" % dt_smote_tr_precision_score)
print("recall of decision tree model on balanced sampling and tree-based dataset is + %f" % dt_smote_tr_recall_score)
print("accuracy of decision tree model on balanced sampling and tree-based dataset is + %f" % dt_smote_tr_accuracy_score)

precision of decision tree model on balanced sampling and tree-based dataset is + 0.991636
recall of decision tree model on balanced sampling and tree-based dataset is + 0.995027
accuracy of decision tree model on balanced sampling and tree-based dataset is + 0.993508


### Linear models

In [0]:
from sklearn.svm import SVC
le_clf = SVC(gamma='auto')

In [0]:
#train the model against the original dataset
le_or_prediction = cross_val_predict(le_clf.fit(train_set, train_labels), 
                                           train_set, train_labels, cv=10)
#calculate the presion and recall of original dataset
le_or_precision_score = precision_score(train_labels, dt_or_prediction)
le_or_recall_score = recall_score(train_labels, dt_or_prediction)
le_or_accuracy_score = accuracy_score(train_labels, dt_or_prediction)
print("precision of linear model on original dataset is + %f" % le_or_precision_score)
print("recall of linear tree model on original dataset is + %f" % le_or_recall_score)
print("accuracy of linear tree model on original dataset is + %f" % le_or_accuracy_score)