In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plpt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

In [2]:
def replace_negative_values_with_median(df):
    """Replace negative entries in each numeric column with that column's rounded median.

    Only numeric columns are inspected. Text or other non-numeric columns are
    skipped, because comparing them with ``0`` raises ``TypeError``. For every
    numeric column that contains at least one negative value, each negative
    entry is overwritten with ``round(column.median())``. NaN entries in float
    columns are left as they are (``NaN < 0`` is False).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.

    Notes
    -----
    The median is computed over the whole column, negatives included. When
    more than half of a column is negative the replacement value is therefore
    itself negative and the column keeps negative entries.
    """
    for col in df.select_dtypes(include="number").columns:
        is_negative = df[col] < 0
        if not is_negative.any():
            continue
        # Computed once per column, before anything is overwritten.
        replacement = round(df[col].median())
        # Vectorized equivalent of the per-element "median if x < 0 else x".
        df[col] = df[col].mask(is_negative, replacement)
    return df

In [3]:
# Load raw dataset 1 from disk.
df1 = pd.read_csv(
    '../Dataset/dataset_1.csv',
    encoding='utf-8',
    engine='python',  # pandas' pure-Python parser instead of the default C parser
)

In [4]:
# Show the first rows of dataset 1 as a quick check that the CSV parsed as expected.
df1.head()

In [5]:
# Drop every column of dataset 1 that holds a single distinct value; a constant
# feature carries no information. nunique() ignores NaN when counting, so a
# column made of one value plus NaNs is also treated as constant.
nunique = df1.nunique()
cols_to_drop = nunique.loc[nunique.eq(1)].index
df1 = df1.drop(columns=cols_to_drop)

In [6]:
# Overwrite negative entries of dataset 1 with the rounded median of their column.
# NOTE(review): this runs before the train/test split further down, so the medians
# are computed over rows that later land in the test partitions. Confirm that is acceptable.
df1 = replace_negative_values_with_median(df1)

In [7]:
# Summarise dataset 1 after cleaning: remaining columns, dtypes and non-null counts.
df1.info()

In [20]:
# Load raw dataset 2 from disk.
df2 = pd.read_csv(
    '../Dataset/dataset_2.csv',
    encoding='utf-8',
    engine='python',  # pandas' pure-Python parser instead of the default C parser
)

In [21]:
# Show the first rows of dataset 2 as a quick check that the CSV parsed as expected.
df2.head()

In [22]:
# Drop every column of dataset 2 that holds a single distinct value; a constant
# feature carries no information. nunique() ignores NaN when counting, so a
# column made of one value plus NaNs is also treated as constant.
nunique = df2.nunique()
cols_to_drop = nunique.loc[nunique.eq(1)].index
df2 = df2.drop(columns=cols_to_drop)

In [23]:
# Remove the raw 'url' text column and encode the 'status' label as an integer
# (phishing -> 1, legitimate -> 0). Any other label value becomes NaN.
# This cell is not re-runnable on its own: a second run finds no 'url' column to drop.
df2 = (
    df2
    .drop(columns=['url'])
    .assign(status=lambda frame: frame['status'].map({'phishing': 1, 'legitimate': 0}))
)

In [24]:
# Overwrite negative entries of dataset 2 with the rounded median of their column.
# NOTE(review): this runs before the train/test split further down, so the medians
# are computed over rows that later land in the test partitions. Confirm that is acceptable.
df2 = replace_negative_values_with_median(df2)

In [25]:
# Summarise dataset 2 after cleaning: remaining columns, dtypes and non-null counts.
df2.info()

In [28]:
# Resamplers used to rebalance the training partitions. They are seeded so that
# repeated runs produce the same resampled rows.
sm = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)
sm_enn = SMOTEENN(random_state=42)


# Written for dataset 1; the class label must be the LAST column of the frame.
def different_train_test_1(partition_name, dataset_idx, df, ratio, random_state=42):
    """Split ``df`` once, then save the test part and three rebalanced training sets.

    The last column of ``df`` is treated as the class label. The training part
    is resampled with SMOTE, RandomUnderSampler and SMOTEENN (the module-level
    ``sm``, ``rus`` and ``sm_enn``). Each result is written to its own folder,
    and the class counts of each resampled set are printed.

    Parameters
    ----------
    partition_name : str
        Prefix of the output file names, e.g. ``'80_20'``.
    dataset_idx : str
        Dataset number as a string; it is concatenated into the output paths.
    df : pandas.DataFrame
        Cleaned dataset with the label in the last column.
    ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Seed for the split so the written partitions are reproducible.
        Pass ``None`` to get a different split on every call.

    Returns
    -------
    None
        The function only writes files and prints.

    Notes
    -----
    Files written (the target directories are not created here):

    * ``../Dataset/Testing/Dataset<idx>/<partition>_test.csv``
    * ``../Dataset/Training/Dataset<idx>/Oversampled/<partition>_train.csv``
    * ``../Dataset/Training/Dataset<idx>/Undersampled/<partition>_train.csv``
    * ``../Dataset/Training/Dataset<idx>/Over_Undersampled/<partition>_train.csv``

    NOTE(review): the split is not stratified. Consider ``stratify=`` on the
    label if the class ratio of the test set matters.
    """
    train, test = train_test_split(df, test_size=ratio, random_state=random_state)
    test.to_csv('../Dataset/Testing/Dataset' + dataset_idx + '/' + partition_name + '_test.csv', index=False)

    # Separate features and label once; every sampler starts from the same input.
    label_col = train.columns[-1]
    features = train.drop(columns=[label_col])
    labels = train[label_col]

    # (output sub-folder, sampler) pairs, in the order they are written and reported.
    strategies = (
        ('Oversampled', sm),
        ('Undersampled', rus),
        ('Over_Undersampled', sm_enn),
    )
    for folder, sampler in strategies:
        resampled, resampled_labels = sampler.fit_resample(features, labels)
        # Re-attach the label under its original name so that the training
        # files have the same columns as the test file.
        resampled[label_col] = resampled_labels
        resampled.to_csv(
            '../Dataset/Training/Dataset' + dataset_idx + '/' + folder + '/' + partition_name + '_train.csv',
            index=False,
        )
        print(partition_name, folder + ' training dataset' + dataset_idx + '\n', resampled[label_col].value_counts())

In [29]:
# Write five train/test partitions of dataset 1, from 80/20 down to 60/40.
for partition_name, test_ratio in (
    ('80_20', 0.2),
    ('75_25', 0.25),
    ('70_30', 0.3),
    ('65_35', 0.35),
    ('60_40', 0.4),
):
    different_train_test_1(partition_name, '1', df1, test_ratio)

In [30]:
# This function works for any dataset that needs no class rebalancing.
def different_train_test_2(partition_name, dataset_idx, df, ratio, random_state=42):
    """Split ``df`` into train and test parts and save both as CSV files.

    No resampling is applied; the two parts are written exactly as returned
    by ``train_test_split``.

    Parameters
    ----------
    partition_name : str
        Prefix of the output file names, e.g. ``'80_20'``.
    dataset_idx : str
        Dataset number as a string; it is concatenated into the output paths.
    df : pandas.DataFrame
        Dataset to split.
    ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Seed for the split so the written partitions are reproducible.
        Pass ``None`` to get a different split on every call.

    Returns
    -------
    None
        The function only writes files.

    Notes
    -----
    Files written (the target directories are not created here):

    * ``../Dataset/Testing/Dataset<idx>/<partition>_test.csv``
    * ``../Dataset/Training/Dataset<idx>/<partition>_train.csv``
    """
    train, test = train_test_split(df, test_size=ratio, random_state=random_state)
    test.to_csv('../Dataset/Testing/Dataset' + dataset_idx + '/' + partition_name + '_test.csv', index=False)
    train.to_csv('../Dataset/Training/Dataset' + dataset_idx + '/' + partition_name + '_train.csv', index=False)

In [32]:
# Write five train/test partitions of dataset 2, from 80/20 down to 60/40.
for partition_name, test_ratio in (
    ('80_20', 0.2),
    ('75_25', 0.25),
    ('70_30', 0.3),
    ('65_35', 0.35),
    ('60_40', 0.4),
):
    different_train_test_2(partition_name, '2', df2, test_ratio)