In [43]:
import pandas as pd
import numpy as np
from os.path import join

In [42]:
RAW_DATA_PATH = r"C:\Users\shami\OneDrive\Desktop\Master_Thesis_Data\Raw_Data"
PREPARED_DATA_PATH = r"C:\Users\shami\OneDrive\Desktop\Master_Thesis_Data\Prepared_Datasets"
TEST_DATA_PATH = r"C:\Users\shami\OneDrive\Desktop\Master_Thesis_Data\Test_Datasets"
TRAIN_DATA_PATH = r"C:\Users\shami\OneDrive\Desktop\Master_Thesis_Data\Train_Datasets"

## Credit Card Fraud Data

In [44]:
cc_fraud = pd.read_csv(join(RAW_DATA_PATH,"creditcard.csv"))

In [45]:
cc_fraud.Class.value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [13]:
def resample_N(percent, min_class_count):
    return int(((min_class_count*100)/percent)-min_class_count)

In [27]:
cc_fraud_4 = pd.concat(
          [cc_fraud[cc_fraud.Class==0].sample(n=resample_N(4,492), random_state=42),
           cc_fraud[cc_fraud.Class==1]
          ]
)

In [28]:
cc_fraud_3 = pd.concat(
          [cc_fraud[cc_fraud.Class==0].sample(n=resample_N(3,492), random_state=42),
           cc_fraud[cc_fraud.Class==1]
          ]
)

In [29]:
cc_fraud_2 = pd.concat(
          [cc_fraud[cc_fraud.Class==0].sample(n=resample_N(2,492), random_state=42),
           cc_fraud[cc_fraud.Class==1]
          ]
)

In [30]:
cc_fraud_1 = pd.concat(
          [cc_fraud[cc_fraud.Class==0].sample(n=resample_N(1,492), random_state=42),
           cc_fraud[cc_fraud.Class==1]
          ]
)

In [31]:
cc_fraud_05 = pd.concat(
          [cc_fraud[cc_fraud.Class==0].sample(n=resample_N(0.5,492), random_state=42),
           cc_fraud[cc_fraud.Class==1]
          ]
)

In [46]:
cc_fraud_4.to_excel(join(PREPARED_DATA_PATH,"cc_fraud_4.xlsx"), index=False)
cc_fraud_3.to_excel(join(PREPARED_DATA_PATH,"cc_fraud_3.xlsx"), index=False)
cc_fraud_2.to_excel(join(PREPARED_DATA_PATH,"cc_fraud_2.xlsx"), index=False)
cc_fraud_1.to_excel(join(PREPARED_DATA_PATH,"cc_fraud_1.xlsx"), index=False)
cc_fraud_05.to_excel(join(PREPARED_DATA_PATH,"cc_fraud_05.xlsx"), index=False)

In [47]:
cc_fraud_2.shape

(24600, 31)

In [48]:
from sklearn.model_selection import train_test_split 

In [49]:
cc_fraud_2_train, cc_fraud_2_test = train_test_split(cc_fraud_2, random_state=42,  
                                                    test_size=0.1, shuffle=True)

In [53]:
cc_fraud_2_train.Class.value_counts()/len(cc_fraud_2_train)

Class
0    0.97972
1    0.02028
Name: count, dtype: float64

In [54]:
cc_fraud_2_test.Class.value_counts()/len(cc_fraud_2_train)

Class
0    0.109169
1    0.001942
Name: count, dtype: float64

In [55]:
cc_fraud_2_test.to_excel(join(TEST_DATA_PATH,"cc_fraud_2_test.xlsx"), index=False)
cc_fraud_2_train.to_excel(join(TRAIN_DATA_PATH,"cc_fraud_2_train.xlsx"), index=False)

## Crime Data

In [4]:
from imblearn.datasets import fetch_datasets

# Fetch dataset from imbalanced-learn library 
 # as a dictionary of numpy array
us_crime = fetch_datasets()['us_crime']

In [5]:
# Convert the dictionary to a pandas dataframe
crime_df = pd.concat([pd.DataFrame(us_crime['data'], columns = [f'data_{i}' for i in range(us_crime.data.shape[1])]),
           pd.DataFrame(us_crime['target'], columns = ['target'])], axis = 1)

In [6]:
crime_df

Unnamed: 0,data_0,data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,data_9,...,data_91,data_92,data_93,data_94,data_95,data_96,data_97,data_98,data_99,target
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,-1
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,1
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,-1
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,-1
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,0.22,0.28,0.34,0.48,0.39,0.01,0.28,0.05,0.00,-1
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,0.53,0.25,0.17,0.10,0.00,0.02,0.37,0.20,0.00,-1
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.25,0.68,0.61,0.79,0.76,0.08,0.32,0.18,0.91,-1
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.45,0.64,0.54,0.59,0.52,0.03,0.38,0.33,0.22,-1
