## 0. Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

## 1. Data preparation

In [2]:
train_features = pd.read_csv('../datasets/train_features.csv')
test_features = pd.read_csv('../datasets/test_features.csv')

train_features.drop(['Unnamed: 0'], axis=1, inplace=True)
test_features.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
#train_features.info()

In [4]:
#test_features.info()

## 2. Model preparation

### 2.1 Categorial Encoding

In [5]:
categories = ['category', 'merchant', 'state', 'job']

#### 2.1.1 Ordinal Encoding

In [6]:
# from sklearn.preprocessing import OrdinalEncoder
# ordinal_encoder = OrdinalEncoder(dtype=np.int64)
#
# ordinal_encoder.fit(train_features.loc[:, categories])
# train_features.loc[:, categories] = ordinal_encoder.transform(train_features[categories])
#
# ordinal_encoder.fit(test_features.loc[:, categories])
# test_features.loc[:, categories] = ordinal_encoder.transform(test_features[categories])
#
# train_features.head()

#### 2.1.2 Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for category in categories:
    train_features[category] = label_encoder.fit_transform(train_features[category])
    test_features[category] = label_encoder.fit_transform(test_features[category])

train_features.head()

Unnamed: 0,age,amount(usd),hour_of_day,category,merchant,state,city_pop,job,is_fraud
0,35.0,4.97,0,8,514,27,3495,370,0
1,44.0,107.23,0,4,241,47,149,428,0
2,61.0,220.11,0,0,390,13,4154,307,0
3,56.0,45.0,0,2,360,26,1939,328,0
4,37.0,41.96,0,9,297,45,99,116,0


#### 2.1.3 Count Encoding

In [8]:
# from category_encoders import CountEncoder
# count_encoder = CountEncoder()
#
# for category in categories:
#     train_features[category] = count_encoder.fit_transform(train_features[category])
#     test_features[category] = count_encoder.fit_transform(test_features[category])
#
# train_features.head()

#### 2.1.4 One-Hot Encoding

It is too long working

In [9]:
# train_features = pd.get_dummies(train_features, columns=categories)
# test_features = pd.get_dummies(test_features, columns=categories)
# train_features

### 2.2 Train-test split

In [10]:
y_train = train_features['is_fraud'].values
X_train = train_features.drop(['is_fraud'], axis='columns').values

y_test = test_features['is_fraud'].values
X_test = test_features.drop(['is_fraud'], axis='columns').values

In [11]:
print('y_train:   ', len(y_train))
print('X_train:   ', len(X_train))
print()
print('y_test:   ', len(y_test))
print('X_test:   ', len(X_test))


y_train:    1296675
X_train:    1296675

y_test:    555719
X_test:    555719


### 2.3. Solving disbalance

#### 2.3.1 SMOTE

In [12]:
from imblearn.over_sampling import SMOTE

method = SMOTE()
X_train_resampled, y_train_resampled = method.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = method.fit_resample(X_test, y_test)

# X_train_resampled = X_train
# y_train_resampled = y_train

In [13]:
print(len(y_train_resampled[y_train_resampled == 0]))
print(len(y_train_resampled[y_train_resampled == 1]))
print(len(X_train_resampled))
print('X_resampled:\t', len(X_train_resampled))
print('y_resampled:\t', len(y_train_resampled))

1289169
1289169
2578338
X_resampled:	 2578338
y_resampled:	 2578338


#### 2.3.2 ADASYN

In [14]:
# from imblearn.over_sampling import ADASYN
# method = ADASYN()
# X_train_resampled, y_train_resampled = method.fit_resample(X_train, y_train)
#
# print(len(y_train_resampled[y_train_resampled == 0]))
# print(len(y_train_resampled[y_train_resampled == 1]))
# print(len(X_train_resampled))
# print('X_resampled:\t', len(X_train_resampled))
# print('y_resampled:\t', len(y_train_resampled))

### 5. Random forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
model2 = RandomForestClassifier(random_state=5)
model2.fit(X_train_resampled,y_train_resampled)

In [17]:
predict=model2.predict(X_test)

In [18]:
print('>>> Confusion matrix:\n', confusion_matrix(y_test, predict), end='\n\n')
print('>>> Classification report:\n', classification_report(y_test, predict), end='\n\n')
print('>>> ROC-AUC:\t', roc_auc_score(y_test, predict))
print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

>>> Confusion matrix:
 [[553084    490]
 [   457   1688]]

>>> Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.78      0.79      0.78      2145

    accuracy                           1.00    555719
   macro avg       0.89      0.89      0.89    555719
weighted avg       1.00      1.00      1.00    555719


>>> ROC-AUC:	 0.8930306148838811
Share of Non-Fraud in Test Data: 0.9961
