## 0. Import libraries

In [32]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, classification_report

## 1. Data preparation

In [33]:
train_features = pd.read_csv('../datasets/train_features.csv')
test_features = pd.read_csv('../datasets/test_features.csv')

train_features.drop(['Unnamed: 0'], axis=1, inplace=True)
test_features.drop(['Unnamed: 0'], axis=1, inplace=True)

In [34]:
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   age          1296675 non-null  float64
 1   amount(usd)  1296675 non-null  float64
 2   hour_of_day  1296675 non-null  int64  
 3   category     1296675 non-null  object 
 4   merchant     1296675 non-null  object 
 5   state        1296675 non-null  object 
 6   city_pop     1296675 non-null  int64  
 7   job          1296675 non-null  object 
 8   is_fraud     1296675 non-null  int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 89.0+ MB


In [35]:
test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   age          555719 non-null  float64
 1   amount(usd)  555719 non-null  float64
 2   hour_of_day  555719 non-null  int64  
 3   category     555719 non-null  object 
 4   merchant     555719 non-null  object 
 5   state        555719 non-null  object 
 6   city_pop     555719 non-null  int64  
 7   job          555719 non-null  object 
 8   is_fraud     555719 non-null  int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 38.2+ MB


## 2. Model preparation

### 2.1 Categorial Encoding

In [36]:
categories = ['category', 'merchant', 'state', 'job']

#### 2.1.1 Ordinal Encoding

In [37]:
# from sklearn.preprocessing import OrdinalEncoder
# ordinal_encoder = OrdinalEncoder(dtype=np.int64)
#
# ordinal_encoder.fit(train_features.loc[:, categories])
# train_features.loc[:, categories] = ordinal_encoder.transform(train_features[categories])
#
# ordinal_encoder.fit(test_features.loc[:, categories])
# test_features.loc[:, categories] = ordinal_encoder.transform(test_features[categories])
#
# train_features.head()

#### 2.1.2 Label Encoding

In [38]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for category in categories:
    train_features[category] = label_encoder.fit_transform(train_features[category])
    test_features[category] = label_encoder.fit_transform(test_features[category])

train_features.head()

Unnamed: 0,age,amount(usd),hour_of_day,category,merchant,state,city_pop,job,is_fraud
0,35.0,4.97,0,8,514,27,3495,370,0
1,44.0,107.23,0,4,241,47,149,428,0
2,61.0,220.11,0,0,390,13,4154,307,0
3,56.0,45.0,0,2,360,26,1939,328,0
4,37.0,41.96,0,9,297,45,99,116,0


#### 2.1.3 Count Encoding

In [39]:
# from category_encoders import CountEncoder
# count_encoder = CountEncoder()
#
# for category in categories:
#     train_features[category] = count_encoder.fit_transform(train_features[category])
#     test_features[category] = count_encoder.fit_transform(test_features[category])
#
# train_features.head()

#### 2.1.4 One-Hot Encoding

It is too long working

In [40]:
# train_features = pd.get_dummies(train_features, columns=categories)
# test_features = pd.get_dummies(test_features, columns=categories)
# train_features

### 2.2 Train-test split

In [41]:
y_train = train_features['is_fraud'].values
X_train = train_features.drop(['is_fraud'], axis='columns').values

y_test = test_features['is_fraud'].values
X_test = test_features.drop(['is_fraud'], axis='columns').values

In [42]:
print('y_train:   ', len(y_train))
print('X_train:   ', len(X_train))
print()
print('y_test:   ', len(y_test))
print('X_test:   ', len(X_test))


y_train:    1296675
X_train:    1296675

y_test:    555719
X_test:    555719


### 2.3. Solving disbalance

#### 2.3.1 SMOTE

In [43]:
from imblearn.over_sampling import SMOTE

method = SMOTE()
X_train_resampled, y_train_resampled = method.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = method.fit_resample(X_test, y_test)

# X_train_resampled = X_train
# y_train_resampled = y_train

In [44]:
print(len(y_train_resampled[y_train_resampled == 0]))
print(len(y_train_resampled[y_train_resampled == 1]))
print(len(X_train_resampled))
print('X_resampled:\t', len(X_train_resampled))
print('y_resampled:\t', len(y_train_resampled))

1289169
1289169
2578338
X_resampled:	 2578338
y_resampled:	 2578338


#### 2.3.2 ADASYN

In [45]:
# from imblearn.over_sampling import ADASYN
# method = ADASYN()
# X_train_resampled, y_train_resampled = method.fit_resample(X_train, y_train)
#
# print(len(y_train_resampled[y_train_resampled == 0]))
# print(len(y_train_resampled[y_train_resampled == 1]))
# print(len(X_train_resampled))
# print('X_resampled:\t', len(X_train_resampled))
# print('y_resampled:\t', len(y_train_resampled))

### 3. Logistic regression modeling

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

In [48]:
predict = model.predict(X_test)

In [49]:
conf_mat = confusion_matrix(y_true=y_test, y_pred=predict)
print(conf_mat)

[[519559  34015]
 [   533   1612]]


<img src="../images/confusion_matrix.jpeg">

In [50]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97    553574
           1       0.05      0.75      0.09      2145

    accuracy                           0.94    555719
   macro avg       0.52      0.85      0.53    555719
weighted avg       1.00      0.94      0.96    555719



In [51]:
from sklearn.metrics import roc_auc_score, fbeta_score
print('ROC-AUC:\t', roc_auc_score(y_test, predict))
print('F-beta:\t', fbeta_score(y_test, predict, beta=1))

ROC-AUC:	 0.8450344926648004
F-beta:	 0.08535423064704013


### 4. SVC modeling

In [52]:
# from sklearn.svm import SVC
#
# svc_model = SVC()
# svc_model.fit(X_train_resampled, y_train_resampled)
#
# svc_predict = svc_model.predict(X_test)
# print(confusion_matrix(y_test, svc_predict))
# print(classification_report(y_test, svc_predict))
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))


### 5. Random forest

In [53]:
# from sklearn.ensemble import RandomForestClassifier
# model2 = RandomForestClassifier(random_state=5)
# model2.fit(X_train_resampled,y_train_resampled)

In [54]:
# predicted=model2.predict(X_test)
# print('Classification report:\n', classification_report(y_test, predicted))
# conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
# print('Confusion matrix:\n', conf_mat)
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))
# print('Share of Non-Fraud in Test Data:', 1-round(y_test.sum()/len(y_test),4))

### 4. Deсision Trees

In [55]:
# from sklearn.tree import DecisionTreeClassifier
#
# clf_model = DecisionTreeClassifier(random_state=42)
# clf_model.fit(X_train_resampled, y_train_resampled)

In [56]:
# predict = clf_model.predict(X_test)
# print('Classification report:\n', classification_report(y_test, predict))
# conf_mat = confusion_matrix(y_true=y_test, y_pred=predict)
# print('Confusion matrix:\n', conf_mat)
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))

### 5. KNN

In [57]:
# from sklearn.neighbors import KNeighborsClassifier
#
# clf = KNeighborsClassifier(n_neighbors=100)
# clf.fit(X_train_resampled, y_train_resampled)

In [58]:
# predict = clf.predict(X_test)
# print(classification_report(y_test, predict))
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))

### 6. Neural Network

In [59]:
# from sklearn.neural_network import MLPClassifier
# mlp_model = MLPClassifier(alpha = 1e-5, hidden_layer_sizes=(5, 5), random_state=1)
# mlp_model.fit(X_train_resampled, y_train_resampled)

In [60]:
# mlp_pred = mlp_model.predict(X_test)
# print(classification_report(y_test, mlp_pred))
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))

### 7. Gradient boosting

In [61]:
# from sklearn.ensemble import GradientBoostingClassifier
#
# gb = GradientBoostingClassifier()
# gb.fit(X_train_resampled, y_train_resampled)

In [62]:
# predict = gb.predict(X_test)
# print(classification_report(y_test, predict))
# print('ROC-AUC:\t', roc_auc_score(y_test, predict))