<a href="https://colab.research.google.com/github/Tikquuss/C_FDEGCC/blob/main/CLI_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository into the current working directory.
! git clone https://github.com/Tikquuss/C_FDEGCC

In [None]:
# Move into the cloned repository so the `from main import ...` in the next cell resolves
# (presumably main.py sits at the repository root).
%cd C_FDEGCC

In [None]:
from main import Transformer, RNN, SigmoidModel, process_frame, FDDataset, FDDataset4Test, train, setting, get_upsampled, get_undersampled

from sklearn.model_selection import train_test_split
import pandas as pd



---
**Make sure you have a data folder in /content structured like this (here I copy it from my Google Drive):**
```
/content
	/data
		/train
			/client_train.csv
			/invoice_train.csv
		/test
			/client_test.csv
			/invoice_test.csv
```



In [None]:
# Colab only: mount Google Drive under /content/drive so the data folder can be copied from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! rsync -av --progress --exclude *.zip /content/drive/MyDrive/"Fraud Detection in Electricity and Gas Consumption"/data /content

# Data

In [None]:
# Switch to /content so the relative "data/..." paths used by the cells below resolve.
%cd /content

### Training and validation data

In [None]:
# Load the raw training tables; paths are relative to the current working directory.
client_df = pd.read_csv("data/train/client_train.csv")
invoice_df = pd.read_csv("data/train/invoice_train.csv")

In [None]:
# Run the project's preprocessing on both training tables.
# NOTE(review): the results overwrite the raw frames, so re-running this cell without
# re-reading the CSVs feeds already-processed frames back into process_frame — confirm that is safe.
client_df, invoice_df= process_frame(client_df, invoice_df)

In [None]:
# Wrap the processed tables in the project's dataset object; its `.batch_size` and `.data`
# attributes are used by the following cells.
dataset = FDDataset(client_df, invoice_df)

In [None]:
# Sanity check: pull a single small batch and display the shapes of its parts.
dataset.batch_size = 8
# Presumably x1/x2 are the two model inputs, y the labels and seq_lens the sequence lengths — confirm in main.py.
x1, x2, y, seq_lens = next(iter(dataset))
x1.shape, x2.shape, y.shape, seq_lens

In [None]:
# Hold out a fixed fraction of the samples for validation; the seed keeps the split repeatable.
val_size = 0.2
random_seed = 0
train_data, val_data = train_test_split(
    dataset.data,
    test_size=val_size,
    random_state=random_seed,
)

### test data

In [None]:
# Load the raw test tables; paths are relative to the current working directory.
test_client_df = pd.read_csv("data/test/client_test.csv")
test_invoice_df = pd.read_csv("data/test/invoice_test.csv")

In [None]:
# Apply the same preprocessing function as for the training tables.
# NOTE(review): the results overwrite the raw test frames, so re-running this cell without
# re-reading the CSVs feeds already-processed frames back into process_frame — confirm that is safe.
test_client_df, test_invoice_df= process_frame(test_client_df, test_invoice_df)

In [None]:
# Test-time dataset wrapper; its `run_test` method is called in the final cell of the notebook.
test_dataset = FDDataset4Test(test_client_df, test_invoice_df)

# Model
**We use option 2 (Transformer) and `type_ = 0` for our Leaderboard submission (Zindi)**

## option1 : RNN/LSTM

In [None]:
# Recurrent option: `variant` picks the cell type, either "LSTM" or "RNN".
variant = "LSTM"
model_class = RNN
# Constructor arguments handed to `setting` together with `model_class`.
# x1_dim / x2_dim presumably are the feature widths of the two dataset inputs — confirm in main.py.
model_kwargs = {
    "x1_dim": 6,
    "x2_dim": 17,
    "hidden_dim": 100,
    "output_dim": 2,
    "dropout": 0.1,
    "variant": variant,
    "num_layers": 2,
    "bidirectional": False,
}

## option2 : Transformer

In [None]:
# Transformer option; running this cell after the RNN one replaces `model_class` and `model_kwargs`.
model_class = Transformer
# Constructor arguments handed to `setting` together with `model_class`.
# d_k / d_v are left as None — presumably derived inside Transformer; confirm in main.py.
model_kwargs = {
    "x1_dim": 6,
    "x2_dim": 17,
    "output_dim": 2,
    "d_model": 4 * 32,
    "num_heads": 4,
    "d_k": None,
    "d_v": None,
    "num_encoder_layers": 3,
    "dim_feedforward": 100,
    "dropout": 0.1,
    "activation": "relu",
}

## softmax or sigmoid
- if `type_ = 0`, the output dimension is $2$: a `softmax` is applied to the model output to turn it into probabilities ($[p, 1-p]$), and the final prediction is the index ($0$ or $1$) of the larger probability.

- if `type_ = 1`, the output dimension is $1$: a `sigmoid` is applied to the model output to turn it into a probability ($p$), and the final prediction is $0$ or $1$ depending on how $p$ compares with the $0.5$ threshold.

In [None]:
# Output-head selector described above: 0 -> softmax over two outputs, 1 -> sigmoid over one output.
# Also used below to name the checkpoint and the output CSV files.
type_ = 0

# prepare data
**We use option 1 (upsampling) and ```normalize = False``` for our Leaderboard submission (Zindi)**

## option 1 : **upsample minority**

In [None]:
# Rebalance the training split with the project's up-sampling helper; the validation split is not touched.
sampled = get_upsampled(train_data)

## option 2 : **downsample majority**

In [None]:
# Alternative to the up-sampling cell above. It is opt-in so that a plain
# "Restart & Run All" keeps the up-sampled `sampled` used for the submission instead of
# silently overwriting it here. Set the flag to True to train on the down-sampled data.
use_undersampling = False
if use_undersampling:
    sampled = get_undersampled(train_data)

## Size of the resampled training set

In [None]:
# Display the size of the resampled training data chosen above.
len(sampled)

## normalize 
If `normalize = True`, batch normalization will be applied to the data before it is supplied to the model.

In [None]:
# Forwarded as the `normalize` argument of FDDataset in the training section below.
normalize = False

# Train

In [None]:
# Training and validation datasets share the same batch size and normalisation setting.
batch_size = 128
train_dataset = FDDataset(
    data=sampled,
    batch_size=batch_size,
    normalize=normalize,
)
val_dataset = FDDataset(
    data=val_data,
    batch_size=batch_size,
    normalize=normalize,
)

In [None]:
# Optimisation settings passed to `setting` / `train` below.
n_epochs = 100
lr = 3e-6
# The checkpoint name carries `type_`, so the softmax and sigmoid runs do not overwrite each other.
save_path = "/content/model%d.pth" % type_

In [None]:
# Build the model from the choices made above, along with the optimizer, criterion and device
# that `train` takes in the next cell.
model, optimizer, criterion, device = setting(model_class, lr, type_, model_kwargs)

In [None]:
# Train on the resampled data while evaluating on the validation split.
# `model` is rebound to whatever `train` returns, so re-running this cell starts from that model.
model = train(
    model,
    optimizer,
    criterion,
    train_dataset,
    val_dataset,
    device,
    n_epochs=n_epochs,
    type_=type_,
    save_path=save_path,
)

# Test (create file for submission)

In [None]:
# Output CSV paths handed to `run_test`; the `type_` suffix keeps runs with different heads apart.
pred_csv_file = "/content/pred_csv_file%d.csv" % type_
prob_csv_file = "/content/prob_csv_file%d.csv" % type_

In [None]:
# Run the trained model over the test set, passing the two output CSV paths defined above.
test_dataset.batch_size = 128
# Presumably probabilities go to prob_csv_file and hard predictions to pred_csv_file — confirm in main.py.
test_dataset.run_test(model, device, prob_csv_file, pred_csv_file, type_)