### Libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

### Data Loading

In [23]:
# Read the four input CSV files. Rows of the training file that contain any
# missing value are discarded right after loading.
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
data = pd.read_csv("input/training.csv").dropna(axis=0)
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

# Separate the target label from the feature columns without mutating `data`.
y = data["FraudResult"]
X = data.drop(columns=["FraudResult"])

# Hold out a validation split; the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Names of the categorical (object-typed) columns, in their original order.
cols_cat = [name for name, dtype in data.dtypes.items() if dtype == "object"]

In [3]:
# Show the column labels of the cleaned training frame (the target column is still included).
data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [4]:
# Summary statistics of the feature frame; called without arguments, only the numeric columns are summarised.
X.describe()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy
count,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974
std,0.0,123306.8,123122.1,0.732924
min,256.0,-1000000.0,2.0,0.0
25%,256.0,-50.0,275.0,2.0
50%,256.0,1000.0,1000.0,2.0
75%,256.0,2800.0,5000.0,2.0
max,256.0,9880000.0,9880000.0,4.0


In [5]:
# Summary (count / unique / top / freq) of the object-typed feature columns; int64 and float64 columns are excluded.
X.describe(include=object, exclude=[np.int64, np.float64])

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,ProviderId,ProductId,ProductCategory,ChannelId,TransactionStartTime
count,95662,95662,95662,95662,95662,95662,95662,95662,95662,95662,95662
unique,95662,94809,3633,3627,3742,1,6,23,9,4,94556
top,TransactionId_76871,BatchId_67019,AccountId_4841,SubscriptionId_3829,CustomerId_7343,UGX,ProviderId_4,ProductId_6,financial_services,ChannelId_3,2018-12-24T16:30:13Z
freq,1,28,30893,32630,4091,95662,38189,32635,45405,56935,17


# First Model
A very simple model with no feature engineering, to serve as a performance reference when we improve our model. Rows with missing values are dropped for simplicity. At first, the categorical columns are also dropped.

In [30]:
# Baseline feature matrices: the train/validation features with every
# categorical column removed. `drop` returns a new frame, so the original
# split (train_X / val_X) is left untouched.
train_X1 = train_X.drop(columns=cols_cat)
val_X1 = val_X.drop(columns=cols_cat)

# Independent copies of the targets dedicated to this first model.
train_y1 = train_y.copy()
val_y1 = val_y.copy()

train_X1.head()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy
43878,256,2500.0,2500,2
55031,256,1000.0,1000,2
28483,256,2000.0,2000,2
7328,256,-1000.0,1000,2
18860,256,-5000.0,5000,2


In [31]:
# Baseline: fit a decision tree on the numeric-only training features and
# predict the validation rows. The fixed random_state keeps the run reproducible.
first_model_decision_tree = DecisionTreeRegressor(random_state=1)
first_model_decision_tree.fit(train_X1, train_y1)
predicted_first_model = first_model_decision_tree.predict(val_X1)

# Place each validation row next to its true label and the model output so
# they can be inspected side by side. `join` already returns a new frame, so
# no defensive copies are needed.
df_with_res = val_X1.join(val_y1)
df_with_res["Predictions"] = predicted_first_model

# Score against val_y1 (the target copy made for this model) rather than val_y,
# so the whole baseline consistently uses the *1 variables.
print(mean_absolute_error(val_y1, predicted_first_model))
df_with_res.head(10)

0.0007605343032114977


Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult,Predictions
69075,256,2000.0,2000,4,0,0.0
65709,256,1110.0,1110,2,0,0.0
5430,256,2000.0,2000,2,0,0.0
82375,256,-1000.0,1000,2,0,0.0
60896,256,1000.0,1000,2,0,0.0
6835,256,-496.0,496,2,0,0.0
7384,256,1500.0,1500,2,0,0.0
71953,256,7000.0,7000,2,0,0.0
5423,256,1000.0,1000,2,0,0.0
49255,256,1000.0,1000,2,0,0.0


## Random Forest Regressor

In [32]:
# Second baseline: a random forest trained on the same numeric-only features
# as the decision tree, scored with the same metric for a direct comparison.
# RandomForestRegressor and mean_absolute_error come from the import cell at
# the top of the notebook.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X1, train_y1)
preds = forest_model.predict(val_X1)
print(mean_absolute_error(val_y1, preds))

0.0007375730455504398
