# Second models
We are now trying to add the categorical features. There is no ordinal feature in the dataset, we will the use the One-Hot Encoding method to deal with it. Here is the basis on how the data are transformed : 
- Deleting the columns ['CurrencyCode', 'CountryCode'] because it only has one value and won't have influence on the target. 
- Set the index to the id of the TransactionId column
- Adding some data related to the date. We created the columns from the TransactionStartTime : the day, the hour, the day of the week, and the week of the month. After evaluation of the MI, we considered that the seconds and minutes of the transaction were did not have a lot of influence of the target and will not be considered to not add too much new features. 
- Drop the column datetime because the models don't takes this format
- Create the One-Hot-Encoding on the low cardinality columns : ['ProviderId', 'ProductCategory', 'ChannelId', 'PricingStrategy']
- Transforming the string columns into int

In [122]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import datetime as dt
from math import ceil
from Utils import *
pd.set_option('max_colwidth', None)
from EvaluationMetric import *

In [123]:
raw_data.PricingStrategy.value_counts()

2    79848
4    13562
1     1867
0      385
Name: PricingStrategy, dtype: int64

In [124]:
def transactioId_to_index(data):
    data[['dc', 'new_index']] = data.TransactionId.str.split("_", expand = True)
    data.drop(['dc','TransactionId'], inplace=True, axis=1)
    data.rename(columns={"new_index": "TransactionId"}, inplace=True)
    data.set_index('TransactionId', inplace=True)
    return data

In [125]:
def adding_date_col(X, date_col):
    day = [i.day for i in X[date_col]]
    hour = [i.hour for i in X[date_col]]
    l_week = [week_of_month(x) for x in X[date_col].tolist()]
    X['Day'] = day
    X['Hour'] = hour
    X['week_day'] = X[date_col].dt.dayofweek
    X["weeks"] = l_week
    return X

In [126]:
def added_column(describe):
    """
    Function to add information on  new columns and keep the describe's df updated
    If there is a missing data in one column, replace it with : np.nan
    """
    new_col = [
        ["Day", "The day in the month of the transaction", "int64", np.nan],
        ["Hour", "The hour of the transaction", "int64", np.nan],
        ["week_day", "The day of the week of the transaction", "int64", np.nan],
        ["weeks", "The week of the month of the transaction", "int64", np.nan]
    ]
    df1 = pd.DataFrame(new_col, columns=['Column Name', 'Definition', "Dtype", "unique"])
    describe = pd.concat([describe, df1])
    describe.reset_index(inplace=True)
    describe.drop(['index'], axis=1, inplace=True) #Only the features data
    return describe

In [127]:
def getscoreforcsv(index_val, preds_val, name_file = "resultsfile.csv"):
    data = {'TransactionId': index_val,
        'FraudResult': preds_val}

    df = pd.DataFrame(data)

    df.to_csv(name_file, index=False)
    print("done")
    return df

## 1. With OHE and without TransactionStartTime column

In [128]:
# data cleaning
#Data loading
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)
        
medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number

l_col_str = ["BatchId", "AccountId", "SubscriptionId", "CustomerId", "ProviderId", "ProductId", "ChannelId"]
for col in l_col_str:
    data[['dc', 'new_col']] = data[col].str.split("_", expand = True)
    data.drop(['dc',col], inplace=True, axis=1)
    data.rename(columns={"new_col": col}, inplace=True)
    data[col] = data[col].astype('int')
    X_test[['dc', 'new_col']] = X_test[col].str.split("_", expand = True)
    X_test.drop(['dc',col], inplace=True, axis=1)
    X_test.rename(columns={"new_col": col}, inplace=True)
    X_test[col] = X_test[col].astype('int')
data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

#Other data
#Information on columns on raw_data
## P-e transformer ca en une fonction ? 
info = pd.DataFrame(data = raw_data.dtypes)
info.reset_index(inplace=True)
info.rename({'index':'Column Name', 0: 'Dtype'}, axis=1, inplace=True)
describe = def_feature.copy()
describe = describe.merge(info)
unique_val = []
for col in list(describe["Column Name"]) : 
    unique_val.append(len(raw_data[col].unique()))
describe["unique"]=unique_val
describe = added_column(describe)
cat_cols = [col for col in train_X.columns if train_X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie

#print(describe)


#low_cardinality_cols=["ProviderId", "ProductCategory", "ChannelId", "PricingStrategy"] 
#train_X[low_cardinality_cols] = train_X[low_cardinality_cols].astype(str) 
#val_X[low_cardinality_cols] = val_X[low_cardinality_cols].astype(str) 

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
#print(f'Col avant : {train_X.columns}')
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)
#print(f'col num : {num_X_train.columns}')

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
#print(f'col apres concat : {OH_X_train.columns}')
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid

In [129]:
train_X.head()

Unnamed: 0_level_0,Amount,Value,Day,Hour,week_day,weeks,BatchId,AccountId,SubscriptionId,CustomerId,...,13,14,15,16,17,18,19,20,21,22
TransactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
82059,2500.0,2500.0,28,10,4,4,48408,2946,3991,3371,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
115282,1000.0,1000.0,9,7,2,2,5122,4468,3787,4928,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
36103,2000.0,2000.0,15,15,5,2,117608,3750,3024,4193,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5655,-1000.0,1000.0,24,8,5,4,75137,4841,3829,4371,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
87110,-5000.0,5000.0,7,6,4,1,84248,4841,3829,2710,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [130]:
train_X.columns

Index(['Amount', 'Value', 'Day', 'Hour', 'week_day', 'weeks', 'BatchId',
       'AccountId', 'SubscriptionId', 'CustomerId', 'ProductId', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15',
       '16', '17', '18', '19', '20', '21', '22'],
      dtype='object')

## Try on models
### Decision Tree Classifier

In [131]:
model = DecisionTreeClassifier(random_state=1)
model.fit(train_X, train_y)
model_pred = model.predict(val_X)
model_test = model.predict(OH_X_test)
df = getscoreforcsv(index_val, model_test)
report = report(val_y, model_pred, "Decision Tree Classifier", "with Categories OHE without DateTime", csvw = False)
report['PublicScore'] = 0.59375
report['PrivateScore'] = 0.590909090
report

done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore
0,Decision Tree Classifier,with Categories OHE without DateTime,2023-04-29 17:15:55.932607,0.730769,0.95,0.826087,0.024113,0.8329,0.59375,0.590909


In [132]:
def test_leaf(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    description = "max_leaf_nodes : "+str(max_leaf_nodes)
    metrics = report(val_y, preds_val, "Decision Tree Classifier", description, csvw=False)
    return(metrics)


results = pd.DataFrame()
result_tree = report
for max_leaf_nodes in [3, 4, 5, 6, 7, 8, 9, 10]:
    my_metrics_tree = test_leaf(max_leaf_nodes, train_X, val_X, train_y, val_y)
    results = pd.concat([results,my_metrics_tree], ignore_index=True)
    result_tree = pd.concat([result_tree,my_metrics_tree], ignore_index=True)
result_tree

TypeError: 'DataFrame' object is not callable

In [133]:
testcode()

NameError: name 'testcode' is not defined

In [None]:
report

![image.png](attachment:image.png)
This one is worst with category than without

## 2. With OHE and without TransactionStartTime column or date columns

In [None]:
# data cleaning
#Data loading
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)
        
medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
#data = adding_date_col(data, 'TransactionStartTime')
#X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number

l_col_str = ["BatchId", "AccountId", "SubscriptionId", "CustomerId", "ProviderId", "ProductId", "ChannelId"]
for col in l_col_str:
    data[['dc', 'new_col']] = data[col].str.split("_", expand = True)
    data.drop(['dc',col], inplace=True, axis=1)
    data.rename(columns={"new_col": col}, inplace=True)
    data[col] = data[col].astype('int')
    X_test[['dc', 'new_col']] = X_test[col].str.split("_", expand = True)
    X_test.drop(['dc',col], inplace=True, axis=1)
    X_test.rename(columns={"new_col": col}, inplace=True)
    X_test[col] = X_test[col].astype('int')
data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

#Other data
#Information on columns on raw_data
## P-e transformer ca en une fonction ? 
info = pd.DataFrame(data = raw_data.dtypes)
info.reset_index(inplace=True)
info.rename({'index':'Column Name', 0: 'Dtype'}, axis=1, inplace=True)
describe = def_feature.copy()
describe = describe.merge(info)
unique_val = []
for col in list(describe["Column Name"]) : 
    unique_val.append(len(raw_data[col].unique()))
describe["unique"]=unique_val
describe = added_column(describe)
cat_cols = [col for col in train_X.columns if train_X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie

#print(describe)


#low_cardinality_cols=["ProviderId", "ProductCategory", "ChannelId", "PricingStrategy"] 
#train_X[low_cardinality_cols] = train_X[low_cardinality_cols].astype(str) 
#val_X[low_cardinality_cols] = val_X[low_cardinality_cols].astype(str) 

OH_encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
#print(f'Col avant : {train_X.columns}')
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)
#print(f'col num : {num_X_train.columns}')

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
#print(f'col apres concat : {OH_X_train.columns}')
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid