In [12]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

In [13]:
# Load the per-ID LOR statistics from a CSV located relative to the working directory.
source_csv = 'fraud_data_calculations.csv'
data = pd.read_csv(source_csv)

In [14]:
# Preview the first rows of the freshly loaded frame (head() defaults to 5 rows).
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Total Number of LORs,Invalid Number of LORs,Similarity Score,Cyclic LOR
0,0,0.0,2.0,0,0.844969,0.0
1,1,1.0,4.0,1,0.887695,0.0
2,2,2.0,5.0,2,0.887838,0.0
3,3,3.0,2.0,1,0.690539,1.0
4,4,4.0,7.0,1,0.810093,0.0


In [15]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# `columns=` names the axis explicitly instead of relying on `axis=True` being
# coerced to 1, and `errors='ignore'` keeps this cell re-runnable once the column
# is already gone (the in-place version raised KeyError on a second run).
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Preview the frame again to confirm the 'Unnamed: 0' column is gone.
data.head()

Unnamed: 0,ID,Total Number of LORs,Invalid Number of LORs,Similarity Score,Cyclic LOR
0,0.0,2.0,0,0.844969,0.0
1,1.0,4.0,1,0.887695,0.0
2,2.0,5.0,2,0.887838,0.0
3,3.0,2.0,1,0.690539,1.0
4,4.0,7.0,1,0.810093,0.0


In [17]:
# List the remaining column labels; the cells below index the frame by these exact names.
data.columns

Index(['ID', 'Total Number of LORs', 'Invalid Number of LORs',
       'Similarity Score', 'Cyclic LOR'],
      dtype='object')

In [18]:
# Step 1: Calculate the ratio of Invalid LORs to Total LORs.
# A total of 0 would make the division yield +/-inf, which then flows into both
# fraud scores and makes max() of the updated score infinite, breaking the
# min-max normalization for every row. Zero totals are therefore masked to NaN
# (Series.where leaves NaN where the condition is False), so such rows get a NaN
# ratio, which pandas min()/max() skip by default.
total_lors = data['Total Number of LORs']
data['Invalid_LOR_Ratio'] = data['Invalid Number of LORs'] / total_lors.where(total_lors != 0)

In [19]:
# Hand-picked starting weights for the three parameters of the initial fraud score.
# NOTE(review): the negative weight means a higher invalid-LOR ratio lowers the
# initial fraud_score -- confirm this direction is intended.
(
    weight_invalid_lor_ratio,
    weight_similarity_score,
    weight_cyclic_lor,
) = (-1, 1, 1)

In [20]:
# Initial fraud score: each parameter scaled by its hand-picked weight, then
# summed in the order invalid-ratio, similarity, cyclic.
invalid_ratio_term = data['Invalid_LOR_Ratio'] * weight_invalid_lor_ratio
similarity_term = data['Similarity Score'] * weight_similarity_score
cyclic_term = data['Cyclic LOR'] * weight_cyclic_lor
data['fraud_score'] = invalid_ratio_term + similarity_term + cyclic_term

In [21]:
# Inspect the first rows with the new Invalid_LOR_Ratio and fraud_score columns.
data.head()

Unnamed: 0,ID,Total Number of LORs,Invalid Number of LORs,Similarity Score,Cyclic LOR,Invalid_LOR_Ratio,fraud_score
0,0.0,2.0,0,0.844969,0.0,0.0,0.844969
1,1.0,4.0,1,0.887695,0.0,0.25,0.637695
2,2.0,5.0,2,0.887838,0.0,0.4,0.487838
3,3.0,2.0,1,0.690539,1.0,0.5,1.190539
4,4.0,7.0,1,0.810093,0.0,0.142857,0.667236


In [22]:
# Set up a decision-tree fit of the hand-weighted score; the tree's feature
# importances are used further down as the updated weights.
# (A regression tree splits on impurity reduction -- squared error by default in
# scikit-learn -- not on information gain.)
features = ['Invalid_LOR_Ratio', 'Similarity Score', 'Cyclic LOR']
y_fraud = data.loc[:, 'fraud_score']
X_fraud = data.loc[:, features]

In [23]:
# Precaution: discard every row whose features contain NaN or +/-inf, and keep
# only the targets that share an index label with the surviving rows.
X_fraud_without_inf = X_fraud.replace([np.inf, -np.inf], np.nan)
X_fraud_cleaned = X_fraud_without_inf.dropna(axis=0, how='any')
y_fraud_cleaned = y_fraud.loc[X_fraud_cleaned.index]

In [24]:
# Fit a regression tree with default hyperparameters on the cleaned rows;
# the fixed seed keeps the fit reproducible between runs.
tree_seed = 42
tree_model_fraud = DecisionTreeRegressor(random_state=tree_seed)
tree_model_fraud.fit(X=X_fraud_cleaned, y=y_fraud_cleaned)

In [25]:
# Importances of the fitted tree, one value per training column, in the column
# order of X_fraud_cleaned (i.e. the order of `features`).
feature_importances_fraud = tree_model_fraud.feature_importances_

In [26]:
# Use the tree's feature importances as the updated weights, looked up by feature
# name so the pairing does not depend on remembering positional indices.
# NOTE(review): scikit-learn importances are non-negative and normalized, so the
# sign of the initial invalid-ratio weight (-1) is not carried over -- confirm
# that a positive contribution from Invalid_LOR_Ratio is intended.
importance_by_feature = dict(zip(features, feature_importances_fraud))
updated_weight_invalid_lor_ratio = importance_by_feature['Invalid_LOR_Ratio']
updated_weight_similarity_score = importance_by_feature['Similarity Score']
updated_weight_cyclic_lor = importance_by_feature['Cyclic LOR']

In [27]:
# Updated fraud score: the same three parameters, now scaled by the tree-derived
# weights and summed in the order invalid-ratio, similarity, cyclic.
updated_invalid_ratio_term = data['Invalid_LOR_Ratio'] * updated_weight_invalid_lor_ratio
updated_similarity_term = data['Similarity Score'] * updated_weight_similarity_score
updated_cyclic_term = data['Cyclic LOR'] * updated_weight_cyclic_lor
data['fraud_score_updated'] = (
    updated_invalid_ratio_term + updated_similarity_term + updated_cyclic_term
)

In [28]:
# Scaling the values into the [0, 1] range: first capture the observed minimum
# and maximum of the updated score (used by the min-max formula that follows).
updated_scores = data['fraud_score_updated']
min_value_updated, max_value_updated = updated_scores.min(), updated_scores.max()

In [29]:
# Min-max normalization of the updated score: (x - min) / (max - min).
# When every score is identical the span is 0 and the plain formula would turn
# all rows into NaN (0/0). In that case the offset from the minimum -- which is
# 0.0 for every non-missing row -- is used directly as the normalized value.
score_span_updated = max_value_updated - min_value_updated
score_offset_updated = data['fraud_score_updated'] - min_value_updated
data['fraud_score_updated_normalized'] = (
    score_offset_updated / score_span_updated
    if score_span_updated != 0
    else score_offset_updated
)

In [30]:
# Inspect the first rows with the updated and normalized fraud score columns added.
data.head()

Unnamed: 0,ID,Total Number of LORs,Invalid Number of LORs,Similarity Score,Cyclic LOR,Invalid_LOR_Ratio,fraud_score,fraud_score_updated,fraud_score_updated_normalized
0,0.0,2.0,0,0.844969,0.0,0.0,0.844969,0.05123,0.017983
1,1.0,4.0,1,0.887695,0.0,0.25,0.637695,0.121328,0.103368
2,2.0,5.0,2,0.887838,0.0,0.4,0.487838,0.161841,0.152717
3,3.0,2.0,1,0.690539,1.0,0.5,1.190539,0.846223,0.986358
4,4.0,7.0,1,0.810093,0.0,0.142857,0.667236,0.087691,0.062396


In [38]:
# Export only the ID and fraud_score_updated_normalized columns to a CSV file,
# leaving the DataFrame's row index out of the output.
output_file_path = 'fraud_score_updated_normalized.csv'
export_columns = ['ID', 'fraud_score_updated_normalized']
output_df = data.loc[:, export_columns]
output_df.to_csv(path_or_buf=output_file_path, index=False)