**Anti-Fraud Model** by *Sohail, Ahmed,* and *Hiten*

---

We use a **Feedforward Neural Network (FNN)** because our dataset is structured and tabular.

Our initial goal was that....

In [None]:
# IMPORTING NECESSARY LIBRARIES
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import KernelPCA

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input
import tensorflow as tf

import time
import seaborn as sns
import category_encoders as ce

In [None]:
# LOADING THE DATASET

import pandas as pd

# Load the cleaned dataset. A forward slash is used as the separator because
# 'Dataset\C...' only resolves on Windows, and '\C' is an invalid escape
# sequence inside a normal (non-raw) string literal.
df = pd.read_csv('Dataset/Cleaned_AntiFraud_Centre_Dataset.csv')

# Quick look at the first rows and a summary of every column.
print(df.head())
print(f"\n\n{df.describe(include='all')}")
print("\n")

# Data type of each column
print(df.dtypes)

------------------------------------ **Graphing and Analysis of Raw Data** --------------------------------

In [None]:
# GRAPH AND ANALYSIS
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the dataset. A forward slash keeps the path portable: the original
# backslash form only resolves on Windows and relies on the invalid escape '\C'.
df = pd.read_csv('Dataset/Cleaned_AntiFraud_Centre_Dataset.csv')

# Statistical Summary of the dataset, some of the outputs are for numerical features only 
# so it will result in a value of NaN in the categorical features output
print(df.describe(include='all'))

# Our target is "Complaint Type". A victim = a fraud case and others are not fraud cases.
sns.countplot(x='Complaint Type', data=df)
plt.title('Class Distribution')
plt.show()

print("\n\n\n")


In [None]:
# FUNCTION TO SEE CORRELATION OF FEATURES - TARGET
# FUNCTION WILL PRINT 9 GRAPHS

import pandas as pd
# Forward slash keeps the path portable (the backslash form only works on Windows).
df = pd.read_csv('Dataset/Cleaned_AntiFraud_Centre_Dataset.csv')
import seaborn as sns
import matplotlib.pyplot as plt

def plot_feature_target_correlation(df, columns, target, top_n=10):
    """Draw one count plot per feature, split by the target class.

    For every column in ``columns`` only the ``top_n`` most frequent values
    are kept, so high-cardinality features stay readable.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        columns: Iterable of feature column names to plot.
        target: Name of the column used as the ``hue`` of each count plot.
        top_n: Number of most frequent categories kept per feature.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    for col in columns:
        # Restrict the plot to the most frequent categories of this feature.
        top_categories = df[col].value_counts().nlargest(top_n).index
        filtered_df = df[df[col].isin(top_categories)]

        # Create the wide figure BEFORE plotting and draw on its axes.
        # Calling plt.figure() after the plot opened a second, empty figure
        # and left the actual chart at the default size.
        fig, ax = plt.subplots(figsize=(16, 6))
        sns.countplot(x=col, hue=target, data=filtered_df, ax=ax)
        ax.set_title(f" Top {top_n} {col} vs {target}")
        ax.tick_params(axis='x', rotation=45)
        plt.show()
        
        
# Column used as the hue (class) in every chart.
target = 'Complaint Type'

# Features plotted against the target, one chart per entry.
columns = [
    'Date Received',
    'Complaint Received Type',
    'Country',
    'Province/State',
    'Fraud and Cybercrime Thematic Categories',
    'Solicitation Method',
    'Gender',
    'Language of Correspondence',
    'Victim Age Range',
]

plot_feature_target_correlation(df, columns, target)

In [None]:
# TARGET VARIABLE: number of rows per "Complaint Type" class

complaint_type_counts = df["Complaint Type"].value_counts()
print(complaint_type_counts)

----------------------------------------- **Pre-processing and Handling Missing Data** ------------------------------------------------

In [None]:
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Reload the raw dataset so this cell always starts from the same state.
# A forward slash keeps the path portable: the backslash form only resolves on
# Windows and relies on the invalid escape sequence '\C'.
df = pd.read_csv('Dataset/Cleaned_AntiFraud_Centre_Dataset.csv')

## [--HANDLING MISSING DATA--]---------------------------------------

# Convert Not Specified, Not Available, etc. into a missing-value marker.
# Assigning the result (instead of inplace=True) keeps the cell re-runnable.
df = df.replace(["Not Specified", "Not Available / non disponible", "Not Available"], pd.NA)

# Province/State, Country, Gender and Language of Correspondence:
# keep missingness as its own explicit "Unknown" category.
for unknown_column in ['Province/State', 'Country', 'Gender', 'Language of Correspondence']:
    df[unknown_column] = df[unknown_column].fillna('Unknown')

# Change NaN values of Solicitation Method to the mode (most frequent value)
mode_solicitation = df["Solicitation Method"].mode()[0]
df["Solicitation Method"] = df["Solicitation Method"].fillna(mode_solicitation)

# Convert "$x,xxx.xx" to a float and replace missing values with the column mean.
# (Zero losses are left untouched; only missing values are imputed.)
# The pattern is a raw string so '\$' is not treated as a string escape, and
# pd.to_numeric is used because astype(float) fails on the pd.NA values that
# the replacement above may have introduced; unparseable entries become NaN
# and are imputed together with the missing ones.
dollar_loss_text = df["Dollar Loss"].replace(r'[\$,]', '', regex=True)
df["Dollar Loss"] = pd.to_numeric(dollar_loss_text, errors='coerce').astype(float)
df["Dollar Loss"] = df["Dollar Loss"].fillna(df["Dollar Loss"].mean())


## [--EXTRACTING DATE FEATURES--]-------------------------------------------------------

# Convert to DateTime format (values that cannot be parsed become NaT)
df['Date Received'] = pd.to_datetime(df['Date Received'], errors='coerce')

# Create new features "Year", "Month", "Day", and "DayOfTheWeek"
df['Year'] = df['Date Received'].dt.year
df['Month'] = df['Date Received'].dt.month
df['Day'] = df['Date Received'].dt.day
df['DayOfTheWeek'] = df['Date Received'].dt.dayofweek


## [--VICTIM AGE RANGE TO ORDINAL VARIABLE--]---------------------------------------------------

age_order = {
    "'Not Available / non disponible": 0,
    "'Under 20": 1,
    "'20 - 29": 2,
    "'30 - 39": 3,
    "'40 - 49": 4,
    "'50 - 59": 5,
    "'60 - 69": 6,
    "'70 - 79": 7,
    "'80 and over": 8
}

# Match labels with or without the leading apostrophe / surrounding spaces.
# Anything that is missing or not listed above gets code 0 (the code already
# used for "not available") instead of silently turning into NaN, which would
# otherwise flow through the scaler into the model.
age_lookup = {label.lstrip("'").strip(): code for label, code in age_order.items()}
age_labels = df["Victim Age Range"].astype("string").str.strip().str.lstrip("'").str.strip()
df["Victim Age Range"] = age_labels.map(age_lookup).fillna(0).astype(int)

## [--TARGET VARIABLE --> BINARY--]---------------------------------------------------------------

# 1 when the complaint type is exactly "Victim" (ignoring surrounding spaces),
# 0 otherwise. Missing complaint types count as 0 instead of raising on .strip().
df['Is_Fraud'] = df['Complaint Type'].fillna('').str.strip().eq('Victim').astype(int)

## [--DROPPING UNNECESSARY COLUMNS--]-------------------------
cols_to_drop = [
    "Number ID",        # Just an identifier, no real impact
    "Complaint Type",   # Redundant because it has been converted to the binary Is_Fraud
    "Date Received",    # Already split into year, day, month, and day of the week
]

df = df.drop(columns=cols_to_drop)

## [--SCALING NUMERICAL VARIABLES--]--------------------------------------------------------------------

# Fixing cyclic variables to reflect actual cycles (months, days of the week).
# `np` comes from the notebook's first import cell.
# NOTE(review): the raw 'Month' and 'DayOfTheWeek' columns are still present
# after this step - confirm whether they should be dropped before training.
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
df['DayOfTheWeek_sin'] = np.sin(2 * np.pi * df['DayOfTheWeek'] / 7)
df['DayOfTheWeek_cos'] = np.cos(2 * np.pi * df['DayOfTheWeek'] / 7)

scaler = StandardScaler()

numerical_features = ['Victim Age Range', 'Number of Victims', 'Dollar Loss', 'Year', 'Month_sin', 'Month_cos', 'DayOfTheWeek_sin', 'DayOfTheWeek_cos' ]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the scaled training data.
# Consider fitting it on the training rows only.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

-------------------------------------------------------- **Splitting Into Training and Testing Sets** -------------------------------------------------

In [None]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from category_encoders import TargetEncoder 
from sklearn.preprocessing import LabelEncoder, StandardScaler

## [--DEFINING THE TESTING/TRAINING SPLIT--]-------------------------------------------------

x = df.drop(columns=["Is_Fraud"])
y = df["Is_Fraud"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

# Work on explicit copies so the column assignments below modify the split
# frames themselves and never act on a view of `x`.
x_train = x_train.copy()
x_test = x_test.copy()

## [--TARGET ENCODING "COUNTRY" (HIGH CARDINALITY)--]----------------------------------------

# Fitted on the training rows only, then applied unchanged to the test rows.
encoder = TargetEncoder()
x_train["Country"] = encoder.fit_transform(x_train["Country"], y_train)

x_test["Country"] = encoder.transform(x_test["Country"])

## [--LABEL ENCODING CATEGORICAL VARIABLES]--------------------------------------------------

label_encoded_columns = [
    "Province/State",
    "Complaint Received Type",
    "Gender",
    "Fraud and Cybercrime Thematic Categories",
    "Solicitation Method",
    "Language of Correspondence",
]

# One encoder per column. A single shared LabelEncoder is re-fitted by every
# fit_transform call, so the test transforms would all use the classes of the
# last column fitted. LabelEncoder.fit_transform also accepts only the values
# to encode, so y_train must not be passed to it.
label_encoders = {}
for column in label_encoded_columns:
    column_encoder = LabelEncoder()
    # astype(str) gives missing values a sortable string label of their own.
    x_train[column] = column_encoder.fit_transform(x_train[column].astype(str))
    label_encoders[column] = column_encoder

    # Categories that only occur in the test split are encoded as -1 instead
    # of raising "previously unseen labels" from LabelEncoder.transform.
    code_by_label = {label: code for code, label in enumerate(column_encoder.classes_)}
    x_test[column] = x_test[column].astype(str).map(code_by_label).fillna(-1).astype(int)


-------------------------------------------------------- **Building a Deep Learning FNN Model** --------------------------------------------------------

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import KernelPCA

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input
import tensorflow as tf

import time
import seaborn as sns
import category_encoders as ce

# Concatenate and Dropout are used below, but the notebook's import cells only
# bring in Dense and Input from tensorflow.keras.layers.
from tensorflow.keras.layers import Concatenate, Dropout

## DEFINE INPUT LAYERS
# One scalar input per feature column.
# NOTE(review): several input names contain spaces or '/'; some Keras versions
# reject such layer names - confirm against the installed version.
# NOTE(review): no input is defined for "Language of Correspondence", "Month"
# or "DayOfTheWeek" - confirm that leaving them out is intended.

province_input = Input(shape=(1,), name="Province/State")
country_input = Input(shape=(1,), name="Country")
dollar_loss_input = Input(shape=(1,), name="Dollar Loss")
victim_age_range_input = Input(shape=(1,), name="Victim Age Range")
complaint_received_type_input = Input(shape=(1,), name="Complaint Received Type")
fraud_and_cybercrime_category_input = Input(shape=(1,), name="Fraud and Cybercrime Thematic Categories")
solicitation_input = Input(shape=(1,), name="Solicitation Method")
gender_input = Input(shape=(1,), name="Gender")
number_of_victims_input = Input(shape=(1,), name="Number of Victims")
year_input = Input(shape=(1,), name="Year")
month_sin_input = Input(shape=(1,), name="Month_sin")
month_cos_input = Input(shape=(1,), name="Month_cos")
dayoftheweek_sin_input = Input(shape=(1,), name="DayOfTheWeek_sin")
dayoftheweek_cos_input = Input(shape=(1,), name="DayOfTheWeek_cos")
day_input = Input(shape=(1,), name="Day")

# CONCATENATE ALL INPUTS
# Merge the scalar inputs into one feature vector for the dense layers.
all_inputs = Concatenate()([
    province_input,
    country_input,
    dollar_loss_input,
    victim_age_range_input,
    complaint_received_type_input,
    fraud_and_cybercrime_category_input,
    solicitation_input,
    gender_input,
    number_of_victims_input,
    year_input,
    month_sin_input,
    month_cos_input,
    dayoftheweek_sin_input,
    dayoftheweek_cos_input,
    day_input
])

# DEFINE HIDDEN LAYERS
# The intermediate tensor is named `hidden` so it does not overwrite the
# feature DataFrame `x` created in the train/test split cell.
hidden = Dense(128, activation='relu')(all_inputs)
hidden = Dropout(0.3)(hidden)
hidden = Dense(64, activation='relu')(hidden)
hidden = Dropout(0.2)(hidden)
hidden = Dense(32, activation='relu')(hidden)

# OUTPUT LAYER
# Single sigmoid unit for the binary Is_Fraud target.
output = Dense(1, activation='sigmoid')(hidden)

       Complaint Received Type Country Province/State  \
count                   313976  313976         313976   
unique                      10     154             66   
top               CAFC Website  Canada        Ontario   
freq                    164920  238635          91683   
mean                       NaN     NaN            NaN   
std                        NaN     NaN            NaN   
min                        NaN     NaN            NaN   
25%                        NaN     NaN            NaN   
50%                        NaN     NaN            NaN   
75%                        NaN     NaN            NaN   
max                        NaN     NaN            NaN   

       Fraud and Cybercrime Thematic Categories Solicitation Method  Gender  \
count                                    313976              313976  313976   
unique                                       39                  13       5   
top                              Identity Fraud       Other/unknown  Female   