# **Credit Card Fraud Detection**

Link to the dataset: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

### Import the necessary libraries

In [None]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For advanced data visualization
import kagglehub  # For downloading datasets from Kaggle
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.ensemble import RandomForestClassifier  # For creating a classification model with Random Forest
from sklearn.metrics import classification_report, accuracy_score  # For evaluating the model's performance

import warnings  # For handling warnings
warnings.filterwarnings("ignore")  # Ignore warnings

### Import and organize the dataset

In [None]:
# Fetch the Kaggle dataset and load it into a dataframe

# dataset_download returns the local location of the downloaded files
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print("Path to dataset files:", path)

# Read the transactions CSV found under that location
csv_file = f"{path}/creditcard.csv"
data = pd.read_csv(csv_file)

# Preview the first 10 rows (kept as the last expression so the notebook renders it)
data.head(10)

### Clean the data

a. Missing values

In [None]:
# Tally the null entries column by column
missing_values = data.isna().sum()

# Report the per-column null counts under a heading line
print("Missing values per column:", missing_values, sep="\n")

b. Duplicate rows

In [None]:
# Flag every row that repeats an earlier row, then count the flags
duplicate_mask = data.duplicated()
duplicate_rows = duplicate_mask.sum()

# Report how many duplicated rows were found
print(f"Number of duplicate rows: {duplicate_rows}")


### Analyze the data

Question 1: What is the percentage of fraudulent transactions in the dataset?


In [None]:
# Every row is one transaction
total_transactions = len(data)

# Fraudulent rows are the ones labelled Class == 1
is_fraud = data['Class'] == 1
fraudulent_transactions = len(data[is_fraud])

# Share of fraudulent rows, expressed as a percentage of all rows
fraudulent_percentage = fraudulent_transactions / total_transactions * 100

# Report the share rounded to two decimals
print(f"Percentage of fraudulent transactions: {fraudulent_percentage:.2f}%")

Question 2: What is the average amount of fraudulent transactions?


In [None]:
# Keep only the rows labelled as fraud (Class == 1)
fraud_mask = data['Class'] == 1
fraudulent_data = data.loc[fraud_mask]

# Mean of the 'Amount' column over those rows
average_fraudulent_amount = fraudulent_data['Amount'].mean()

# Report the mean rounded to two decimals
print(f"Average amount of fraudulent transactions: {average_fraudulent_amount:.2f}")


### Visualize the data


Question 1: How many fraudulent transactions are there compared to non-fraudulent transactions? (Use a bar chart)

In [None]:
# Count transactions per class in a fixed order: 0 (no fraud) first, then 1 (fraud).
# value_counts() orders its result by frequency, so reindex explicitly to keep the
# bars, colors and tick labels aligned no matter which class is more common.
transaction_counts = data['Class'].value_counts().reindex([0, 1], fill_value=0)

# Bar chart comparing non-fraudulent and fraudulent transaction counts
plt.figure(figsize=(8, 6))
# Green for legitimate transactions, red for fraud
transaction_counts.plot(kind='bar', color=['green', 'red'])
plt.title('Number of Fraudulent vs Non-Fraudulent Transactions')
plt.xlabel('Class (0: No Fraud, 1: Fraud)')
plt.ylabel('Number of Transactions')
# Replace the numeric class ticks with readable labels (same order as the reindex)
plt.xticks([0, 1], ['No Fraud', 'Fraud'], rotation=0)
plt.show()

Question 2: What is the distribution of the amounts of fraudulent transactions? (Use a histogram)


In [None]:
# Keep only the rows labelled as fraud (Class == 1)
fraudulent_data = data.loc[data['Class'] == 1]

# Histogram of the fraudulent amounts, drawn through an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(fraudulent_data['Amount'], bins=50, color='red', alpha=0.7)
ax.set_title('Distribution of Fraudulent Transaction Amounts')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()

## Model Development and Evaluation

### Split the dataset

In [None]:
# Features: every column except the target 'Class'
X = data.drop('Class', axis=1)

# Target: the 'Class' column on its own
y = data['Class']

# Split the data into training and testing sets (80% training, 20% testing).
# stratify=y keeps the proportion of each class the same in both sets, so the
# test set is not left with too few fraud cases by chance.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the dimensions of the resulting sets
print(f"Training set size (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set size (X_test, y_test): {X_test.shape}, {y_test.shape}")

### Create and evaluate models

In [None]:
# Create the Random Forest model. It has to be instantiated before fit() is
# called; random_state makes the trained forest reproducible and n_jobs=-1
# builds the trees on all available CPU cores.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the model with the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Display the classification report (per-class precision, recall and F1)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate the accuracy of the model.
# NOTE: when one class is rare, accuracy can look high even if few of the rare
# cases are caught, so read it together with the per-class figures above.
accuracy = accuracy_score(y_test, y_pred)

# Display the accuracy as a percentage
print(f"Model Accuracy: {accuracy * 100:.2f}%")