In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the two CSV splits into separate DataFrames.
fraudtrain, fraudtest = (pd.read_csv(path) for path in ('fraudtrain.csv', 'fraudtest.csv'))

# Stack both splits row-wise into a single frame with a fresh 0..n-1 index.
credit_card_df = pd.concat((fraudtrain, fraudtest), axis=0, ignore_index=True)

# Preview the first three rows of the combined frame.
credit_card_df.head(n=3)

In [None]:
# Row counts of each split and of the combined frame (shape[0] is the row count).
fraudtrain_length = fraudtrain.shape[0]
fraudtest_length = fraudtest.shape[0]
dataset_length = credit_card_df.shape[0]

print("fraudtrain Dataset length:", fraudtrain_length)
print("fraudtest Dataset length:", fraudtest_length)
print("credit card Dataset length:", dataset_length)

In [None]:
# Keep the column Index of the combined frame in `column_names`
column_names = credit_card_df.columns
# Show the column labels as plain text
print(column_names)

In [None]:
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after the report.
credit_card_df.info()

# Data Cleaning:

In [None]:
# Specify the column to remove
column_to_remove = 'Unnamed: 0'
# Remove the specified column. errors='ignore' keeps this cell re-runnable:
# once the column is gone, a second run is a no-op instead of a KeyError.
credit_card_df = credit_card_df.drop(columns=column_to_remove, errors='ignore')
# Display the DataFrame after removing the column
print('\nDataFrame after removing the column "Unnamed: 0"')
credit_card_df.info()

In [None]:
# Give the row index the label 'Index ID' (rename_axis returns a new frame,
# which is re-bound to the same name)
credit_card_df = credit_card_df.rename_axis('Index ID')

# Preview the first three rows; the index label appears above the row numbers
credit_card_df.head(3)

In [None]:
# Check the current index of the DataFrame
print(credit_card_df.index)
# Last expression of the cell, so the notebook displays the (rows, columns) tuple.
# (A bare `.shape` earlier in the cell would be evaluated and silently discarded.)
credit_card_df.shape

### Checking Missing Values:

In [None]:
# Number of null entries per column (isna is the canonical spelling of isnull).
missing_values = credit_card_df.isna().sum()

# Report the grand total first, then the per-column breakdown.
total_missing = missing_values.sum()
print("\nTotal Missing Values in the Entire Dataset:", total_missing)
print("Missing Values in Each Column:", missing_values, sep="\n")

### Checking duplicate values

In [None]:
# `duplicated()` returns a boolean Series: True for each row that repeats an earlier row
duplicates = credit_card_df.duplicated()

# Use the mask to select the duplicate rows themselves (wrapping the mask in a
# list would only print the True/False flags, not the rows).
duplicate_rows = credit_card_df[duplicates]
print("Number of duplicate rows:", int(duplicates.sum()))
print("Duplicate Rows:")
print(duplicate_rows)


In [None]:
# Boolean mask: True for every row that exactly repeats an earlier row.
is_repeat = credit_card_df.duplicated()
# Select those rows and show them under a heading.
duplicate_rows = credit_card_df.loc[is_repeat]
print("Duplicate Rows:", duplicate_rows, sep="\n")

### Checking data distribution of numerical features

In [None]:
# Set the seed for reproducibility
np.random.seed(42)

# Select numerical columns for visualization
numerical_columns = ['cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']

# Set the style for the plots
sns.set(style="whitegrid")

# Plot histograms for numerical columns (3x4 grid holds the 10 columns)
plt.figure(figsize=(15, 10))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 4, i)
    sns.histplot(credit_card_df[column], kde=True)
    plt.title(f'Histogram - {column}')

plt.tight_layout()
plt.show()

# Plot box plots for numerical columns
plt.figure(figsize=(15, 10))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(3, 4, i)
    sns.boxplot(x=credit_card_df[column])
    plt.title(f'Box Plot - {column}')

plt.tight_layout()
plt.show()

# Plot scatter plots for pairs of numerical columns (using a 10% subsample).
# sns.pairplot creates its own figure, so no plt.figure() is opened beforehand
# (that would only leave an extra empty figure in the output). The explicit
# random_state makes the subsample independent of earlier random draws.
sns.pairplot(
    credit_card_df.sample(frac=0.1, random_state=42)[numerical_columns],
    hue='is_fraud',
    diag_kind='kde',
)
# The pairplot figure is the current figure here, so suptitle lands on it.
plt.suptitle('Pair Plot for Numerical Columns (Subsample)', y=1.02)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Categorical columns whose value counts are charted below
categorical_columns = ['gender',  'category',  'state']

# Shared seaborn theme
sns.set(style="whitegrid")

# One count plot per categorical column, each on its own 12x6 figure
for column in categorical_columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(x=column, data=credit_card_df, palette='viridis', ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    # Tilt the category labels so long names stay legible
    plt.xticks(rotation=45, ha='right')
    plt.show()


In [None]:
# Non-numeric columns whose distinct values are listed below
columns_to_check = [
    'trans_date_trans_time', 'merchant', 'category', 'first', 'last',
    'gender', 'street', 'city', 'state', 'job', 'dob'
]

# Print the array of distinct values of each column, one column per line block
for column in columns_to_check:
    unique_values = credit_card_df[column].unique()
    print(f"Unique values in '{column}': {unique_values}\n")


In [None]:
# Strip the leading 'fraud_' marker from merchant names.
# The pattern is anchored and regex=True is explicit, so only a prefix is removed
# (not a 'fraud_' occurring inside a name) and the result does not depend on the
# installed pandas version's default for `regex`.
credit_card_df['merchant'] = credit_card_df['merchant'].str.replace(r'^fraud_', '', regex=True)

# Display the updated column
print(credit_card_df['merchant'])

In [None]:
# Show the first five rows of credit_card_df
credit_card_df.head(5)

In [None]:
# Per-column flag: True if the column contains at least one null value
credit_card_df.isnull().any()

In [None]:
# Descriptive statistics for the numeric columns (describe's default selection)
numeric_summary = credit_card_df.describe()
print("\nSummary Statistics for Numeric Columns:", numeric_summary, sep="\n")

# Centre and spread of the transaction amount
amt_values = credit_card_df['amt']
mean_amt, median_amt, std_amt = amt_values.mean(), amt_values.median(), amt_values.std()

print("\nMean Transaction Amount:", mean_amt)
print("Median Transaction Amount:", median_amt)
print("Standard Deviation of Transaction Amount:", std_amt)

# include='all' extends the summary to the non-numeric columns as well
full_summary = credit_card_df.describe(include='all')
print("\nSummary Statistics for the Entire Dataset:", full_summary, sep="\n")

### Finding and dealing with outliers

In [None]:
# Numeric columns to scan for outliers
numeric_columns = credit_card_df.select_dtypes(include=np.number).columns

# Multiplier applied to the IQR when building the outlier fences
iqr_threshold = 1.5

# Maps each scanned column to the Series of its flagged values
outlier_values_dict = {}

for col in numeric_columns:
    # The target column is skipped
    if col == 'is_fraud':
        continue

    column_values = credit_card_df[col]

    # Quartiles and inter-quartile range
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences: anything strictly outside [lower_bound, upper_bound] is flagged
    lower_bound = Q1 - iqr_threshold * IQR
    upper_bound = Q3 + iqr_threshold * IQR
    outliers = (column_values < lower_bound) | (column_values > upper_bound)

    # Keep the flagged values for later inspection
    outlier_values = column_values[outliers]
    outlier_values_dict[col] = outlier_values

    # Box plot of the full column
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=column_values, ax=ax)
    ax.set_title(f'Boxplot for {col}')
    plt.show()

    # Text listing of the flagged values
    print(f"Identified Outliers in '{col}' based on IQR:")
    print(outlier_values)
    print("\n")


In [None]:
# Create a copy of the original DataFrame to preserve the original data
credit_card_df_no_outliers = credit_card_df.copy()

# List of numeric columns in the DataFrame
numeric_columns = credit_card_df_no_outliers.select_dtypes(include=np.number).columns

# Set a common threshold for IQR (e.g., 1.5)
iqr_threshold = 1.5

# NOTE(review): every numeric column except the target is filtered. If identifier-
# or location-like columns (e.g. cc_num, zip) are numeric they are filtered too --
# confirm that this is intended.
for col in numeric_columns:
    # Skip the target column 'is_fraud'
    if col == 'is_fraud':
        continue

    # Quartiles are recomputed on the rows that survived the previous columns'
    # filters, so the result depends on the column order.
    Q1 = credit_card_df_no_outliers[col].quantile(0.25)
    Q3 = credit_card_df_no_outliers[col].quantile(0.75)
    IQR = Q3 - Q1

    # Define the lower and upper bounds for outliers
    lower_bound = Q1 - iqr_threshold * IQR
    upper_bound = Q3 + iqr_threshold * IQR

    # Identify and remove outliers based on IQR
    outliers = (credit_card_df_no_outliers[col] < lower_bound) | (credit_card_df_no_outliers[col] > upper_bound)
    credit_card_df_no_outliers = credit_card_df_no_outliers[~outliers]

# The filtered frame is still tracked by pandas as derived from another frame;
# take an explicit copy so that the column assignments made on it in later cells
# do not trigger SettingWithCopyWarning.
credit_card_df_no_outliers = credit_card_df_no_outliers.copy()

# Display the shape of the new DataFrame without outliers
print("Shape of DataFrame without outliers:", credit_card_df_no_outliers.shape)


In [None]:
# Row counts after and before outlier removal (these names hold integers, not frames)
new_credit_card_df_no_outliers_iqr=len(credit_card_df_no_outliers)
new_credit_card_df=len(credit_card_df)
print("credit_card_df_no_outliers_iqr: ",new_credit_card_df_no_outliers_iqr)
print("credit_card_df: ", new_credit_card_df)

In [None]:
# Class counts of the target after outlier removal ...
target_distribution_no_outliers = credit_card_df_no_outliers['is_fraud'].value_counts()
# ... and in the unfiltered frame, for comparison
target_distribution_credit_card_df = credit_card_df['is_fraud'].value_counts()

# Show both distributions, each under its own heading
print("\nTarget Distribution (Dataset without Outliers):", target_distribution_no_outliers, sep="\n")
print("\nTarget Distribution (Original Dataset):", target_distribution_credit_card_df, sep="\n")

In [None]:
# Numeric columns of the filtered frame
numeric_columns = credit_card_df_no_outliers.select_dtypes(include=np.number).columns

# Bin count shared by all histograms
num_bins = 30

# One histogram (with KDE overlay) per numeric column
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(credit_card_df_no_outliers[col], bins=num_bins, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col} (After Removing Outliers)')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

# List of numeric columns in the DataFrame
numeric_columns = credit_card_df_no_outliers.select_dtypes(include=np.number).columns

# Number of evenly spaced x positions at which each ECDF is evaluated
num_bins_cdf = 100

# Iterate through numeric columns
for col in numeric_columns:
    column_values = credit_card_df_no_outliers[col]

    # Visualize the distribution using a CDF plot
    plt.figure(figsize=(12, 6))

    # Calculate ECDF
    ecdf = ECDF(column_values)

    # Evaluation grid spanning the observed range. Series.min()/max() are
    # vectorised; the built-in min()/max() would iterate the Series element by
    # element in Python, which is needlessly slow on a large column.
    x_values = np.linspace(column_values.min(), column_values.max(), num_bins_cdf)

    # Plot CDF
    plt.step(x_values, ecdf(x_values), marker='o')

    plt.title(f'Cumulative Distribution Function (CDF) of {col} (After Removing Outliers)')
    plt.xlabel(col)
    plt.ylabel('Cumulative Probability')
    plt.grid(True)
    plt.show()

In [None]:
#!pip install folium

In [None]:
import folium
from folium.plugins import MarkerCluster

# Keep only the rows labelled as fraud
fraud_mask = credit_card_df_no_outliers['is_fraud'] == 1
fraud_data = credit_card_df_no_outliers[fraud_mask]

# Centre the base map on the mean 'lat'/'long' of those rows
map_centre = [fraud_data['lat'].mean(), fraud_data['long'].mean()]
fraud_map = folium.Map(location=map_centre, zoom_start=4)

# Cluster layer that groups nearby markers
marker_cluster = MarkerCluster().add_to(fraud_map)

# One marker per fraud row, with the amount in its popup
for index, row in fraud_data.iterrows():
    popup_text = f"Fraud Transaction\nAmount: ${row['amt']:.2f}"
    folium.Marker([row['lat'], row['long']], popup=popup_text).add_to(marker_cluster)

# Write the map to an HTML file, then render it as the cell output
fraud_map.save('fraud_map.html')
fraud_map


## Data Transformation


In [None]:
# Parse the two date-like columns into datetime dtype
for date_col in ('trans_date_trans_time', 'dob'):
    credit_card_df_no_outliers[date_col] = pd.to_datetime(credit_card_df_no_outliers[date_col])

# Confirm the new dtypes
print(credit_card_df_no_outliers.dtypes)


In [None]:
# Null counts for the two date-like columns of credit_card_df
missing_values_trans_time = credit_card_df['trans_date_trans_time'].isna().sum()
missing_values_dob = credit_card_df['dob'].isna().sum()

print(f"Missing values in 'trans_date_trans_time': {missing_values_trans_time}")
print(f"Missing values in 'dob': {missing_values_dob}")

# Number of rows whose value in the column already appeared in an earlier row
duplicates_trans_time = credit_card_df.duplicated(subset=['trans_date_trans_time']).sum()
duplicates_dob = credit_card_df.duplicated(subset=['dob']).sum()

print(f"Duplicates in 'trans_date_trans_time': {duplicates_trans_time}")
print(f"Duplicates in 'dob': {duplicates_dob}")


In [None]:
# keep=False marks every member of a duplicate group, not just the repeats
shares_trans_time = credit_card_df.duplicated(subset='trans_date_trans_time', keep=False)
duplicated_trans_time_rows = credit_card_df.loc[shares_trans_time]
print(f"Rows with duplicated 'trans_date_trans_time':")
print(duplicated_trans_time_rows)

# Same selection for rows sharing a 'dob' value with another row
shares_dob = credit_card_df.duplicated(subset='dob', keep=False)
duplicated_dob_rows = credit_card_df.loc[shares_dob]
print(f"Rows with duplicated 'dob':")
print(duplicated_dob_rows)


In [None]:
# Rows that are identical to at least one other row across all columns
is_full_duplicate = credit_card_df.duplicated(keep=False)
duplicated_rows = credit_card_df.loc[is_full_duplicate]
print(f"Duplicate Rows in the Entire DataFrame:")
print(duplicated_rows)


In [None]:
# Columns to profile
columns_to_check = ['category', 'gender', 'job', 'state']

# Denominator for the uniqueness percentage
total_rows = len(credit_card_df)

for col in columns_to_check:
    column_values = credit_card_df[col]
    unique_values = column_values.unique()
    value_counts = column_values.value_counts()

    print(f"\nColumn: {col}")
    print("Unique Values:", unique_values, sep="\n")
    print("\nValue Counts:", value_counts, sep="\n")

    # Distinct values as a share of all rows
    percentage_unique = (len(unique_values) / total_rows) * 100
    print(f"\nPercentage of Unique Values: {percentage_unique:.2f}%\n")


In [None]:
# Store these three string columns with the pandas 'category' dtype
for cat_col in ('category', 'gender', 'state'):
    credit_card_df_no_outliers[cat_col] = credit_card_df_no_outliers[cat_col].astype('category')

# Confirm the dtypes after conversion
print(credit_card_df_no_outliers.dtypes)


## Feature Engineering

In [None]:
from datetime import datetime
import pandas as pd

# Convert 'dob' column to datetime
credit_card_df_no_outliers['dob'] = pd.to_datetime(credit_card_df_no_outliers['dob'])

# Length of a (Julian) year in seconds
seconds_per_year = 365.25 * 24 * 60 * 60

# Age in fractional years as of the moment this cell runs.
# The subtraction yields a timedelta Series; .dt.total_seconds() converts it to
# seconds. (pd.to_numeric on a timedelta returns nanoseconds, which would make
# every age a factor of 1e9 too large.)
current_date = datetime.now()
credit_card_df_no_outliers['age'] = (
    (current_date - credit_card_df_no_outliers['dob']).dt.total_seconds() / seconds_per_year
)

# Display the updated DataFrame
print(credit_card_df_no_outliers[['dob', 'age']])


In [None]:
!pip install geopy


In [None]:
from geopy.distance import geodesic

# Target column layout: person, location, card/transaction, then the label last
column_order = [
    'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
    'lat', 'long', 'merch_lat', 'merch_long', 'city_pop',
    'cc_num', 'amt', 'merchant', 'category', 'trans_date_trans_time',
    'job', 'dob', 'age', 'trans_num', 'unix_time', 'is_fraud',
]

# Select the columns in that order (columns not listed are left out)
credit_card_df_no_outliers = credit_card_df_no_outliers.loc[:, column_order]

# Show the resulting schema
credit_card_df_no_outliers.info()


In [None]:
# How many rows to draw
num_rows_to_select = 10  # Change this number as needed

# Seeded random draw of whole rows from the filtered frame
random_rows = credit_card_df_no_outliers.sample(n=num_rows_to_select, random_state=42)

# Show only the 'unix_time' column of the drawn rows
print(random_rows.loc[:, ['unix_time']])


In [None]:
import pandas as pd

# Interpret 'unix_time' as seconds since the epoch and store it as datetime
unix_seconds = credit_card_df_no_outliers['unix_time']
credit_card_df_no_outliers['unix_time'] = pd.to_datetime(unix_seconds, unit='s')

# Show the converted column
print(credit_card_df_no_outliers.loc[:, ['unix_time']])


In [None]:
# Show the first rows of the filtered frame (head() defaults to five rows)
credit_card_df_no_outliers.head()

In [None]:
credit_card_df_no_outliers['unix_time'] = pd.to_datetime(credit_card_df_no_outliers['unix_time'], unit='s')

# Hour of day (0-23) taken from the timestamp
credit_card_df_no_outliers['transaction_hour'] = credit_card_df_no_outliers['unix_time'].dt.hour

# Mean of the 0/1 label per hour, i.e. the fraud rate in that hour
hourly_groups = credit_card_df_no_outliers.groupby('transaction_hour')
fraud_by_hour = hourly_groups['is_fraud'].mean()

# Bar chart of the hourly fraud rate
fig, ax = plt.subplots(figsize=(10, 6))
fraud_by_hour.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Fraud Occurrences by Hour')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Fraud Occurrence Rate')
plt.xticks(rotation=0)
plt.show()

# The same rates as text
print(fraud_by_hour)

In [None]:
credit_card_df_no_outliers['unix_time'] = pd.to_datetime(credit_card_df_no_outliers['unix_time'], unit='s')

# Extract the month and day of the week (dayofweek: 0 = Monday ... 6 = Sunday)
credit_card_df_no_outliers['transaction_month'] = credit_card_df_no_outliers['unix_time'].dt.month
credit_card_df_no_outliers['day_of_week'] = credit_card_df_no_outliers['unix_time'].dt.dayofweek

# Fraud rate (mean of the 0/1 label) by month
fraud_by_month = credit_card_df_no_outliers.groupby('transaction_month')['is_fraud'].mean()

# Fraud rate by day of the week
fraud_by_day = credit_card_df_no_outliers.groupby('day_of_week')['is_fraud'].mean()

# Plot Fraud Occurrences by Month
plt.figure(figsize=(10, 6))
sns.barplot(x=fraud_by_month.index, y=fraud_by_month.values, color='skyblue')
plt.title('Fraud Occurrences by Month')
plt.xlabel('Month')
plt.ylabel('Fraud Occurrence Rate')
plt.show()

# Plot Fraud Occurrences by Day of the Week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Name each bar from its own weekday number. Pairing the seven names with the
# rates purely by position would mislabel bars (or fail on a length mismatch)
# whenever a weekday has no transactions and is therefore absent from the index.
day_labels = [days_of_week[day] for day in fraud_by_day.index]
plt.figure(figsize=(10, 6))
sns.barplot(x=day_labels, y=fraud_by_day.values, color='lightcoral')
plt.title('Fraud Occurrences by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Fraud Occurrence Rate')
plt.show()


# Display the fraud occurrences by month and day of the week
print("Fraud Occurrences by Month:")
print(fraud_by_month)

print("\nFraud Occurrences by Day of the Week:")
print(fraud_by_day)


In [None]:
# credit_card_df is our DataFrame and 'event_dates' is a list of dates corresponding to holidays or special events
event_dates = ['2023-01-01', '2023-02-14', '2023-07-04']  # Add more dates as needed

# These events recur every year, so compare on month-day only. Matching the full
# date would flag nothing unless the transactions fall in the listed year.
# NOTE(review): assumes every entry is an annually recurring date in YYYY-MM-DD form.
event_month_days = sorted({date[5:] for date in event_dates})

# Convert 'trans_date_trans_time' to datetime format
credit_card_df['trans_date_trans_time'] = pd.to_datetime(credit_card_df['trans_date_trans_time'])

# Create a new binary column indicating whether a transaction occurred on a holiday or special event
credit_card_df['is_event'] = (
    credit_card_df['trans_date_trans_time'].dt.strftime('%m-%d').isin(event_month_days).astype(int)
)

# Fraud rate (mean of the 0/1 label) on event days vs. all other days
fraud_by_event = credit_card_df.groupby('is_event')['is_fraud'].mean()

# Display the fraud occurrences during holidays or special events
print(fraud_by_event)


In [None]:
!pip install geopy

In [None]:
from geopy.distance import geodesic


# Calendar parts of the transaction timestamp
timestamps = credit_card_df_no_outliers['unix_time']
credit_card_df_no_outliers['transaction_hour'] = timestamps.dt.hour
credit_card_df_no_outliers['day_of_week'] = timestamps.dt.dayofweek
credit_card_df_no_outliers['transaction_month'] = timestamps.dt.month

# Amount buckets: [0, 50) small, [50, 200) medium, [200, inf) large
# (right=False makes each interval closed on the left)
bins = [0, 50, 200, np.inf]
labels = ['small', 'medium', 'large']
credit_card_df_no_outliers['amount_category'] = pd.cut(
    credit_card_df_no_outliers['amt'], bins=bins, labels=labels, right=False
)

# Geodesic distance in kilometres between the ('lat', 'long') point and the
# ('merch_lat', 'merch_long') point of each row, computed row by row
credit_card_df_no_outliers['distance_km'] = [
    geodesic((lat, lon), (m_lat, m_lon)).kilometers
    for lat, lon, m_lat, m_lon in zip(
        credit_card_df_no_outliers['lat'],
        credit_card_df_no_outliers['long'],
        credit_card_df_no_outliers['merch_lat'],
        credit_card_df_no_outliers['merch_long'],
    )
]

# Preview the frame with the new columns
credit_card_df_no_outliers.head(5)


In [None]:
# Distinct values present in each derived calendar column
unique_hours = credit_card_df_no_outliers['transaction_hour'].unique()
unique_days_of_week = credit_card_df_no_outliers['day_of_week'].unique()
unique_months = credit_card_df_no_outliers['transaction_month'].unique()

# Print each array under its own heading
print("Unique values in 'transaction_hour':", unique_hours, sep="\n")
print("\nUnique values in 'day_of_week':", unique_days_of_week, sep="\n")
print("\nUnique values in 'transaction_month':", unique_months, sep="\n")


In [None]:
# Round the 'distance_km' column to two decimal places
credit_card_df_no_outliers['distance_km'] = credit_card_df_no_outliers['distance_km'].round(2)

In [None]:
# Show the first five rows of the filtered frame
credit_card_df_no_outliers.head(5)

In [None]:
# Target column layout, now including the engineered features; label stays last
column_order = [
    'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
    'lat', 'long', 'merch_lat', 'merch_long', 'city_pop',
    'cc_num', 'amt', 'merchant', 'category', 'trans_date_trans_time',
    'job', 'dob', 'age', 'trans_num', 'unix_time',
    'transaction_hour', 'day_of_week', 'transaction_month',
    'amount_category', 'distance_km', 'is_fraud',
]

# Select the columns in that order (columns not listed are left out)
credit_card_df_no_outliers = credit_card_df_no_outliers.loc[:, column_order]

# Show the resulting schema
credit_card_df_no_outliers.info()

## Data Visualization

### Demographic Analysis:


In [None]:
# Shared seaborn theme
sns.set(style="whitegrid")

# Transaction counts per gender, split by the fraud label
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='gender', data=credit_card_df_no_outliers, hue='is_fraud', ax=ax)

# Write each bar's height just above the bar
for p in ax.patches:
    bar_top = (p.get_x() + p.get_width() / 2., p.get_height())
    ax.annotate(f'{p.get_height()}', bar_top,
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

ax.set_title('Distribution of Transactions Based on Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Transaction Count')
plt.show()


In [None]:
# Shared seaborn theme
sns.set(style="whitegrid")

# Tall figure so that every state gets its own readable row
fig, ax = plt.subplots(figsize=(16, 16))

# Horizontal transaction counts per state, split by the fraud label
sns.countplot(y='state', data=credit_card_df_no_outliers, hue='is_fraud', orient='h', ax=ax)

# Write each bar's length just to the right of the bar
for p in ax.patches:
    bar_end = (p.get_width(), p.get_y() + p.get_height() / 2.)
    ax.annotate(f'{p.get_width()}', bar_end,
                ha='left', va='center', xytext=(10, 0), textcoords='offset points')

ax.set_title('Distribution of Transactions Across Different States')
ax.set_xlabel('Transaction Count')
ax.set_ylabel('State')
plt.show()


In [None]:
# Shared seaborn theme
sns.set(style="whitegrid")

# Violin plot with the fraud label on x and state on y; quartile lines inside
fig, ax = plt.subplots(figsize=(14, 10))
sns.violinplot(x='is_fraud', y='state', data=credit_card_df_no_outliers, inner='quartile', ax=ax)

ax.set_title('Distribution of Transactions Across Different States')
ax.set_xlabel('Fraudulent Transactions')
ax.set_ylabel('State')
plt.show()


In [None]:
import folium
from folium.plugins import HeatMap

# Centre the base map on the data itself (mean 'lat'/'long'), as the other maps
# in this notebook do, rather than on a hard-coded city at street-level zoom
# that the data may not cover.
map_centre = [credit_card_df_no_outliers['lat'].mean(), credit_card_df_no_outliers['long'].mean()]
base_map = folium.Map(location=map_centre, zoom_start=4)

# HeatMap layer: one (lat, long, weight) triple per transaction, with the
# transaction amount as the weight
heat_data = list(zip(credit_card_df_no_outliers['lat'], credit_card_df_no_outliers['long'], credit_card_df_no_outliers['amt']))
HeatMap(heat_data).add_to(base_map)

# display it
base_map

##### Transaction amount and city population on a map: a 10% sample of transactions is plotted as circle markers whose size is determined by the city population; each marker's popup shows the transaction amount and the city population

In [None]:
from folium.plugins import MarkerCluster

# Seeded 10% sample of the filtered frame
subset_percentage = 0.1
credit_card_df_subset = credit_card_df_no_outliers.sample(frac=subset_percentage, random_state=42)

# Base map centred on the sample's mean 'lat'/'long'
subset_centre = [credit_card_df_subset['lat'].mean(), credit_card_df_subset['long'].mean()]
m = folium.Map(location=subset_centre, zoom_start=5)

# Cluster layer that groups nearby markers
marker_cluster = MarkerCluster().add_to(m)

# One blue circle per sampled row; radius scales with the city population
for index, row in credit_card_df_subset.iterrows():
    circle = folium.CircleMarker(
        location=[row['lat'], row['long']],
        radius=row['city_pop'] / 5000,  # divisor chosen to scale the circles
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.5,
        popup=f"Amount: {row['amt']}, Population: {row['city_pop']}"
    )
    circle.add_to(marker_cluster)

# Render the map as the cell output
m

### Geographical Patterns:

In [None]:
# Count the total number of frauds (is_fraud = 1) for each (city, state) pair
total_frauds_by_location = credit_card_df.groupby(['city', 'state'])['is_fraud'].sum().reset_index()

# Sort the DataFrame by the total number of frauds in descending order
total_frauds_by_location = total_frauds_by_location.sort_values(by='is_fraud', ascending=False)

# The groups are (city, state) pairs, so label the bars with both. Plotting by
# city name alone would make seaborn merge same-named cities from different
# states into a single aggregated bar.
total_frauds_by_location['location'] = (
    total_frauds_by_location['city'] + ', ' + total_frauds_by_location['state']
)

# Plot the top N cities with the highest total number of frauds
top_cities = 10  # You can adjust this based on your preference
plt.figure(figsize=(12, 6))
sns.barplot(x='is_fraud', y='location', data=total_frauds_by_location.head(top_cities))
plt.title(f'Top {top_cities} Cities with Highest Total Number of Frauds')
plt.xlabel('Total Number of Frauds')
plt.ylabel('City')
plt.show()

In [None]:
# Fraud totals per state (sum of the 0/1 label), largest first
fraud_totals = credit_card_df.groupby('state')['is_fraud'].sum()
total_frauds_by_state = fraud_totals.reset_index()
total_frauds_by_state = total_frauds_by_state.sort_values(by='is_fraud', ascending=False)

# Horizontal bar chart of the top N states
top_states = 10  # You can adjust this based on your preference
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='is_fraud', y='state', data=total_frauds_by_state.head(top_states), ax=ax)
ax.set_title(f'Top {top_states} States with Highest Total Number of Frauds')
ax.set_xlabel('Total Number of Frauds')
ax.set_ylabel('State')
plt.show()

In [None]:
# Scatter plot for merchant locations with fraud highlighted
plt.figure(figsize=(10, 8))
scatter = sns.scatterplot(x='merch_long', y='merch_lat', hue='is_fraud',
                          data=credit_card_df, palette={0: 'blue', 1: 'red'}, alpha=0.5)
plt.title('Merchant Locations with Fraud Highlighted')
plt.xlabel('Merchant Longitude')
plt.ylabel('Merchant Latitude')

# Customize legend: reuse the handles seaborn created and relabel them.
# Axes.get_legend_handles_labels() is used instead of Legend.legendHandles,
# which was deprecated in Matplotlib 3.7 and later removed.
legend_labels = ['Not Fraud', 'Fraud']
legend_handles, _ = scatter.get_legend_handles_labels()
plt.legend(legend_handles, legend_labels, title='Fraud', loc='upper right')
plt.show()

In [None]:
# Rows labelled as fraud
is_fraud_row = credit_card_df['is_fraud'] == 1
fraudulent_df = credit_card_df[is_fraud_row]

# Fraud count per merchant, largest first
merchant_fraud_counts = fraudulent_df['merchant'].value_counts().sort_values(ascending=False)

# Ten merchants with the most fraud rows
top_merchants = merchant_fraud_counts.head(10)

# Horizontal bar chart of those counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_merchants.values, y=top_merchants.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Merchants with Fraudulent Transactions')
ax.set_xlabel('Number of Fraudulent Transactions')
ax.set_ylabel('Merchant')
plt.show()

# Overall number of fraud rows
total_fraudulent_transactions = len(fraudulent_df)
print(f'Total Fraudulent Transactions: {total_fraudulent_transactions}')

In [None]:
# Rows labelled as fraud
is_fraud_row = credit_card_df['is_fraud'] == 1
fraudulent_df = credit_card_df[is_fraud_row]

# Fraud count per category, keeping the ten most frequent
fraudulent_category_counts = fraudulent_df['category'].value_counts().head(10)

# Horizontal bar chart of those counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=fraudulent_category_counts.values, y=fraudulent_category_counts.index,
            palette='viridis', ax=ax)

ax.set_title('Top 10 Categories with Most Fraudulent Transactions')
ax.set_xlabel('Number of Fraudulent Transactions')
ax.set_ylabel('Category')
plt.show()

In [None]:
# Uses the 'distance_km' and 'is_fraud' columns of the filtered frame

# Each transaction as a point: distance on x, 0/1 fraud label on y
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x='distance_km', y='is_fraud', data=credit_card_df_no_outliers, alpha=0.5, ax=ax)
ax.set_title('Distance vs. Fraud')
ax.set_xlabel('Distance (km)')
ax.set_ylabel('Fraud (1: Fraud, 0: Not Fraud)')
plt.show()

In [None]:
# Cut the distances into ten equal-width intervals
distance_values = credit_card_df_no_outliers['distance_km']
bins = pd.cut(distance_values, bins=10)

# Mean of the 0/1 label per interval, returned as a two-column frame
rate_per_bin = credit_card_df_no_outliers.groupby(bins)['is_fraud'].mean()
fraud_rate_by_distance = rate_per_bin.reset_index()

# Horizontal bars: one per distance interval
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='is_fraud', y='distance_km', data=fraud_rate_by_distance, color='skyblue', ax=ax)
ax.set_title('Mean Fraud Rate by Distance Bins')
ax.set_xlabel('Mean Fraud Rate')
ax.set_ylabel('Distance (km)')
plt.show()


In [None]:
# Distance distribution for each value of the fraud label, with quartile lines
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='is_fraud', y='distance_km', data=credit_card_df_no_outliers,
               palette='pastel', inner='quartile', ax=ax)
ax.set_title('Distribution of Distance by Fraud')
ax.set_xlabel('Fraud (1: Fraud, 0: Not Fraud)')
ax.set_ylabel('Distance (km)')
plt.show()

## Temporal analysis

In [None]:
# Shared seaborn theme
sns.set(style="whitegrid")

# Plot 1: transaction counts per hour of day
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='transaction_hour', data=credit_card_df_no_outliers, palette='viridis', ax=ax)
ax.set_title('Number of Transactions by Hour')
ax.set_xlabel('Transaction Hour')
ax.set_ylabel('Number of Transactions')
plt.show()

# Plot 2: mean of the 0/1 fraud label per hour (barplot's default estimator)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='transaction_hour', y='is_fraud', data=credit_card_df_no_outliers, palette='magma', ax=ax)
ax.set_title('Fraud Rates by Hour')
ax.set_xlabel('Transaction Hour')
ax.set_ylabel('Fraud Rate')
plt.show()

# Plot 3: transaction counts per weekday, split by the fraud label
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='day_of_week', data=credit_card_df_no_outliers, hue='is_fraud', palette='Set1', ax=ax)
ax.set_title('Number of Transactions and Fraud Rates by Day of Week')
ax.set_xlabel('Day of Week (0=Monday, 6=Sunday)')
ax.set_ylabel('Number of Transactions')
plt.show()

# Plot 4: transaction counts per month, split by the fraud label
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='transaction_month', data=credit_card_df_no_outliers, hue='is_fraud', palette='coolwarm', ax=ax)
ax.set_title('Number of Transactions and Fraud Rates by Month')
ax.set_xlabel('Transaction Month')
ax.set_ylabel('Number of Transactions')
plt.show()


## Transaction Characteristics analysis

In [None]:
# Ten merchants with the most fraud rows in the filtered frame
fraud_only = credit_card_df_no_outliers[credit_card_df_no_outliers['is_fraud'] == 1]
top_fraudulent_merchants = fraud_only['merchant'].value_counts().head(10).index

# All transactions (fraud or not) belonging to those merchants
from_top_merchant = credit_card_df_no_outliers['merchant'].isin(top_fraudulent_merchants)
top_fraudulent_df = credit_card_df_no_outliers[from_top_merchant]

# Shared seaborn theme
sns.set(style="whitegrid")

# Transaction counts per merchant, split by the fraud label
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='merchant', data=top_fraudulent_df, hue='is_fraud', palette='viridis', ax=ax)

# Write each bar's height just above the bar
for p in ax.patches:
    bar_top = (p.get_x() + p.get_width() / 2., p.get_height())
    ax.annotate(f'{p.get_height()}', bar_top,
                ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=10, color='black')

ax.set_title('Number of Transactions and Fraud Rates by Top 10 Fraudulent Merchants')
ax.set_xlabel('Merchant')
ax.set_ylabel('Number of Transactions')
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:
# Set the style for seaborn plots
sns.set(style="whitegrid")


# Plot 1: cardholder age against transaction amount, coloured by the fraud label
plt.figure(figsize=(12, 8))
sns.scatterplot(x='age', y='amt', data=credit_card_df_no_outliers, hue='is_fraud', palette='coolwarm', alpha=0.5)
plt.title('Correlation between Cardholder Age and Transaction Amounts')
plt.xlabel('Cardholder Age')
plt.ylabel('Transaction Amount')
# A single legend call: each plt.legend() replaces the previous legend, so an
# earlier call with a different `loc` would have no effect.
plt.legend(loc="best", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Fraud rows of the filtered frame
fraud_rows = credit_card_df_no_outliers['is_fraud'] == 1
fraudulent_data = credit_card_df_no_outliers[fraud_rows]

# Ten jobs with the highest median fraud amount
median_fraud_amt_by_job = fraudulent_data.groupby('job')['amt'].median()
top_10_fraudulent_jobs = median_fraud_amt_by_job.sort_values(ascending=False).head(10).index

# All transactions (fraud or not) for those jobs
has_top_job = credit_card_df_no_outliers['job'].isin(top_10_fraudulent_jobs)
top_10_fraudulent_data = credit_card_df_no_outliers[has_top_job]

# Amount distribution per job, split by the fraud label
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxplot(x='job', y='amt', data=top_10_fraudulent_data, hue='is_fraud', palette='Pastel1', ax=ax)
ax.set_title('Transaction Amounts and Fraud by Top 10 Job Roles')
ax.set_xlabel('Job Role')
ax.set_ylabel('Transaction Amount')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Print the column names, non-null counts and dtypes of the filtered frame
credit_card_df_no_outliers.info()

## Apply ML Algorithms

In [None]:
# Separate the data into majority (non-fraud) and minority (fraud) classes
majority_class = credit_card_df_no_outliers[credit_card_df_no_outliers['is_fraud'] == 0]
minority_class = credit_card_df_no_outliers[credit_card_df_no_outliers['is_fraud'] == 1]

# Target a 2:1 majority-to-minority ratio, capped at the rows actually available:
# sampling without replacement raises ValueError when n exceeds the frame's length.
desired_rows_majority = min(len(minority_class) * 2, len(majority_class))

# Downsample the majority class to the desired number of rows
downsampled_majority_class = majority_class.sample(n=desired_rows_majority, random_state=42)

# Concatenate the downsampled majority class with the minority class
new_balanced_credit_card_df = pd.concat([downsampled_majority_class, minority_class])

# Shuffle the new DataFrame and renumber its index
new_balanced_credit_card_df = new_balanced_credit_card_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print the distribution of the target variable in the new balanced dataset
print("Target Distribution (New Balanced Dataset):")
print(new_balanced_credit_card_df['is_fraud'].value_counts())



In [None]:
# Numeric sub-frame of the balanced data (note: a DataFrame, not a list of names)
numeric_columns = new_balanced_credit_card_df.select_dtypes(include='number')

# Pairwise correlation of those columns
correlation_matrix = numeric_columns.corr()

# Text version of the matrix
print("Correlation Matrix:", correlation_matrix, sep="\n")

# Annotated heat map of the same matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Data Correlation Matrix')
plt.show()


In [None]:
# Print the column names, non-null counts and dtypes of the balanced frame
new_balanced_credit_card_df.info()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Feature-importance screen: fit a random forest on label-encoded features and
# rank the columns by the forest's `feature_importances_`.
target_column = 'is_fraud'

# Work on a copy so the helper columns created here do not leak into the shared
# frame that the modelling cells below read their features from.
importance_df = new_balanced_credit_card_df.copy()

# pd.to_datetime leaves datetime64 columns unchanged and parses string columns,
# so the `.dt` accessors below work in either case.
importance_df['trans_date_trans_time'] = pd.to_datetime(importance_df['trans_date_trans_time'])
importance_df['dob'] = pd.to_datetime(importance_df['dob'])

# Feature engineering for datetime columns
importance_df['trans_hour'] = importance_df['trans_date_trans_time'].dt.hour
importance_df['trans_day'] = importance_df['trans_date_trans_time'].dt.day
importance_df['trans_month'] = importance_df['trans_date_trans_time'].dt.month
importance_df['dob_year'] = importance_df['dob'].dt.year

# Extract features and target variable; the raw timestamp columns are dropped
# because the derived columns above replace them.
X = importance_df.drop([target_column, 'trans_date_trans_time', 'dob', 'unix_time'], axis=1)
y = importance_df[target_column]

# Convert categorical columns to integer codes using Label Encoding.
# The encoder is re-fitted per column, so codes are independent between columns.
le = LabelEncoder()
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    X[col] = le.fit_transform(X[col])

# Initialize and fit the RandomForestClassifier (fixed seed for reproducibility)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X, y)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Pair each feature name with its score and sort from most to least important
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Print or display the feature importance DataFrame
print(feature_importance_df)

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
# barh draws the first row at the bottom; flip the axis so the most important
# feature is listed first, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# Random forest: 5-fold grid search on the training split, then evaluation on a
# stratified 20% hold-out.

# Build the calendar features on a private copy instead of assuming the shared
# frame already carries them: within this section they are only written to
# `new_balanced_credit_card_df` by the SVM cell further down, so a fresh
# top-to-bottom run may reach this cell before they exist.
# NOTE(review): earlier cells outside this section may also create them; the
# derivation below matches the SVM / logistic-regression / decision-tree cells.
rf_df = new_balanced_credit_card_df.copy()
rf_df['trans_date_trans_time'] = pd.to_datetime(rf_df['trans_date_trans_time'])
rf_df['transaction_hour'] = rf_df['trans_date_trans_time'].dt.hour
rf_df['day_of_week'] = rf_df['trans_date_trans_time'].dt.dayofweek
rf_df['transaction_month'] = rf_df['trans_date_trans_time'].dt.month

# Features and target variable
X = rf_df.drop(columns=['is_fraud'])
y = rf_df['is_fraud']

# Split the data into training and testing sets (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define numerical and categorical features; columns not listed here are
# dropped by the ColumnTransformer (its default remainder behaviour).
numeric_features = ['zip', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop', 'cc_num', 'amt', 'transaction_hour',
                    'day_of_week', 'transaction_month', 'distance_km', 'age']
categorical_features = ['gender', 'state', 'category', 'amount_category']

# Preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create the pipeline; preprocessing is fitted inside each CV fold
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', rf_classifier)])

# Define the parameter grid for GridSearchCV ('classifier__' targets the pipeline step)
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them on the full training split
best_params = grid_search.best_params_
best_rf_pipeline = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'])

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

# Calculate ROC AUC from the predicted probability of the positive class
y_pred_proba = best_rf_pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

print(f"ROC AUC for RF: {roc_auc:.2f}")

# ROC curve and confusion matrix side by side in a single figure
fig, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Plot ROC Curve
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")

# Plot Confusion Matrix as a Heatmap (rows = actual class, columns = predicted class)
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Not Fraud', 'Fraud'],
            yticklabels=['Not Fraud', 'Fraud'], ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

fig.tight_layout()
plt.show()


## Support Vector Machine

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc


# Support vector machine: 5-fold grid search on the training split, then
# evaluation on a stratified 20% hold-out.

# Convert datetime features. These columns are written to the shared frame
# (not a copy), so they remain available to cells that run after this one.
new_balanced_credit_card_df['trans_date_trans_time'] = pd.to_datetime(new_balanced_credit_card_df['trans_date_trans_time'])
new_balanced_credit_card_df['dob'] = pd.to_datetime(new_balanced_credit_card_df['dob'])
new_balanced_credit_card_df['transaction_hour'] = new_balanced_credit_card_df['trans_date_trans_time'].dt.hour
new_balanced_credit_card_df['day_of_week'] = new_balanced_credit_card_df['trans_date_trans_time'].dt.dayofweek
new_balanced_credit_card_df['transaction_month'] = new_balanced_credit_card_df['trans_date_trans_time'].dt.month

# Selecting features; columns not listed here are dropped by the ColumnTransformer
numeric_features = ['zip', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop', 'cc_num', 'amt', 'transaction_hour', 'day_of_week', 'transaction_month', 'distance_km', 'age']
categorical_features = ['gender', 'state', 'category', 'amount_category']

# Target variable
target = 'is_fraud'

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    new_balanced_credit_card_df.drop(columns=[target]),
    new_balanced_credit_card_df[target],
    test_size=0.2,
    random_state=42,
    stratify=new_balanced_credit_card_df[target]  # Keeps the class ratio equal in train and test sets
)

# Preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Creating the SVM model.
# probability=True is required for predict_proba; the probability estimates
# come from an internal, randomised cross-validation, so random_state is fixed
# to make the reported probabilities and ROC curve reproducible.
svm_model = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', SVC(probability=True, random_state=42))])

# Define the parameter grid for GridSearchCV.
# gamma only affects the 'rbf' and 'poly' kernels, so it is searched for those
# kernels only; crossing it with 'linear' would just repeat identical fits.
regularisation_strengths = [0.1, 1, 10, 100]
param_grid = [
    {
        'classifier__kernel': ['linear'],
        'classifier__C': regularisation_strengths,
    },
    {
        'classifier__kernel': ['rbf', 'poly'],
        'classifier__C': regularisation_strengths,
        'classifier__gamma': ['scale', 'auto'],
    },
]

# Initialize GridSearchCV
grid_search_svm = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search_svm.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them on the full training split
best_params_svm = grid_search_svm.best_params_
best_svm_model = grid_search_svm.best_estimator_

# Make predictions
y_pred_svm = best_svm_model.predict(X_test)

# Get predicted probabilities of the positive class for the ROC curve
y_probs_svm = best_svm_model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
classification_rep_svm = classification_report(y_test, y_pred_svm, target_names=['Not Fraud', 'Fraud'])

# Calculate ROC curve and AUC
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_probs_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

# Print results
print("Best Hyperparameters for SVM:", best_params_svm)
print(f"Accuracy for SVM: {accuracy_svm:.2f}")
print("Confusion Matrix for SVM:")
print(conf_matrix_svm)
print("Classification Report for SVM:")
print(classification_rep_svm)
print(f"ROC AUC for SVM: {roc_auc_svm:.2f}")

# Plotting Confusion Matrix (rows = true class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_svm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
plt.title('Confusion Matrix for SVM')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plotting ROC Curve
plt.figure(figsize=(8, 8))
plt.plot(fpr_svm, tpr_svm, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc_svm:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM')
plt.legend(loc='lower right')
plt.show()


## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc


# Logistic regression: 5-fold grid search over the regularisation strength C,
# then evaluation on a stratified 10% hold-out.

# Private copy of the balanced frame with a boolean target, parsed timestamps
# and the three calendar features. `assign` evaluates its arguments in order,
# so the calendar features read the already-parsed timestamp column.
logreg_df = new_balanced_credit_card_df.copy().assign(
    is_fraud=lambda frame: frame['is_fraud'].astype('bool'),
    trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),
    dob=lambda frame: pd.to_datetime(frame['dob']),
    transaction_hour=lambda frame: frame['trans_date_trans_time'].dt.hour,
    day_of_week=lambda frame: frame['trans_date_trans_time'].dt.dayofweek,
    transaction_month=lambda frame: frame['trans_date_trans_time'].dt.month,
)

# Columns fed to the model; anything not listed is dropped by the ColumnTransformer
numeric_features = [
    'zip', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop', 'cc_num', 'amt',
    'transaction_hour', 'day_of_week', 'transaction_month', 'distance_km', 'age',
]
categorical_features = ['gender', 'state', 'category', 'amount_category']

# Name of the label column
target = 'is_fraud'

# Stratified train/test split so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    logreg_df.drop(columns=[target]),
    logreg_df[target],
    test_size=0.1,
    random_state=42,
    stratify=logreg_df[target],
)

# Scale numeric columns, one-hot encode categorical ones
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing + classifier in one estimator, so each CV fold fits its own scaler/encoder
logreg_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000)),
])

# Search space ('classifier__' routes the parameter to the pipeline step)
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
}

grid_search_logreg = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search_logreg.fit(X_train, y_train)

# Winning parameters and the pipeline refitted with them on the whole training split
best_params_logreg = grid_search_logreg.best_params_
best_logreg_model = grid_search_logreg.best_estimator_

# Hard predictions and positive-class probabilities on the hold-out
y_pred_logreg = best_logreg_model.predict(X_test)
y_probs_logreg = best_logreg_model.predict_proba(X_test)[:, 1]

# Hold-out metrics
class_labels = ['Not Fraud', 'Fraud']
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg, target_names=class_labels)

fpr_logreg, tpr_logreg, thresholds_logreg = roc_curve(y_test, y_probs_logreg)
roc_auc_logreg = auc(fpr_logreg, tpr_logreg)

# ROC curve with the chance diagonal for reference
roc_fig, roc_ax = plt.subplots(figsize=(8, 8))
roc_ax.plot(fpr_logreg, tpr_logreg, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc_logreg:.2f}')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Logistic Regression')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrix heatmap (rows = true class, columns = predicted class)
cm_fig, cm_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_matrix_logreg, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Logistic Regression')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()

# Text summary
print("Best Hyperparameters for Logistic Regression:", best_params_logreg)
print(f"Accuracy for Logistic Regression: {accuracy_logreg:.2f}")
print("Confusion Matrix for Logistic Regression:", conf_matrix_logreg, sep="\n")
print("Classification Report for Logistic Regression:", classification_rep_logreg, sep="\n")
print(f"ROC AUC for Logistic Regression: {roc_auc_logreg:.2f}")


## Decision Tree

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# Decision tree: 5-fold grid search (scored by F1) on the training split, then
# evaluation on a stratified 10% hold-out.

# Create a new DataFrame for the decision tree
tree_df = new_balanced_credit_card_df.copy()

# Convert the target variable to binary
tree_df['is_fraud'] = tree_df['is_fraud'].astype('bool')

# Convert datetime features
tree_df['trans_date_trans_time'] = pd.to_datetime(tree_df['trans_date_trans_time'])
tree_df['dob'] = pd.to_datetime(tree_df['dob'])
tree_df['transaction_hour'] = tree_df['trans_date_trans_time'].dt.hour
tree_df['day_of_week'] = tree_df['trans_date_trans_time'].dt.dayofweek
tree_df['transaction_month'] = tree_df['trans_date_trans_time'].dt.month

# Selecting features; columns not listed here are dropped by the ColumnTransformer
numeric_features = ['zip', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop', 'cc_num', 'amt', 'transaction_hour', 'day_of_week', 'transaction_month', 'distance_km', 'age']
categorical_features = ['gender', 'state', 'category', 'amount_category']

# Target variable
target = 'is_fraud'

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    tree_df.drop(columns=[target]),
    tree_df[target],
    test_size=0.1,
    random_state=42,
    stratify=tree_df[target]  # Keeps the class ratio equal in train and test sets
)


class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns a fixed set of columns unchanged.

    Parameters
    ----------
    feature_names : list
        Column labels to select from the input in `transform`.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return `X[self.feature_names]` (X must support label-based column indexing)."""
        return X[self.feature_names]


# Creating the Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42)

# Creating the final pipeline: numeric columns are passed through unscaled,
# categorical columns are one-hot encoded.
pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', FeatureSelector(numeric_features), numeric_features),
            ('cat', Pipeline(steps=[
                ('selector', FeatureSelector(categorical_features)),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features)
        ])),
    ('classifier', tree_model)
])

# Hyperparameter tuning using GridSearchCV
param_grid = {'classifier__max_depth': [None, 5, 10, 15], 'classifier__min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_max_depth = grid_search.best_params_['classifier__max_depth']
best_min_samples_split = grid_search.best_params_['classifier__min_samples_split']
print(f"Best Max Depth: {best_max_depth}")
print(f"Best Min Samples Split: {best_min_samples_split}")

# GridSearchCV (refit=True by default) has already refitted the pipeline with
# the best hyperparameters on the whole training split, so reuse that estimator
# instead of rebuilding and refitting an identical pipeline by hand.
final_tree_model = grid_search.best_estimator_

# Make predictions
y_pred = final_tree_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud'])

# Calculate ROC curve and AUC from the predicted probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test, final_tree_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

# Plot Confusion Matrix with values
plt.figure(figsize=(8, 8))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

# Add values in each cell (imshow puts matrix row i on the y-axis, column j on the x-axis)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, str(conf_matrix[i, j]), horizontalalignment='center', verticalalignment='center')

# confusion_matrix rows are the true classes and columns the predicted classes,
# so the x-axis carries the predictions and the y-axis the true labels.
plt.xticks([0, 1], ['Predicted Not Fraud', 'Predicted Fraud'])
plt.yticks([0, 1], ['Actual Not Fraud', 'Actual Fraud'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Display the results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)
print(f"ROC AUC: {roc_auc:.2f}")


## Neural Network

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Neural network: one hidden layer trained on the preprocessed features and
# evaluated on a stratified 20% hold-out.
import random

import tensorflow as tf

# Best-effort reproducibility: seed the Python, NumPy and TensorFlow generators
# that weight initialisation and batch shuffling may draw from.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Build the calendar features on a private copy instead of assuming the shared
# frame already carries them: within this section they are only written to
# `new_balanced_credit_card_df` by the SVM cell.
# NOTE(review): earlier cells outside this section may also create them; the
# derivation below matches the SVM / logistic-regression / decision-tree cells.
nn_df = new_balanced_credit_card_df.copy()
nn_df['trans_date_trans_time'] = pd.to_datetime(nn_df['trans_date_trans_time'])
nn_df['transaction_hour'] = nn_df['trans_date_trans_time'].dt.hour
nn_df['day_of_week'] = nn_df['trans_date_trans_time'].dt.dayofweek
nn_df['transaction_month'] = nn_df['trans_date_trans_time'].dt.month

# Features and target variable
X = nn_df.drop(columns=['is_fraud'])
y = nn_df['is_fraud']

# Split the data into training and testing sets (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define numerical and categorical features; columns not listed here are
# dropped by the ColumnTransformer.
numeric_features = ['zip', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop', 'cc_num', 'amt', 'transaction_hour',
                    'day_of_week', 'transaction_month', 'distance_km', 'age']
categorical_features = ['gender', 'state', 'category', 'amount_category']

# Preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers.
# sparse_threshold=0 makes the transformer always return a dense array. With the
# default threshold the output type (sparse vs. dense) depends on the data's
# density, and an unconditional `.toarray()` fails whenever it comes back dense.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Fit the preprocessing on the training split only, then apply it to both splits
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Dense NumPy inputs for Keras
X_train_preprocessed_np = np.asarray(X_train_preprocessed)
X_test_preprocessed_np = np.asarray(X_test_preprocessed)

# Build a simple neural network: one 64-unit ReLU layer and a sigmoid output
model = Sequential()
model.add(Dense(64, input_dim=X_train_preprocessed_np.shape[1], activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model; 20% of the training split is held back for validation
model.fit(X_train_preprocessed_np, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set (evaluate returns [loss, accuracy])
accuracy = model.evaluate(X_test_preprocessed_np, y_test)[1]
print(f"Test Accuracy: {accuracy:.2f}")

# Predicted fraud probabilities; predict returns shape (n, 1), so flatten to
# the 1-D form the sklearn metrics expect.
y_pred = model.predict(X_test_preprocessed_np).ravel()
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)
classification_rep = classification_report(y_test, y_pred_binary, target_names=['Not Fraud', 'Fraud'])

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

# Plot Confusion Matrix with values
plt.figure(figsize=(8, 8))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

# Add values in each cell (imshow puts matrix row i on the y-axis, column j on the x-axis)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        plt.text(j, i, str(conf_matrix[i, j]), horizontalalignment='center', verticalalignment='center')

# confusion_matrix rows are the true classes and columns the predicted classes,
# so the x-axis carries the predictions and the y-axis the true labels.
plt.xticks([0, 1], ['Predicted Not Fraud', 'Predicted Fraud'])
plt.yticks([0, 1], ['Actual Not Fraud', 'Actual Fraud'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)
print(f"ROC AUC: {roc_auc:.2f}")

KeyboardInterrupt: 