In [1]:
# Data Exploration and Cleaning
# Load the dataset into a Pandas DataFrame and display the first 5 rows.
# Check the shape, column names, and summary statistics of the dataset.
# Identify and handle missing values (fill or drop based on the data type).
# Convert Transaction_Date into datetime format and extract year, month, and day as new columns.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# add new plot look theme
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6) # fixed figure size for all plots

In [3]:
# Load the dataset into a Pandas DataFrame and display the first 5 rows.
data = pd.read_csv('credit_card_transactions.csv')
print(data.head())

   Transaction_ID  Customer_ID Transaction_Date Transaction_Type Merchant  \
0               1          103       2025-01-01              ATM  BestBuy   
1               2          271       2025-01-01           Online     eBay   
2               3          107       2025-01-01              ATM   Amazon   
3               4           72       2025-01-01              POS   Amazon   
4               5          189       2025-01-01              ATM   Amazon   

        Category  Amount Payment_Mode Transaction_Status       Location  
0         Dining  360.10  Credit Card            Pending  San Francisco  
1         Travel  357.02  Credit Card           Declined        Chicago  
2  Entertainment  829.41  Credit Card            Pending        Chicago  
3      Groceries  790.35  Credit Card            Pending    Los Angeles  
4       Clothing  311.26  Credit Card           Declined  San Francisco  


In [4]:
# Check the shape, column names, and summary statistics of the dataset.
print("Shape of the dataset:", data.shape)
print("Column names:", data.columns.tolist())
print("Summary statistics:\n", data.describe())

Shape of the dataset: (1200, 10)
Column names: ['Transaction_ID', 'Customer_ID', 'Transaction_Date', 'Transaction_Type', 'Merchant', 'Category', 'Amount', 'Payment_Mode', 'Transaction_Status', 'Location']
Summary statistics:
        Transaction_ID  Customer_ID       Amount
count     1200.000000  1200.000000  1200.000000
mean       600.500000   150.931667   494.489692
std        346.554469    86.144962   283.821837
min          1.000000     1.000000     5.030000
25%        300.750000    78.000000   243.565000
50%        600.500000   150.500000   495.090000
75%        900.250000   225.250000   734.625000
max       1200.000000   300.000000   999.560000


In [5]:
# Identify and handle missing values (fill or drop based on the data type).
# check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Transaction_ID        0
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              0
Amount                0
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64


In [6]:
# check data types
print("Data types of each column:\n", data.dtypes)

Data types of each column:
 Transaction_ID          int64
Customer_ID             int64
Transaction_Date       object
Transaction_Type       object
Merchant               object
Category               object
Amount                float64
Payment_Mode           object
Transaction_Status     object
Location               object
dtype: object


In [7]:
# change Transaction_Date to datetime
data['Transaction_Date'] = pd.to_datetime(data['Transaction_Date'])

In [8]:
# check data types
print("Data types of each column:\n", data.dtypes)

Data types of each column:
 Transaction_ID                 int64
Customer_ID                    int64
Transaction_Date      datetime64[ns]
Transaction_Type              object
Merchant                      object
Category                      object
Amount                       float64
Payment_Mode                  object
Transaction_Status            object
Location                      object
dtype: object


In [9]:
data.columns

Index(['Transaction_ID', 'Customer_ID', 'Transaction_Date', 'Transaction_Type',
       'Merchant', 'Category', 'Amount', 'Payment_Mode', 'Transaction_Status',
       'Location'],
      dtype='object')

In [10]:
# add new columns for year, month, day
data['Year'] = data['Transaction_Date'].dt.year
data['Month'] = data['Transaction_Date'].dt.month
data['Day'] = data['Transaction_Date'].dt.day

In [11]:
print(data[['Transaction_Date','Year','Month','Day']].head())

  Transaction_Date  Year  Month  Day
0       2025-01-01  2025      1    1
1       2025-01-01  2025      1    1
2       2025-01-01  2025      1    1
3       2025-01-01  2025      1    1
4       2025-01-01  2025      1    1


In [12]:
# Retrieve all transactions made in January 2025.
jan_2025 = data[(data['Year'] == 2025) & (data['Month'] == 1)]
jan_2025.head()
# export jan_2025 to csv
jan_2025.to_csv('january_2025_transactions.csv', index=False)

In [13]:
# Find transactions where Amount > 700 and Transaction_Type is "Online".
online_transactions = data[(data['Amount'] > 700) & (data['Transaction_Type'] == "Online")]

In [14]:
# Select only Approved transactions from the dataset.
approved_transactions = data[data['Transaction_Status'] == 'Approved']
# give me number of rows count where Status is Approved
approved_transactions = data[data['Transaction_Status'] == 'Approved']
print("Number of Approved transactions:", len(approved_transactions))

Number of Approved transactions: 399


In [15]:
# Create a new column Discounted_Amount, assuming a 5% discount on all transactions above 500.
data['Discounted_Amount'] = data['Amount']

mask = data['Amount'] > 500

data.loc[mask, 'Discounted_Amount'] = data.loc[mask, 'Amount'] * 0.95

print(data[['Amount', 'Discounted_Amount']].head())

   Amount  Discounted_Amount
0  360.10           360.1000
1  357.02           357.0200
2  829.41           787.9395
3  790.35           750.8325
4  311.26           311.2600


In [16]:
# amount category 
# 1 low -> Below $100
# 2 medium -> Between $100 - $500
# 3 high -> Above $500

def amount_cat(x):
    if x < 100:
        return "Low"
    elif x <= 500:
        return "Medium"
    else:
        return "High"       
    
data['Amount_Category']  = data['Amount'].apply(amount_cat)
print(data[['Amount','Discounted_Amount','Amount_Category']].head())

   Amount  Discounted_Amount Amount_Category
0  360.10           360.1000          Medium
1  357.02           357.0200          Medium
2  829.41           787.9395            High
3  790.35           750.8325            High
4  311.26           311.2600          Medium


In [17]:
# total amount per category 
total_cat = data.groupby('Category')['Amount'].sum()
print("Total Amount per Category:\n", total_cat)

Total Amount per Category:
 Category
Clothing         90207.10
Dining           85015.63
Electronics      76672.25
Entertainment    78467.94
Groceries        90815.72
Health           85100.25
Travel           87108.74
Name: Amount, dtype: float64


In [18]:
# Determine the number of declined transactions per Payment_Mode.
declined = data[data['Transaction_Status'] == 'Declined']
declined_by_mode = declined['Payment_Mode'].value_counts()
print("Number of Declined transactions per Payment_Mode:\n", declined_by_mode)

Number of Declined transactions per Payment_Mode:
 Payment_Mode
Debit Card     216
Credit Card    200
Name: count, dtype: int64


In [19]:
# Identify the top 5 most frequent merchants based on transaction count.
top_merchants = data['Merchant'].value_counts().head(5)
print("Top 5 most frequent merchants:\n", top_merchants)

Top 5 most frequent merchants:
 Merchant
BestBuy    192
Target     179
eBay       177
Expedia    173
Amazon     171
Name: count, dtype: int64


In [20]:
# check customer info dataset 
customer_info = pd.read_csv('customer_info.csv')
# print(customer_info.head())

# Fraud Detection Indicators
# Find customers who made more than 10 transactions in a single day (potential fraud).
# step no 1 create a new column with date only from Transaction_Date
data['Date'] = data['Transaction_Date'].dt.date
# print(data['Date'].head())

# step no 2 group by Customer_ID and Date and count number of transactions
daily = data.groupby(['Customer_ID','Date']).size()
# print(daily.head(10))

# step no 3 filter customers with more than 10 transactions in a day
month_per_day = daily[daily > 2].reset_index()
print("Customers with more than 10 transactions in a single day:\n", month_per_day)

Customers with more than 10 transactions in a single day:
    Customer_ID        Date  0
0          104  2025-02-06  3
1          113  2025-01-07  3
2          157  2025-02-07  4
3          173  2025-01-23  3


In [21]:
data.head()

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Type,Merchant,Category,Amount,Payment_Mode,Transaction_Status,Location,Year,Month,Day,Discounted_Amount,Amount_Category,Date
0,1,103,2025-01-01,ATM,BestBuy,Dining,360.1,Credit Card,Pending,San Francisco,2025,1,1,360.1,Medium,2025-01-01
1,2,271,2025-01-01,Online,eBay,Travel,357.02,Credit Card,Declined,Chicago,2025,1,1,357.02,Medium,2025-01-01
2,3,107,2025-01-01,ATM,Amazon,Entertainment,829.41,Credit Card,Pending,Chicago,2025,1,1,787.9395,High,2025-01-01
3,4,72,2025-01-01,POS,Amazon,Groceries,790.35,Credit Card,Pending,Los Angeles,2025,1,1,750.8325,High,2025-01-01
4,5,189,2025-01-01,ATM,Amazon,Clothing,311.26,Credit Card,Declined,San Francisco,2025,1,1,311.26,Medium,2025-01-01


In [22]:
# Find transactions where Amount > 800 and Transaction_Type is Online (flag as high-risk)
data["High_Risk"] = (data['Amount'] > 800) & (data['Transaction_Type'] == 'Online')
print(data[['Amount','Transaction_Type','High_Risk']].head())

   Amount Transaction_Type  High_Risk
0  360.10              ATM      False
1  357.02           Online      False
2  829.41              ATM      False
3  790.35              POS      False
4  311.26              ATM      False


In [25]:
# count 
print("Number of High Risk transactions:", data['High_Risk'].sum())

Number of High Risk transactions: 78
