In [18]:
import pandas as pd
# load dataset 
df = pd.read_csv('credit_card_transactions.csv')
print(df.head()) # print top 5 row

   Transaction_ID  Customer_ID Transaction_Date Transaction_Type Merchant  \
0          100000         4452       2023-01-01           Online  Walmart   
1          100001         2775       2023-01-01              ATM  BestBuy   
2          100002         2259       2023-01-01   Mobile Payment     Uber   
3          100003         4545       2023-01-01           Online  BestBuy   
4          100004         2137       2023-01-01              ATM   Amazon   

   Category   Amount Payment_Mode Transaction_Status       Location  
0    Travel  4520.70   Debit Card           Approved      Jonesport  
1    Travel  1437.85   Debit Card           Approved  Port Jennifer  
2  Clothing  3320.52       PayPal           Approved     Port James  
3    Travel  2659.96   Debit Card           Approved     Hawkinston  
4    Travel  2517.07   Debit Card           Approved    Matthewland  


In [19]:
# Shape 
print(df.shape)

(5500, 10)


In [20]:
# Column Name 
df.columns.to_list()

['Transaction_ID',
 'Customer_ID',
 'Transaction_Date',
 'Transaction_Type',
 'Merchant',
 'Category',
 'Amount',
 'Payment_Mode',
 'Transaction_Status',
 'Location']

In [21]:
# summary
df.describe()

Unnamed: 0,Transaction_ID,Customer_ID,Amount
count,5500.0,5500.0,5500.0
mean,102749.5,2989.954182,2492.109224
std,1587.857571,1153.928878,1444.493842
min,100000.0,1000.0,6.55
25%,101374.75,1988.0,1236.935
50%,102749.5,3000.0,2502.405
75%,104124.25,3990.0,3767.9575
max,105499.0,4998.0,4997.49


In [22]:
# check missing values 
df.isnull().sum()

Transaction_ID        0
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              0
Amount                0
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64

In [23]:
# handle missing values = 'Unknown'
df['Merchant'] = df['Merchant'].fillna('Unknown')
# check again missing values 
df.isnull().sum()

Transaction_ID        0
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              0
Amount                0
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64

In [24]:
# check dtype
df.dtypes

Transaction_ID          int64
Customer_ID             int64
Transaction_Date       object
Transaction_Type       object
Merchant               object
Category               object
Amount                float64
Payment_Mode           object
Transaction_Status     object
Location               object
dtype: object

In [25]:
# convert to datetime
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])
df.dtypes

Transaction_ID                 int64
Customer_ID                    int64
Transaction_Date      datetime64[ns]
Transaction_Type              object
Merchant                      object
Category                      object
Amount                       float64
Payment_Mode                  object
Transaction_Status            object
Location                      object
dtype: object

In [26]:
df.columns.to_list()

['Transaction_ID',
 'Customer_ID',
 'Transaction_Date',
 'Transaction_Type',
 'Merchant',
 'Category',
 'Amount',
 'Payment_Mode',
 'Transaction_Status',
 'Location']

In [27]:
# year , month , day
df['year'] = df['Transaction_Date'].dt.year
df['month'] = df['Transaction_Date'].dt.month
df['day'] = df['Transaction_Date'].dt.day

In [28]:
# check columns
df.columns.to_list()

['Transaction_ID',
 'Customer_ID',
 'Transaction_Date',
 'Transaction_Type',
 'Merchant',
 'Category',
 'Amount',
 'Payment_Mode',
 'Transaction_Status',
 'Location',
 'year',
 'month',
 'day']

In [29]:
df.head()

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Type,Merchant,Category,Amount,Payment_Mode,Transaction_Status,Location,year,month,day
0,100000,4452,2023-01-01,Online,Walmart,Travel,4520.7,Debit Card,Approved,Jonesport,2023,1,1
1,100001,2775,2023-01-01,ATM,BestBuy,Travel,1437.85,Debit Card,Approved,Port Jennifer,2023,1,1
2,100002,2259,2023-01-01,Mobile Payment,Uber,Clothing,3320.52,PayPal,Approved,Port James,2023,1,1
3,100003,4545,2023-01-01,Online,BestBuy,Travel,2659.96,Debit Card,Approved,Hawkinston,2023,1,1
4,100004,2137,2023-01-01,ATM,Amazon,Travel,2517.07,Debit Card,Approved,Matthewland,2023,1,1


In [30]:
# Retrieve all transactions made in January 2023.
jan_2023 = df[(df['year'] == 2023) & (df['month'] == 1) ]
print(jan_2023)
df.to_csv('jan_2023.csv')

     Transaction_ID  Customer_ID Transaction_Date Transaction_Type  \
0            100000         4452       2023-01-01           Online   
1            100001         2775       2023-01-01              ATM   
2            100002         2259       2023-01-01   Mobile Payment   
3            100003         4545       2023-01-01           Online   
4            100004         2137       2023-01-01              ATM   
..              ...          ...              ...              ...   
739          100739         3898       2023-01-31              POS   
740          100740         2457       2023-01-31   Mobile Payment   
741          100741         2107       2023-01-31   Mobile Payment   
742          100742         2678       2023-01-31              POS   
743          100743         4586       2023-01-31              ATM   

        Merchant     Category   Amount Payment_Mode Transaction_Status  \
0        Walmart       Travel  4520.70   Debit Card           Approved   
1        Be

In [31]:
# Find transactions where Amount > 1000 and Transaction_Type is "Online".

high_online = df[(df['Amount'] > 1000) & (df['Transaction_Type'] == 'Online')]
high_online.to_csv('high_online.csv')

In [32]:
# Select only Approved transactions from the dataset.

approved = df[df['Transaction_Status'] == 'Approved']
approved.to_csv('approved.csv')

In [33]:
# Create a new column Discounted_Amount, assuming a 5% discount on all transactions above $2000

df['Discounted_Amount'] = df['Amount'].apply(lambda x : x * 0.95 if x > 2000 else x)

df[['Discounted_Amount','Amount']]

Unnamed: 0,Discounted_Amount,Amount
0,4294.6650,4520.70
1,1437.8500,1437.85
2,3154.4940,3320.52
3,2526.9620,2659.96
4,2391.2165,2517.07
...,...,...
5495,4327.0790,4554.82
5496,3912.5845,4118.51
5497,2832.0735,2981.13
5498,2059.9705,2168.39


In [34]:
data_d = df[df['Amount'] > 2000]
print(data_d)

      Transaction_ID  Customer_ID Transaction_Date Transaction_Type  \
0             100000         4452       2023-01-01           Online   
2             100002         2259       2023-01-01   Mobile Payment   
3             100003         4545       2023-01-01           Online   
4             100004         2137       2023-01-01              ATM   
5             100005         4096       2023-01-01              ATM   
...              ...          ...              ...              ...   
5491          105491         3258       2023-08-17   Mobile Payment   
5495          105495         3262       2023-08-17              ATM   
5496          105496         1234       2023-08-18              POS   
5497          105497         3978       2023-08-18           Online   
5498          105498         3082       2023-08-18              ATM   

         Merchant   Category   Amount Payment_Mode Transaction_Status  \
0         Walmart     Travel  4520.70   Debit Card           Approved   
2

In [39]:
# Categorize the Transaction_Amount into Low, Medium, and High based on:
# Low: Below $1000
# Medium: Between $1000 - $5000
# High: Above $5000

def categorize_amount(amount):
    if amount < 1500:
        return "low"
    elif 1501 <= amount <= 3000:
        return "Medium"
    else:
        return "High"
    

df['Amount_category'] = df['Amount'].apply(categorize_amount)    

In [40]:
df.head()

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Type,Merchant,Category,Amount,Payment_Mode,Transaction_Status,Location,year,month,day,Discounted_Amount,Amount_category
0,100000,4452,2023-01-01,Online,Walmart,Travel,4520.7,Debit Card,Approved,Jonesport,2023,1,1,4294.665,High
1,100001,2775,2023-01-01,ATM,BestBuy,Travel,1437.85,Debit Card,Approved,Port Jennifer,2023,1,1,1437.85,low
2,100002,2259,2023-01-01,Mobile Payment,Uber,Clothing,3320.52,PayPal,Approved,Port James,2023,1,1,3154.494,High
3,100003,4545,2023-01-01,Online,BestBuy,Travel,2659.96,Debit Card,Approved,Hawkinston,2023,1,1,2526.962,Medium
4,100004,2137,2023-01-01,ATM,Amazon,Travel,2517.07,Debit Card,Approved,Matthewland,2023,1,1,2391.2165,Medium


In [43]:
# Drop the Merchant column if more than 30% of values are missing.
# missing_percent = df['Merchant'].isnull.mean()

In [44]:
# Aggregation and Insights
# Find the total transaction amount per Category.

total_per_cat = df.groupby('Category')['Amount'].sum().reset_index()
print(total_per_cat)

        Category      Amount
0       Clothing  2185530.09
1         Dining  2402259.10
2    Electronics  2319437.34
3  Entertainment  2243778.43
4      Groceries  2386772.81
5         Travel  2168822.96


In [45]:
# Determine the number of declined transactions per Payment_Mode.
declined_per_mode = df[df['Transaction_Status'] == 'Declined'].groupby('Payment_Mode').size().reset_index(name='count')
print(declined_per_mode)

  Payment_Mode  count
0  Credit Card    117
1   Debit Card    145
2       PayPal    135
3          UPI    134


In [46]:
# Identify the top 5 most frequent merchants based on transaction count.
top_merchants = df['Merchant'].value_counts().head(5).reset_index(name='count')
print(top_merchants)

  Merchant  count
0     Uber    833
1  Walmart    818
2  BestBuy    790
3  Netflix    774
4   Amazon    770


In [49]:
# Find the average transaction amount per Location.
# avg_per_location = df.groupby('Location')['Amount'].mean().reset_index('average')
# print(avg_per_location)

In [53]:
# Find customers who made more than 10 transactions in a single day (potential fraud).

df['date_only'] = df['Transaction_Date'].dt.date

trans_per_day = df.groupby(['Customer_ID','date_only',]).size().reset_index(name='count')

# print(trans_per_day)

high_trans = trans_per_day[trans_per_day['count'] > 2]
print(high_trans)


Empty DataFrame
Columns: [Customer_ID, date_only, count]
Index: []
