In [1]:
import pandas as pd

In [2]:
# Load the dataset into a Pandas DataFrame and display the first 5 rows.
# read dataset csv
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Type,Merchant,Category,Amount,Payment_Mode,Transaction_Status,Location
0,100000.0,4452,2023-01-01,Online,Walmart,Travel,4520.7,Debit Card,Approved,Jonesport
1,100001.0,2775,2023-01-01,ATM,BestBuy,Travel,1437.85,Debit Card,Approved,Port Jennifer
2,100002.0,2259,2023-01-01,Mobile Payment,Uber,Clothing,3320.52,PayPal,Approved,Port James
3,100003.0,4545,2023-01-01,Online,BestBuy,Travel,2659.96,Debit Card,Approved,Hawkinston
4,100004.0,2137,2023-01-01,ATM,Amazon,Travel,2517.07,Debit Card,Approved,Matthewland


In [3]:
# Check the shape, column names, and summary statistics of the dataset.
print(df.shape)
print(df.describe()) # only int value

(5500, 10)
       Transaction_ID  Customer_ID       Amount
count     5499.000000  5500.000000  5498.000000
mean    102749.998727  2989.954182  2492.286513
std       1587.571099  1153.928878  1444.710273
min     100000.000000  1000.000000     6.550000
25%     101375.500000  1988.000000  1236.765000
50%     102750.000000  3000.000000  2503.355000
75%     104124.500000  3990.000000  3768.332500
max     105499.000000  4998.000000  4997.490000


In [4]:
# print columns name 
print(df.columns.tolist())

['Transaction_ID', 'Customer_ID', 'Transaction_Date', 'Transaction_Type', 'Merchant', 'Category', 'Amount', 'Payment_Mode', 'Transaction_Status', 'Location']


In [5]:
# Identify and handle missing values (fill or drop based on the data type).
print(df.isnull().sum())

Transaction_ID        1
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              4
Amount                2
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64


In [6]:
# handel missing value
print(df.dtypes)

Transaction_ID        float64
Customer_ID             int64
Transaction_Date       object
Transaction_Type       object
Merchant               object
Category               object
Amount                float64
Payment_Mode           object
Transaction_Status     object
Location               object
dtype: object


In [12]:
num_cols = ['Customer_ID','Amount','Transaction_ID']
categorical_col = ['Transaction_Type','Merchant','Category','Payment_Mode','Transaction_Status','Location']

for col in num_cols:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)


In [13]:
print(df.isnull().sum())

Transaction_ID        0
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              4
Amount                0
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64


In [15]:
for col in categorical_col:
    if df[col].isnull().any():
        df[col].fillna("unknown",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("unknown",inplace=True)


In [16]:
print(df.isnull().sum())

Transaction_ID        0
Customer_ID           0
Transaction_Date      0
Transaction_Type      0
Merchant              0
Category              0
Amount                0
Payment_Mode          0
Transaction_Status    0
Location              0
dtype: int64


In [17]:
# delete row missing value > 50%
df = df.dropna(thresh=len(df)*0.5,axis=1)


In [19]:
df.dtypes

Transaction_ID        float64
Customer_ID             int64
Transaction_Date       object
Transaction_Type       object
Merchant               object
Category               object
Amount                float64
Payment_Mode           object
Transaction_Status     object
Location               object
dtype: object

In [20]:
# Convert Transaction_Date into datetime format and extract year, month, and day as new columns.
# convert date 
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])

In [21]:
df.dtypes

Transaction_ID               float64
Customer_ID                    int64
Transaction_Date      datetime64[ns]
Transaction_Type              object
Merchant                      object
Category                      object
Amount                       float64
Payment_Mode                  object
Transaction_Status            object
Location                      object
dtype: object

In [22]:
# year , month , day
df['Year'] = df['Transaction_Date'].dt.year
df['Month'] = df['Transaction_Date'].dt.month
df['Day'] = df['Transaction_Date'].dt.day

In [23]:
df.head()

Unnamed: 0,Transaction_ID,Customer_ID,Transaction_Date,Transaction_Type,Merchant,Category,Amount,Payment_Mode,Transaction_Status,Location,Year,Month,Day
0,100000.0,4452,2023-01-01,Online,Walmart,Travel,4520.7,Debit Card,Approved,Jonesport,2023,1,1
1,100001.0,2775,2023-01-01,ATM,BestBuy,Travel,1437.85,Debit Card,Approved,Port Jennifer,2023,1,1
2,100002.0,2259,2023-01-01,Mobile Payment,Uber,Clothing,3320.52,PayPal,Approved,Port James,2023,1,1
3,100003.0,4545,2023-01-01,Online,BestBuy,Travel,2659.96,Debit Card,Approved,Hawkinston,2023,1,1
4,100004.0,2137,2023-01-01,ATM,Amazon,Travel,2517.07,Debit Card,Approved,Matthewland,2023,1,1


In [29]:
# Retrieve all transactions made in January 2024.
jan_2023_transactions = df[(df['Year']== 2023) & (df['Month'] == 1)]
print(jan_2023_transactions)
jan_2023_transactions.to_csv("jan_2023_transactions.csv")

     Transaction_ID  Customer_ID Transaction_Date Transaction_Type  \
0          100000.0         4452       2023-01-01           Online   
1          100001.0         2775       2023-01-01              ATM   
2          100002.0         2259       2023-01-01   Mobile Payment   
3          100003.0         4545       2023-01-01           Online   
4          100004.0         2137       2023-01-01              ATM   
..              ...          ...              ...              ...   
739        100739.0         3898       2023-01-31              POS   
740        100740.0         2457       2023-01-31   Mobile Payment   
741        100741.0         2107       2023-01-31   Mobile Payment   
742        100742.0         2678       2023-01-31              POS   
743        100743.0         4586       2023-01-31              ATM   

        Merchant     Category   Amount Payment_Mode Transaction_Status  \
0        Walmart       Travel  4520.70   Debit Card           Approved   
1        Be

In [31]:
# Find transactions where Amount > 4000 and Transaction_Type is "Online".
high_online_transactions = df[(df['Amount'] > 4000) & (df['Transaction_Type']== "Online")]
print(high_online_transactions)

      Transaction_ID  Customer_ID Transaction_Date Transaction_Type  \
0           100000.0         4452       2023-01-01           Online   
22          100022.0         3057       2023-01-01           Online   
26          100026.0         4254       2023-01-02           Online   
27          100027.0         1719       2023-01-02           Online   
34          100034.0         2424       2023-01-02           Online   
...              ...          ...              ...              ...   
5326        105326.0         1662       2023-08-10           Online   
5341        105341.0         2011       2023-08-11           Online   
5395        105395.0         3551       2023-08-13           Online   
5446        105446.0         4468       2023-08-15           Online   
5461        105461.0         1140       2023-08-16           Online   

         Merchant       Category   Amount Payment_Mode Transaction_Status  \
0         Walmart         Travel  4520.70   Debit Card           Appro