In [8]:
%pip install scikit-learn
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def process_transaction(transaction_ref):
    # Split string using /
    parts = str(transaction_ref).split("/")
    
    # Extract Payment Type (first token, lowercase)
    payment_type = parts[0].lower()
    
    # Extract Merchant (index 3, lowercase, stripped)
    merchant = None
    if len(parts) > 3:
        merchant = parts[3].lower().strip()
    
    # Categorize Merchant using keyword matching
    category = "others"
    if merchant:
        if any(kw in merchant for kw in ['swiggy', 'zomato', 'restaurant', 'pizza', 'kfc', 'mcdonalds']):
            category = "food"
        elif any(kw in merchant for kw in ['netflix', 'spotify', 'prime', 'hotstar', 'multiplex', 'cinema']):
            category = "entertainment"
        elif any(kw in merchant for kw in ['udemy', 'coursera', 'byjus', 'edx', 'school', 'college']):
            category = "education"
        elif any(kw in merchant for kw in ['uber', 'ola', 'rapido', 'irctc', 'indigo', 'airindia', 'railway']):
            category = "travel"
        elif any(kw in merchant for kw in ['amazon', 'flipkart', 'myntra', 'dmart', 'bigbazaar', 'ajio', 'reliance']):
            category = "shopping"
        elif any(kw in merchant for kw in ['electricity', 'water', 'gas', 'bill', 'bescom', 'recharge']):
            category = "utilities"
        elif any(kw in merchant for kw in ['salary', 'payroll', 'stipend']):
            category = "salary"
        elif any(kw in merchant for kw in ['apollo', 'pharmeasy', 'hospital', 'medical', 'pharmacy', 'health']):
            category = "medical"
        elif any(kw in merchant for kw in ['jio', 'vi', 'airtel', 'bsnl', 'telecom']):
            category = "recharge"
            
    return payment_type, category

# 1. Load the dataset
df = pd.read_csv('./data/RawDataset.csv')

# 2. Add payment_type and category columns
source_col = 'Description' if 'Description' in df.columns else 'Transaction Reference'
df[['payment_type', 'category']] = df.apply(lambda x: process_transaction(x[source_col]), axis=1, result_type='expand')

# 3. Convert Value Date to datetime and extract features
df['Value Date'] = pd.to_datetime(df['Value Date'], dayfirst=True)
df['day'] = df['Value Date'].dt.day
df['month'] = df['Value Date'].dt.month
df['year'] = df['Value Date'].dt.year

# 4. Remove specified columns
# Columns to remove: Txn Date, Description (source_col), Balance
cols_to_drop = ['Txn Date', source_col, 'Balance']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# 5. Convert payment_type and category to numeric
le_payment = LabelEncoder()
le_category = LabelEncoder()

df['payment_type'] = le_payment.fit_transform(df['payment_type'])
df['category'] = le_category.fit_transform(df['category'])

# Fill remaining NA values with 0
df.fillna(0, inplace=True)

# Show the first few rows of the processed dataset
print(df.head())



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
  Value Date          Ref No./Cheque No.    Debit    Credit  payment_type  \
0 2022-01-01  TRANSFER FROM 952735718030     0.00  125213.0             1   
1 2022-01-01    TRANSFER TO 711995879155  1665.43       0.0             2   
2 2022-01-01    TRANSFER TO 821490367535  1563.68       0.0             0   
3 2022-01-01    TRANSFER TO 183967131867  3419.58       0.0             1   
4 2022-01-01    TRANSFER TO 858057510226  6077.98       0.0             3   

   category  day  month  year  
0         4    1      1  2022  
1         5    1      1  2022  
2         5    1      1  2022  
3         5    1      1  2022  
4         5    1      1  2022  


In [14]:
df.drop(columns=["Ref No./Cheque No.","Value Date"], errors="ignore", inplace=True)

# Select numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Apply MinMax scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [15]:
df.head(30)

Unnamed: 0,Debit,Credit,payment_type,category,day,month,year
0,0.0,0.196161,0.333333,0.666667,1,1,2022
1,0.001808,0.0,0.666667,0.833333,1,1,2022
2,0.001697,0.0,0.0,0.833333,1,1,2022
3,0.003712,0.0,0.333333,0.833333,1,1,2022
4,0.006598,0.0,1.0,0.833333,1,1,2022
5,0.081414,0.0,0.666667,0.0,1,1,2022
6,0.081414,0.0,1.0,0.166667,1,1,2022
7,0.081414,0.0,1.0,0.333333,1,1,2022
8,0.002322,0.0,0.333333,0.0,1,1,2022
9,0.002152,0.0,1.0,0.333333,1,1,2022
