# Advanced UPI Transactions Analysis & Forecasting

Comprehensive ML & DL pipeline on UPI transactions: EDA, feature engineering, classical ML (Linear Regression, Random Forest, XGBoost), and an optional LSTM time-series model.

In [None]:
# Cell 1 — Setup (Colab-ready)
!pip install -q xgboost joblib

import os, warnings, math
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
import xgboost as xgb

# For LSTM
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

print('Setup complete. Versions: pandas', pd.__version__, 'xgboost', xgb.__version__)

In [None]:
# Cell 2 — Load dataset (Colab upload)
# files.upload() opens a file picker; its keys are used below as paths that
# pd.read_csv can open.
from google.colab import files

print('Please upload the CSV file (e.g., upi-transactions-p2p-and-p2m.csv)')
uploaded = files.upload()
if not uploaded:
    # Without this guard `data_path` was never bound on an empty/cancelled
    # upload and the cell failed with a NameError (or silently reused a stale
    # path left over from a previous run).
    raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

# When several files are uploaded the last one is used, as before.
data_path = list(uploaded.keys())[-1]
print('Loaded:', data_path)

df = pd.read_csv(data_path)
df.head(10)

In [None]:
# Cell 3 — Data overview
# Strip header whitespace first so the names printed here are the names every
# later cell actually matches against.
df.columns = [c.strip() for c in df.columns]
print('Shape:', df.shape)
print('\nColumns:\n', df.columns)
# DataFrame.info() prints its report and returns None; wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()
display(df.describe().T)

In [None]:
# Cell 4 — Preprocessing: parse month/date and clean numeric columns
# The reporting period is expected in a column named 'month'.
date_col = 'month'
parsed_dates = pd.to_datetime(df[date_col], errors='coerce')  # unparseable entries become NaT
df[date_col] = parsed_dates
df = df.sort_values(date_col).reset_index(drop=True)

# Name fragments that flag a column as holding a monetary figure.
value_hints = ('val', 'value', 'amt', 'amount', 'total')
possible_val_cols = [
    col for col in df.columns
    if any(hint in col.lower() for hint in value_hints)
]

# Strip thousands separators and currency markers, then coerce to numbers;
# anything that still fails to parse becomes NaN.
for col in possible_val_cols:
    cleaned = df[col].astype(str)
    for symbol in (',', '₹', 'Rs', 'rs'):
        cleaned = cleaned.str.replace(symbol, '')
    df[col] = pd.to_numeric(cleaned, errors='coerce')

print('Date column used:', date_col)
print('Potential value cols:', possible_val_cols)
df.head()

In [None]:
# Cell 5 — EDA & Visualizations
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old names were removed afterwards, so plt.style.use('seaborn') raises on
# current installs. Pick whichever name this Matplotlib ships; if neither is
# present, fall back to seaborn's own default theme.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
else:
    sns.set_theme()

# Target column for the trend plots: the first column whose name looks like a
# value/total, otherwise the first numeric column.
total_cols = [c for c in df.columns if 'total' in c.lower() or 'value' in c.lower() or 'val' in c.lower()]
if len(total_cols) == 0:
    total_col = df.select_dtypes(include=[np.number]).columns[0]
else:
    total_col = total_cols[0]

print('Using', total_col, 'for trend analysis')

# Trend of the target over time.
plt.figure(figsize=(12, 5))
plt.plot(df[date_col], df[total_col], marker='o')
plt.title(f'{total_col} over time')
plt.xlabel('Date'); plt.ylabel(total_col); plt.grid(True); plt.show()

# P2P vs P2M comparison, drawn only when both kinds of column exist.
p2p_cols = [c for c in df.columns if 'p2p' in c.lower()]
p2m_cols = [c for c in df.columns if 'p2m' in c.lower()]
if p2p_cols and p2m_cols:
    plt.figure(figsize=(12, 5))
    plt.plot(df[date_col], df[p2p_cols[0]], label='p2p')
    plt.plot(df[date_col], df[p2m_cols[0]], label='p2m')
    plt.legend(); plt.title('P2P vs P2M'); plt.show()

# Distribution of the target (missing values excluded).
plt.figure(figsize=(10, 4))
sns.histplot(df[total_col].dropna(), kde=True)
plt.title(f'Distribution of {total_col}'); plt.show()

# Pairwise correlation between all numeric columns.
plt.figure(figsize=(10, 8))
num_df = df.select_dtypes(include=[np.number])
sns.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation heatmap'); plt.show()

In [None]:
# Cell 6 — Feature Engineering
data = df.copy()

# Calendar features: a running index for trend plus a cyclic month encoding so
# December and January end up adjacent.
data['month_idx'] = np.arange(len(data))
data['month_num'] = data[date_col].dt.month
data['year'] = data[date_col].dt.year
data['month_sin'] = np.sin(2 * np.pi * data['month_num'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month_num'] / 12)

p2p_cols = [c for c in df.columns if 'p2p' in c.lower()]
p2m_cols = [c for c in df.columns if 'p2m' in c.lower()]
if p2p_cols and p2m_cols:
    # NOTE(review): this ratio uses the SAME month's P2P/P2M figures as the
    # target row; confirm they are known at forecast time, otherwise lag it.
    data['p2p_to_p2m'] = data[p2p_cols[0]] / (data[p2m_cols[0]] + 1e-6)  # epsilon avoids divide-by-zero

# Same target selection as the EDA cell, including its numeric-column fallback
# (previously an empty match list raised IndexError here).
total_cols = [c for c in df.columns if 'total' in c.lower() or 'value' in c.lower() or 'val' in c.lower()]
if total_cols:
    total_col = total_cols[0]
else:
    total_col = df.select_dtypes(include=[np.number]).columns[0]

previous_total = data[total_col].shift(1)
data['total_val_lag1'] = previous_total
data['total_val_lag2'] = data[total_col].shift(2)
# The rolling mean must be built from PAST values only. A window over the
# unshifted series includes the current month, and together with lag1/lag2 the
# target becomes exactly 3*rolling3 - lag1 - lag2 (target leakage).
data['total_val_rolling3'] = previous_total.rolling(window=3).mean()

# Drop only rows missing something the models actually consume; a blanket
# dropna() also discarded months because of NaNs in unrelated columns.
required_cols = [date_col, total_col, 'total_val_lag1', 'total_val_lag2', 'total_val_rolling3']
if 'p2p_to_p2m' in data.columns:
    required_cols.append('p2p_to_p2m')
data = data.dropna(subset=required_cols).reset_index(drop=True)
data.head()

In [None]:
# Cell 7 — Prepare features and target
# Core predictors, plus the P2P/P2M ratio when feature engineering produced it.
FEATURES = [
    'month_idx',
    'month_sin',
    'month_cos',
    'total_val_lag1',
    'total_val_lag2',
    'total_val_rolling3',
]
if 'p2p_to_p2m' in data.columns:
    FEATURES.append('p2p_to_p2m')

# Fail fast, naming the first engineered column that is absent.
first_missing = next((name for name in FEATURES if name not in data.columns), None)
if first_missing is not None:
    raise ValueError(f'Missing feature {first_missing} in engineered data. Check preprocessing.')

X = data[FEATURES].values
y = data[total_col].values

# Chronological 80/20 split: the earliest rows train, the latest rows test.
split_idx = int(len(data) * 0.8)
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

print('Train size:', X_train_s.shape, 'Test size:', X_test_s.shape)

In [None]:
# Cell 8 — Linear Regression
# Ordinary least squares on the standardised features; fit() returns the
# estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)

def metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Returns a dict with keys 'MAE' (mean absolute error), 'RMSE' (square root
    of the mean squared error) and 'R2' (coefficient of determination).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': math.sqrt(squared_error),
        'R2': r2_score(y_true, y_pred),
    }

print('Linear Regression:', metrics(y_test, y_pred_lr))

In [None]:
# Cell 9 — Random Forest
# 500-tree forest with a fixed seed so the fit is repeatable.
rf = RandomForestRegressor(n_estimators=500, random_state=42).fit(X_train_s, y_train)
y_pred_rf = rf.predict(X_test_s)
print('Random Forest:', metrics(y_test, y_pred_rf))

# List the features from most to least important according to the forest.
importances = rf.feature_importances_
ranked_features = sorted(
    zip(FEATURES, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature_name, weight in ranked_features:
    print(feature_name, '->', round(weight, 4))

In [None]:
# Cell 10 — XGBoost
# `eval_metric` is an estimator parameter: passing it to fit() was deprecated
# in XGBoost 1.6 and removed in 2.0, where it raises a TypeError. The
# unpinned pip install in the setup cell pulls a current release, so it must
# be supplied to the constructor instead.
xg_reg = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    eval_metric='rmse',
)
xg_reg.fit(X_train_s, y_train, verbose=False)
y_pred_xgb = xg_reg.predict(X_test_s)
print('XGBoost:', metrics(y_test, y_pred_xgb))

In [None]:
# Cell 11 — Actual vs Predicted Plot
# Label the x-axis with the held-out months formatted as YYYY-MM.
test_dates = data[date_col].iloc[split_idx:].dt.strftime('%Y-%m').tolist()

# One line per series, drawn in a fixed order so legend and colours are stable.
curves = [
    ('Actual', y_test),
    ('LinearReg', y_pred_lr),
    ('RandomForest', y_pred_rf),
    ('XGBoost', y_pred_xgb),
]

plt.figure(figsize=(12,6))
for curve_label, curve_values in curves:
    plt.plot(test_dates, curve_values, marker='o', label=curve_label)
plt.xticks(rotation=45)
plt.legend()
plt.title('Actual vs Predicted total_val')
plt.show()

In [None]:
# Cell 12 — Save models & scaler
# Persist the fitted scaler and each trained model under ./artifacts so they
# can be reloaded without retraining.
os.makedirs('artifacts', exist_ok=True)
artifacts_to_save = {
    'artifacts/scaler.pkl': scaler,
    'artifacts/linear_model.pkl': lr,
    'artifacts/rf_model.pkl': rf,
    'artifacts/xgb_model.pkl': xg_reg,
}
for artifact_path, fitted_object in artifacts_to_save.items():
    joblib.dump(fitted_object, artifact_path)
print('Saved artifacts to ./artifacts')

## LSTM (sequence) model - Optional
A small LSTM model that demonstrates sequence forecasting. With so little data, an LSTM is likely to overfit, so treat this section as a demonstration only.

In [None]:
# Cell 13 — Prepare sequences for LSTM
series = data[total_col].values

SEQ_LEN = 6  # months of history fed to the network for each prediction
n_sequences = len(series) - SEQ_LEN
if n_sequences < 2:
    # Fewer than two windows cannot fill both a train and a test set; fail here
    # with a clear message rather than later inside Keras.
    raise ValueError(
        f'Need at least {SEQ_LEN + 2} observations to build LSTM train/test '
        f'sequences, got {len(series)}.'
    )
split_seq = int(n_sequences * 0.8)

# Fit the scaler on the training period only. The training windows and their
# targets cover the first SEQ_LEN + split_seq observations; fitting on the
# whole series would leak the test period's min/max into training.
# Test-period values may therefore scale outside [0, 1], which is expected.
mms = MinMaxScaler()
mms.fit(series[:SEQ_LEN + split_seq].reshape(-1, 1))
series_scaled = mms.transform(series.reshape(-1, 1)).flatten()

# Sliding windows: each sample is SEQ_LEN consecutive values and its target is
# the value that immediately follows.
X_seq = np.array([series_scaled[i - SEQ_LEN:i] for i in range(SEQ_LEN, len(series_scaled))])
y_seq = series_scaled[SEQ_LEN:].copy()

# Chronological split, matching the tabular models.
X_seq_train, X_seq_test = X_seq[:split_seq], X_seq[split_seq:]
y_seq_train, y_seq_test = y_seq[:split_seq], y_seq[split_seq:]
print('LSTM sequences sizes:', X_seq_train.shape, X_seq_test.shape)

In [None]:
# Cell 14 — Train small LSTM
tf.random.set_seed(42)

# Single LSTM layer -> dropout -> one-unit regression head.
n_steps = X_seq_train.shape[1]
model = Sequential([
    LSTM(64, input_shape=(n_steps, 1)),
    Dropout(0.3),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Keras expects (samples, timesteps, features); each window has one feature.
train_inputs = X_seq_train.reshape(-1, SEQ_LEN, 1)
history = model.fit(
    train_inputs,
    y_seq_train,
    validation_split=0.1,
    epochs=200,
    batch_size=8,
    callbacks=[es],
    verbose=0,
)
print('LSTM trained. Best val loss:', min(history.history['val_loss']))

In [None]:
# Cell 15 — Evaluate LSTM
test_inputs = X_seq_test.reshape(-1, SEQ_LEN, 1)
y_seq_pred = model.predict(test_inputs).flatten()

# Undo the min-max scaling so errors are reported in the target's own units.
y_seq_test_inv, y_seq_pred_inv = (
    mms.inverse_transform(scaled.reshape(-1, 1)).flatten()
    for scaled in (y_seq_test, y_seq_pred)
)

print('LSTM metrics:', metrics(y_seq_test_inv, y_seq_pred_inv))

plt.figure(figsize=(10,5))
for curve_label, curve_values in (('Actual', y_seq_test_inv), ('Predicted', y_seq_pred_inv)):
    plt.plot(range(len(curve_values)), curve_values, label=curve_label)
plt.legend()
plt.title('LSTM Predictions vs Actual')
plt.show()

## Conclusion & Next Steps
- Summarize model comparison and suggest best model(s) for forecasting.
- Limitations: small sample size, need for external features like holidays and economic indicators.
- Next steps: hyperparameter tuning, time-series cross-validation, deploy best model as REST API.