In [None]:
import numpy as np
import pandas as pd
# Load the transaction table (df1) and the second table (df2), which later
# cells join on 'user_id'.
df1 = pd.read_csv('merged-df.csv')
df2 = pd.read_csv('final_features_with_clusters.csv')

# Parse the dates and tag every transaction with its calendar month (Period[M]).
df1['date'] = pd.to_datetime(df1['date'])
df1['year_month'] = df1['date'].dt.to_period('M')

# Grouping keys shared by the income and expense aggregations.
month_keys = ['user_id', 'year_month']

# Mean of the strictly positive amounts per user and month.
income = (
    df1.loc[df1['amount'] > 0]
    .groupby(month_keys)['amount']
    .mean()
    .rename('mean_income')
    .reset_index()
)

# Mean of the strictly negative amounts per user and month (values stay negative).
expense = (
    df1.loc[df1['amount'] < 0]
    .groupby(month_keys)['amount']
    .mean()
    .rename('mean_expense')
    .reset_index()
)

# Outer join keeps months that have only income or only expense; the missing
# side is NaN at this point.
monthly = income.merge(expense, how='outer', on=month_keys)

# Convert the monthly Period back to a timestamp (start of the month).
monthly['year_month'] = monthly['year_month'].dt.to_timestamp()

In [None]:
# 'year_month' was already converted with .dt.to_timestamp() when `monthly`
# was built, so this is a defensive no-op that guarantees a datetime merge key.
monthly['year_month'] = pd.to_datetime(monthly['year_month'])

# Per user and calendar month: number of transactions (non-null
# transaction_id), sum of positive amounts and sum of negative amounts.
# The grouping Series is named 'date', so after reset_index() the monthly
# Period lives in a column called 'date'.
txn_monthly = (
    df1.groupby(['user_id', df1['date'].dt.to_period('M')])
    .agg(
        txn_count=('transaction_id', 'count'),
        total_income=('amount', lambda amounts: amounts[amounts > 0].sum()),
        total_expense=('amount', lambda amounts: amounts[amounts < 0].sum()),
    )
    .reset_index()
)

# Replace the Period column 'date' with a month-start timestamp 'year_month'
# (appended as the last column), then order rows chronologically per user so
# that the shift below looks at the preceding row of the same user.
txn_monthly = (
    txn_monthly
    .assign(year_month=txn_monthly['date'].dt.to_timestamp())
    .drop(columns=['date'])
    .sort_values(['user_id', 'year_month'])
)

# Lag features: each row receives the values of the user's previous row.
# NOTE(review): shift(1) moves by row, not by calendar month. If a user has a
# month with no transactions, "prev" refers to the last month that had any.
# Confirm this is the intended definition.
lag_sources = ['txn_count', 'total_income', 'total_expense']
lag_targets = [f'{column}_prev' for column in lag_sources]
txn_monthly[lag_targets] = txn_monthly.groupby('user_id')[lag_sources].shift(1)

# Attach the lag features to the monthly means and order the rows.
# fillna(0) applies to every column: the lag columns of each user's first
# month and any mean_income / mean_expense left NaN by the outer merge that
# built `monthly`.
monthly_enhanced = (
    monthly
    .merge(
        txn_monthly[['user_id', 'year_month', *lag_targets]],
        how='left',
        on=['user_id', 'year_month'],
    )
    .sort_values(['user_id', 'year_month'])
    .reset_index(drop=True)
    .fillna(0)
)

In [None]:
# Join df2's columns onto every (user, month) row and order the rows by user
# and month.
result = (
    monthly_enhanced
    .merge(df2, how='left', on='user_id')
    .sort_values(['user_id', 'year_month'])
    .reset_index(drop=True)
)

# Keep one decimal place for the two monthly mean columns.
mean_cols = ['mean_income', 'mean_expense']
result[mean_cols] = result[mean_cols].round(1)

In [None]:
# Reshape to long format: one row per (user, month, type), where the two mean
# columns are stacked into a single 'mean_amount' column.
# Only id_vars and value_vars survive the melt, so the df2 columns that
# `result` carries are discarded here and df2 is joined again below.
long_df = (
    result
    .melt(
        id_vars=[
            'user_id',
            'year_month',
            'txn_count_prev',
            'total_income_prev',
            'total_expense_prev',
        ],
        value_vars=['mean_income', 'mean_expense'],
        var_name='type',
        value_name='mean_amount',
    )
    # 'mean_income' -> 'income', 'mean_expense' -> 'expense'
    .assign(type=lambda frame: frame['type'].str.replace('mean_', ''))
    .sort_values(['user_id', 'year_month', 'type'])
    .reset_index(drop=True)
)

# Re-attach df2's columns to the long table.
final_df = long_df.merge(df2, how='left', on='user_id')

In [None]:
# Columns removed before export. Note that 'year_month' and 'type' are dropped
# too, so in the written file income and expense rows are told apart only by
# row order (the long table was sorted by user, month, type).
# NOTE(review): confirm the consumer of sequence.csv relies on that ordering.
cols_to_drop = [
    'year_month', 'type',
    'total_txn_count', 'total_amount', 'avg_amount', 'std_amount',
    'median_amount', 'first_txn', 'last_txn',
    'user_birth_year', 'user_retirement_age', 'trend_category'
]

# errors='ignore': names in cols_to_drop that are absent from final_df are
# skipped silently instead of raising.
final_df = final_df.drop(columns=cols_to_drop, errors='ignore')

# Integer codes for the two categorical columns.
gender_codes = {'Male': 0, 'Female': 1}
risk_codes = {'Low': 0, 'Medium': 1, 'High': 2}

# Series.map with a fallback is used instead of Series.replace(dict):
# replacing strings with ints on an object column relies on implicit
# downcasting, which pandas 2.2 deprecates (FutureWarning) and later versions
# drop, making the resulting dtype version-dependent. map() infers the dtype
# explicitly. The `.get(label, label)` fallback keeps unmapped values and NaN
# unchanged, exactly as replace() did, and makes re-running the cell harmless.
final_df['user_gender'] = final_df['user_gender'].map(
    lambda label: gender_codes.get(label, label)
)
final_df['risk_level'] = final_df['risk_level'].map(
    lambda label: risk_codes.get(label, label)
)

# Round every numeric column to one decimal place and write the result.
final_df = final_df.round(1)
final_df.to_csv('sequence.csv', index=False)