In [9]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [10]:
# Location of the raw transactions CSV. Set the CREDIT_CARD_DATA environment
# variable to point at your own copy of the file; the fallback is the original
# hard-coded local path, so existing setups keep working unchanged.
DATA_PATH = Path(
    os.environ.get(
        "CREDIT_CARD_DATA",
        r"A:\GithuB\Credit Card Spending Analysis\data\Credit card transactions.csv",
    )
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,index,City,Date,Card Type,Exp Type,Gender,Amount
0,0,"Delhi, India",29-Oct-14,Gold,Bills,F,82475
1,1,"Greater Mumbai, India",22-Aug-14,Platinum,Bills,F,32555
2,2,"Bengaluru, India",27-Aug-14,Silver,Bills,F,101738
3,3,"Greater Mumbai, India",12-Apr-14,Signature,Bills,F,123424
4,4,"Bengaluru, India",5-May-15,Gold,Bills,F,171574


In [11]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(26052, 7)

In [12]:
# Column names, non-null counts and dtypes; 'Date' is still a plain object
# column at this point and is parsed in the cleaning section.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26052 entries, 0 to 26051
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      26052 non-null  int64 
 1   City       26052 non-null  object
 2   Date       26052 non-null  object
 3   Card Type  26052 non-null  object
 4   Exp Type   26052 non-null  object
 5   Gender     26052 non-null  object
 6   Amount     26052 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 1.4+ MB


In [13]:
# Summary statistics for the numeric columns ('index' and 'Amount').
df.describe()

Unnamed: 0,index,Amount
count,26052.0,26052.0
mean,13025.5,156411.537425
std,7520.708943,103063.254287
min,0.0,1005.0
25%,6512.75,77120.25
50%,13025.5,153106.5
75%,19538.25,228050.0
max,26051.0,998077.0


In [14]:
# Missing-value count per column in the raw data.
df.isna().sum()

index        0
City         0
Date         0
Card Type    0
Exp Type     0
Gender       0
Amount       0
dtype: int64

In [15]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

## Data Cleaning & Preprocessing

In [16]:
# Parse the 'Date' strings (e.g. "29-Oct-14": day, abbreviated month, 2-digit
# year) into datetimes. errors='coerce' turns any value that does not match
# the format into NaT instead of raising.
date_format = "%d-%b-%y"
df['Date'] = pd.to_datetime(df['Date'], format=date_format, errors='coerce')

In [17]:
# Strip leading/trailing whitespace from every categorical text column so that
# grouping and one-hot encoding see a single spelling per category.
# 'Gender' is included because the modelling section relies on the exact dummy
# column name 'Gender_M', which a stray space in the raw value would break.
for text_column in ['City', 'Card Type', 'Exp Type', 'Gender']:
    df[text_column] = df[text_column].str.strip()

In [18]:
# Re-check missing values after cleaning; a non-zero 'Date' count here would
# mean some date strings failed to parse and were coerced to NaT.
df.isna().sum()

index        0
City         0
Date         0
Card Type    0
Exp Type     0
Gender       0
Amount       0
dtype: int64

## Exploratory Data Analysis

In [19]:
import warnings
# Suppresses every warning for the rest of the session.
# NOTE(review): this also hides deprecation and model-fitting warnings raised
# by later cells; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [20]:
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Distribution of transaction amounts: 50-bin histogram with a KDE overlay.
amount_ax = sns.histplot(data=df, x='Amount', bins=50, kde=True)
amount_ax.set_title("Transaction Amount Distribution")
amount_ax.set_xlabel("Transaction Amount")
amount_ax.set_ylabel("Count of Transactions")
plt.show()

In [None]:
# Time series of transactions: tag each row with its calendar month, then
# total the amounts per month.
df['YearMonth'] = df['Date'].dt.to_period('M')
monthly_trend = df['Amount'].groupby(df['YearMonth']).sum()
monthly_trend

In [None]:
# Line chart of the monthly transaction totals.
trend_ax = monthly_trend.plot(title="Time Series of Transactions")
trend_ax.set_ylabel('Amount')
plt.show()

In [None]:
# Distinct card types present in the data.
df['Card Type'].unique()

In [None]:
# Card type usage: number of transactions per card type, most frequent first.
card_type_order = df['Card Type'].value_counts().index
card_type_ax = sns.countplot(data=df, x='Card Type', order=card_type_order)
card_type_ax.set_title("Card Type Usage pattern")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distinct expense categories present in the data.
df['Exp Type'].unique()

In [None]:
# Which expense types are credit cards used for? Transaction count per
# expense type, most frequent first.
exp_type_order = df['Exp Type'].value_counts().index
exp_type_ax = sns.countplot(data=df, x='Exp Type', order=exp_type_order)
exp_type_ax.set_title("Expense Type Usage Pattern")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Total amount spent, split by gender.
gender_usage = df['Amount'].groupby(df['Gender']).sum()
gender_usage

In [None]:
# Which gender uses the cards most often? Transaction count per gender,
# most frequent first.
gender_order = df['Gender'].value_counts().index
gender_ax = sns.countplot(data=df, x='Gender', order=gender_order)
gender_ax.set_title("Gender wise usage of Credit Card")
plt.xticks(rotation=45)
plt.show()

## Feature Engineering

In [None]:
# Calendar features derived from the parsed transaction date.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day
df['Weekday'] = date_parts.weekday  # Monday=0 ... Sunday=6

In [None]:
# Preview the frame with the newly added calendar columns.
df.head()

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column to avoid a redundant dummy.
categorical_columns = ['City', 'Card Type', 'Exp Type', 'Gender']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded.head()

## Aggregation Metrics

In [None]:
# Total spending per city, keeping only the five highest-spending cities.
city_totals = df.groupby('City')['Amount'].sum()
total_city_spend = city_totals.sort_values(ascending=False).head(5)
total_city_spend

In [None]:
# Bar chart of the five highest-spending cities.
plt.figure(figsize=(12, 5))
city_ax = total_city_spend.plot.bar(color='green')
city_ax.set_title("Top 5 City Total Spending")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of transactions per card type (counts, not amounts), most frequent first.
transaction_card_type = df['Card Type'].value_counts()
transaction_card_type

In [None]:
# Bar chart of transaction counts per card type.
count_ax = transaction_card_type.plot.bar(color='lightgreen')
count_ax.set_title("Count of Card Type")
plt.show()

In [None]:
# Total transaction amount per card type, largest first.
card_totals = df.groupby('Card Type')['Amount'].sum()
amount_by_card = card_totals.sort_values(ascending=False)
amount_by_card

In [None]:
# Bar chart of the total transaction amount per card type.
# Plots `amount_by_card` (summed amounts); the previous version plotted
# `transaction_card_type` (transaction counts) under this title, so the chart
# did not match what it claimed to show.
amount_by_card.plot(kind='bar', color='purple')
plt.title("Transaction Amount by Card Type")
plt.ylabel("Total Amount")
plt.xticks(rotation=45)
plt.show()

## Anomaly Detection

In [None]:
import numpy as np
from scipy import stats

In [None]:
# Z-score anomaly detection: flag transactions whose amount lies more than
# 3 standard deviations away from the mean amount.
raw_z = stats.zscore(df['Amount'])
z_score = np.abs(raw_z)
df['Anomaly'] = z_score > 3

In [None]:
# Scatter of amount over time, coloured by the anomaly flag.
plt.figure(figsize=(10, 5))
anomaly_ax = sns.scatterplot(data=df, x='Date', y='Amount', hue='Anomaly')
anomaly_ax.set_title("Transaction Amounts with Anomalies")
plt.show()

## Time Series Forecasting

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Monthly aggregation: total transaction amount per calendar month, indexed
# by month end. 'ME' is the month-end resampling alias; the older 'M' spelling
# is deprecated for resampling since pandas 2.2.
ts_data = df.set_index('Date').resample('ME')['Amount'].sum()

In [None]:
# Fit an ARIMA(p=1, d=1, q=1) model to the monthly totals.
arima_order = (1, 1, 1)
model = ARIMA(ts_data, order=arima_order)
results = model.fit()

In [None]:
# Forecast the six periods (months) following the end of the series.
forecast_steps = 6
forecast = results.forecast(steps=forecast_steps)
print(forecast)

In [None]:
# Historical monthly totals and the ARIMA forecast on the same axes.
history_ax = ts_data.plot(label='Historical')
forecast.plot(ax=history_ax, label='Forecast', linestyle='--')
history_ax.set_title("ARIMA Forecasting")
history_ax.legend()
plt.show()

## Machine Learning Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Build the feature matrix and target from the one-hot encoded frame.
# Columns excluded from X:
#   - 'Gender_M'                    : the target itself
#   - 'Amount', 'Date', 'YearMonth' : excluded by the original analysis
#                                     ('Date' is datetime, 'YearMonth' is a period)
#   - 'index'                       : row identifier from the CSV (0..N-1); it
#                                     carries file order, not transaction
#                                     information, so it must not be a feature
excluded_columns = ['Gender_M', 'Amount', 'Date', 'YearMonth', 'index']
X = df_encoded.drop(columns=excluded_columns)
y = df_encoded['Gender_M']  # Predicting if Male

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Train a random forest on the training split.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the reported metrics, are reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)