
# SEGMENTATION


In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the transaction extract; the path is relative to the notebook's working directory.
data = pd.read_csv('simulated_transaction_2024.csv')
# Preview the first rows to check the columns parsed as expected.
data.head()

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
0,01/01/2023,00:00,678330503.0,2971.0,1584.0,,Westport Care Home
1,01/01/2023,00:00,472213568.0,3792.0,1950.0,,Barbiee Boutique
2,01/01/2023,00:00,472213568.0,3012.0,-780.0,283027736.0,
3,01/01/2023,00:00,283027736.0,1787.0,780.0,472213568.0,
4,01/01/2023,00:00,624500124.0,3226.0,1825.0,,Fat Face


In [2]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230596 entries, 0 to 230595
Data columns (total 7 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Date                    230367 non-null  object 
 1   Timestamp               230345 non-null  object 
 2   Account No              230372 non-null  float64
 3   Balance                 230350 non-null  float64
 4   Amount                  230387 non-null  float64
 5   Third Party Account No  6832 non-null    float64
 6   Third Party Name        223517 non-null  object 
dtypes: float64(4), object(3)
memory usage: 12.3+ MB


In [3]:
# (rows, columns) of the raw frame.
data.shape 

(230596, 7)

In [4]:
# Column labels as loaded from the CSV.
data.columns

Index(['Date', 'Timestamp', 'Account No', 'Balance', 'Amount',
       'Third Party Account No', 'Third Party Name'],
      dtype='object')

In [5]:
# Per-column dtypes; 'Date' and 'Timestamp' are still plain text (object) at this point.
data.dtypes

Date                       object
Timestamp                  object
Account No                float64
Balance                   float64
Amount                    float64
Third Party Account No    float64
Third Party Name           object
dtype: object

In [6]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,Account No,Balance,Amount,Third Party Account No
count,230372.0,230350.0,230387.0,6832.0
mean,550599300.0,3609.351057,6.437717,537026300.0
std,256061000.0,5379.331174,508.200101,259692700.0
min,101531300.0,-5790.9835,-12240.0,101908500.0
25%,331450200.0,373.453932,-74.0,311467900.0
50%,550169800.0,1686.00985,-28.7,538149100.0
75%,768271800.0,5215.031852,-12.06,748509700.0
max,999752700.0,46804.158837,12240.0,999752700.0


In [7]:
# Missing values per column in the raw frame.
data.isnull().sum()

Date                         229
Timestamp                    251
Account No                   224
Balance                      246
Amount                       209
Third Party Account No    223764
Third Party Name            7079
dtype: int64

In [11]:
# Distinct values per column.
# NOTE(review): this cell's execution count (In [11]) is later than the cells that follow
# it, and its saved output already lists the merged 'Third Party' column. On a fresh
# top-to-bottom run it would report the original columns instead.
data.nunique()

Date              340
Timestamp        1401
Account No        976
Balance        229888
Amount          17314
Third Party       756
dtype: int64

In [8]:
# Store the counterparty account number as pandas' nullable Int64 so that
# missing entries stay <NA> instead of forcing a float column.
data = data.astype({'Third Party Account No': 'Int64'})

In [9]:
# Build a single counterparty column: use the third-party name where it is present,
# otherwise fall back to the third-party account number.
data['Third Party'] = data['Third Party Name'].combine_first(data['Third Party Account No'])

In [10]:
# Both source columns are now folded into 'Third Party'; keep only the merged column.
data = data.drop(columns=['Third Party Account No', 'Third Party Name'])

In [12]:
# Missing values per column after merging the two third-party columns.
data.isnull().sum()

Date           229
Timestamp      251
Account No     224
Balance        246
Amount         209
Third Party    247
dtype: int64

In [13]:
# Discard every row that has at least one missing field, then confirm no nulls remain.
data = data.dropna(axis=0, how='any')
print(data.isna().sum())

Date           0
Timestamp      0
Account No     0
Balance        0
Amount         0
Third Party    0
dtype: int64


In [14]:
# Split by sign of 'Amount': negative rows are debits, positive rows are credits
# (zero-amount rows belong to neither frame).
debits = data.loc[data['Amount'].lt(0)]
credits = data.loc[data['Amount'].gt(0)]

In [15]:
# Round the monetary columns to the nearest whole unit and store them as integers.
for money_col in ('Balance', 'Amount'):
    data[money_col] = data[money_col].round().astype(int)

In [17]:
# 'Account No' was loaded as float64; convert it to an integer identifier.
data['Account No'] = data['Account No'].round().astype(int)

In [19]:
# Display the cleaned frame (pandas truncates the rendering of long frames).
data

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party
0,01/01/2023,00:00,678330503,2971,1584,Westport Care Home
1,01/01/2023,00:00,472213568,3792,1950,Barbiee Boutique
2,01/01/2023,00:00,472213568,3012,-780,283027736
3,01/01/2023,00:00,283027736,1787,780,472213568
4,01/01/2023,00:00,624500124,3226,1825,Fat Face
...,...,...,...,...,...,...
230591,06/12/2023,20:54,581655972,45935,-41,Tesco
230592,06/12/2023,20:55,786141370,-245,-62,Sainsbury Local
230593,06/12/2023,21:05,824916823,9709,-33,Deliveroo
230594,06/12/2023,21:13,366550080,26834,-19,Amazon


# Segmentation by Transaction Amount

In [None]:
# Segment transactions by size, ignoring direction. 'Amount' is signed (debits are
# negative), so binning the raw value would place every debit - however large - in 'Low'.
# Bins are right-closed: Low = [0, 25], Medium = (25, 50], High = (50, inf).
amount_bins = [0, 25, 50, float('inf')]
amount_labels = ['Low', 'Medium', 'High']

# Segment transactions based on the absolute Amount
data['Amount_Segment'] = pd.cut(
    data['Amount'].abs(),
    bins=amount_bins,
    labels=amount_labels,
    include_lowest=True,  # keep zero-amount rows in 'Low' rather than NaN
)
print(data['Amount_Segment'].value_counts())


# Transaction Frequency Segmentation

In [None]:
# Transactions per account, indexed by account number.
transaction_freq = data['Account No'].value_counts()

# Cut points at the 33rd and 66th percentiles of the per-account counts.
lower_cut, upper_cut = transaction_freq.quantile([0.33, 0.66])
freq_thresholds = [0, lower_cut, upper_cut, float('inf')]
freq_labels = ['Low', 'Medium', 'High']

# Tag each transaction with the frequency band of the account it belongs to.
txns_for_account = data['Account No'].map(transaction_freq)
data['Freq_Segment'] = pd.cut(txns_for_account, bins=freq_thresholds, labels=freq_labels)
print(data['Freq_Segment'].value_counts())

In [None]:
## monthly

In [None]:
# `outlier_transactions` is not created by any earlier cell, so on a fresh kernel this
# cell raised NameError. Build it here when absent: rows whose 'Amount' lies more than
# 3 standard deviations from the mean amount.
if 'outlier_transactions' not in globals():
    amount_deviation = (data['Amount'] - data['Amount'].mean()).abs()
    outlier_transactions = data[amount_deviation > 3 * data['Amount'].std()]

# Count the number of outlier transactions for each account
account_outlier_counts = outlier_transactions['Account No'].value_counts()

# Two-column frame: one row per account with its outlier count
outlier_accounts = (
    account_outlier_counts
    .rename_axis('Account No')
    .reset_index(name='Outlier Count')
)

# Accounts whose outlier count is above the 95th percentile of all outlier counts
outlier_cutoff = outlier_accounts['Outlier Count'].quantile(0.95)
high_outlier_accounts = outlier_accounts[outlier_accounts['Outlier Count'] > outlier_cutoff]

# Display the first rows and summary statistics of the high-outlier accounts
high_outlier_accounts.head(), high_outlier_accounts.describe()


In [None]:
# Number of distinct counterparties (names and account numbers combined).
data['Third Party'].nunique()

In [None]:
# The ten counterparties with the most transactions.
top_parties = data['Third Party'].value_counts().head(10)

# Horizontal bar chart, one bar per counterparty.
fig, ax = plt.subplots(figsize=(7, 3))
sns.barplot(x=top_parties.values, y=top_parties.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Third Parties by Transaction Frequency', fontsize=16)
ax.set_xlabel('Number of Transactions', fontsize=14)
ax.set_ylabel('Third Party', fontsize=14)
ax.grid(True, axis='x')
plt.show()

# Temporal analysis:

In [None]:
## EDA

# No earlier cell creates 'Datetime', so reading it raised KeyError. Build it from the
# text columns 'Date' (dd/mm/yyyy) and 'Timestamp' (HH:MM); if it already exists,
# just make sure it is a datetime column.
if 'Datetime' in data.columns:
    data['Datetime'] = pd.to_datetime(data['Datetime'])
else:
    data['Datetime'] = pd.to_datetime(
        data['Date'] + ' ' + data['Timestamp'], format='%d/%m/%Y %H:%M'
    )

# 'Transaction Type' is likewise not created anywhere; derive it from the sign of
# 'Amount' when absent (negative = debit, otherwise credit).
if 'Transaction Type' not in data.columns:
    data['Transaction Type'] = np.where(data['Amount'] < 0, 'Debit', 'Credit')

# Descriptive statistics
print(data.describe())
print(data['Transaction Type'].value_counts())

# Missing values check
print(data.isnull().sum())

# New features
data['Day of Week'] = data['Datetime'].dt.day_name()
data['Hour of Day'] = data['Datetime'].dt.hour

# Transaction trends
plt.figure(figsize=(14, 7))
sns.lineplot(x='Datetime', y='Amount', data=data)
plt.title('Transaction Amount Over Time')
plt.xlabel('Datetime')
plt.ylabel('Amount')
plt.show()

# Distribution of transactions
plt.figure(figsize=(10, 6))
sns.histplot(data['Amount'], bins=30, kde=True)
plt.title('Distribution of Transaction Amounts')
plt.xlabel('Transaction Amount')
plt.show()

# Balance over time for a specific account (the first account number encountered)
sample_account = data[data['Account No'] == data['Account No'].unique()[0]]
plt.figure(figsize=(14, 7))
plt.plot(sample_account['Datetime'], sample_account['Balance'], marker='o')
plt.title('Balance Over Time for Account No: {}'.format(sample_account['Account No'].iloc[0]))
plt.xlabel('Datetime')
plt.ylabel('Balance')
plt.grid(True)
plt.show()

# Correlation matrix
corr = data[['Balance', 'Amount']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Inspect the frame after the EDA feature columns were added.
data

In [None]:
# Make sure a datetime 'Datetime' column exists: build it from the text columns
# 'Date' (dd/mm/yyyy) and 'Timestamp' (HH:MM) when it is missing, so this cell does not
# depend on hidden kernel state.
if 'Datetime' in data.columns:
    data['Datetime'] = pd.to_datetime(data['Datetime'])
else:
    data['Datetime'] = pd.to_datetime(
        data['Date'] + ' ' + data['Timestamp'], format='%d/%m/%Y %H:%M'
    )

# Create time-based features
data['Hour'] = data['Datetime'].dt.hour
data['Day of Week'] = data['Datetime'].dt.day_name()
data['Month'] = data['Datetime'].dt.month

# Plot transactions over time to identify peaks
plt.figure(figsize=(12, 6))
sns.lineplot(x='Hour', y='Amount', data=data, estimator=sum, ci=None)
plt.title('Total Transaction Amount by Hour of Day')
plt.show()

# Analyzing transactions by day of week. Bars are drawn in calendar order, and the
# bootstrap confidence interval is disabled (as in the plot above): it is not
# meaningful for a sum and is slow on this many rows.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(12, 6))
sns.barplot(x='Day of Week', y='Amount', data=data, estimator=sum, ci=None, order=weekday_order)
plt.title('Total Transaction Amount by Day of Week')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Net amount per calendar month (requires the datetime 'Datetime' column).
data['Month-Year'] = data['Datetime'].dt.to_period('M')
monthly_data = data.groupby('Month-Year')['Amount'].sum()

# Line chart of the monthly totals.
fig, ax = plt.subplots(figsize=(14, 7))
monthly_data.plot(ax=ax, title='Monthly Transaction Volumes Over Years')
ax.set_xlabel('Month-Year')
ax.set_ylabel('Total Transaction Volume')
ax.grid(True)
plt.show()

# Month x year grid of summed amounts for the heatmap.
data['Year'] = data['Datetime'].dt.year
data['Month'] = data['Datetime'].dt.month
pivot_table = data.pivot_table(index='Month', columns='Year', values='Amount', aggfunc='sum')

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot_table, annot=True, fmt=".0f", linewidths=.5, cmap='Blues', ax=ax)
ax.set_title('Heatmap of Transaction Volumes by Month and Year')
ax.set_xlabel('Year')
ax.set_ylabel('Month')
plt.show()


In [None]:
## marketing

In [None]:
# Split the merged 'Third Party' column back into a name (text values) and an account
# number (integer values). Account numbers may be held as NumPy integers rather than
# built-in ints, which a bare isinstance(x, int) check does not match, so accept both.
data['Third Party Name'] = data['Third Party'].apply(lambda x: x if isinstance(x, str) else None)
data['Third Party Account No'] = data['Third Party'].apply(
    lambda x: x if isinstance(x, (int, np.integer)) else None
)

In [None]:
# Upper-case the third-party names and strip surrounding whitespace.
data['Third Party Name'] = data['Third Party Name'].str.upper().str.strip()

In [None]:
# Ten most frequent named third parties (account-number-only rows have no name and are not counted).
frequent_parties = data['Third Party Name'].value_counts().head(10)

In [None]:
# Show the counts computed in the previous cell.
frequent_parties

In [None]:
# Per third-party name: total ('sum') and number ('count') of transaction amounts.
pivot_data = data.pivot_table(values='Amount', index='Third Party Name', aggfunc=['sum', 'count'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the most frequent named third parties by transaction count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=frequent_parties.values, y=frequent_parties.index, ax=ax)
ax.set_title('Most Frequent Third Parties')
ax.set_xlabel('Number of Transactions')
ax.set_ylabel('Third Party')
plt.show()

In [None]:
import pandas as pd

# Assuming 'data' is your DataFrame already loaded from previous analysis or an external file

# Define the top third parties from your provided list
frequent_parties = ['Tesco', 'Sports Direct', 'Topshop', 'Amazon', 'Fat Face', 
                    'PureGym', 'Matalan', 'Netflix', 'JustEat', 'Deliveroo']

# Keep only transactions with these third parties. .copy() makes `top_data` an
# independent frame: columns are added to it further down this cell, and writing into a
# filtered slice of `data` would trigger SettingWithCopyWarning.
top_data = data[data['Third Party'].isin(frequent_parties)].copy()

# Customer Profiling
# Define a function to create profiles based on transactions with specific third parties
def identify_profile(third_party):
    """Return the marketing profile label associated with a third party.

    The value is compared against each merchant group in order (exact,
    case-sensitive match); anything not listed maps to 'General Shopper'.
    """
    profile_rules = (
        (('Amazon',), 'Tech-Savvy Shopper'),
        (('PureGym',), 'Fitness Enthusiast'),
        (('Sports Direct',), 'Sports Equipment Lover'),
        (('Tesco', 'Topshop', 'Fat Face', 'Matalan'), 'Regular Retail Shopper'),
        (('Netflix', 'JustEat', 'Deliveroo'), 'Home Entertainment Lover'),
    )
    for merchants, profile in profile_rules:
        if third_party in merchants:
            return profile
    return 'General Shopper'

# Lookup tables keyed by customer profile.
# Personalized Communication: marketing message per profile
personalized_messages = {
    'Tech-Savvy Shopper': "Explore the latest tech gadgets on Amazon with our exclusive card rewards!",
    'Fitness Enthusiast': "Stay fit and healthy! Enjoy special offers from PureGym just for you.",
    'Sports Equipment Lover': "Get ready to elevate your game with the best deals from Sports Direct!",
    'Regular Retail Shopper': "Discover the best shopping deals and enjoy exclusive rewards at Tesco and more!",
    'Home Entertainment Lover': "Enhance your home entertainment with exclusive offers from Netflix, JustEat, and Deliveroo!"
}

# Product Customization: suggested card product per profile
product_suggestions = {
    'Tech-Savvy Shopper': 'High-Tech Rewards Card',
    'Fitness Enthusiast': 'Fitness Fanatic Credit Card',
    'Sports Equipment Lover': 'Sports Rewards Card',
    'Regular Retail Shopper': 'Premium Retail Rewards Card',
    'Home Entertainment Lover': 'Entertainment Rewards Card'
}

# Label every transaction with its profile, then look up the message and product.
top_data['Customer Profile'] = top_data['Third Party'].map(identify_profile)
profile_column = top_data['Customer Profile']
top_data['Personalized Message'] = profile_column.map(personalized_messages)
top_data['Suggested Product'] = profile_column.map(product_suggestions)

# Display the enhanced dataset with profiles, messages, and product suggestions
report_columns = ['Account No', 'Third Party', 'Customer Profile', 'Personalized Message', 'Suggested Product']
print(top_data[report_columns])


In [None]:
import pandas as pd

# Assuming 'data' is your DataFrame already loaded from previous analysis

# Restrict to the ten third parties of interest.
frequent_parties = ['Tesco', 'Sports Direct', 'Topshop', 'Amazon', 'Fat Face', 
                    'PureGym', 'Matalan', 'Netflix', 'JustEat', 'Deliveroo']
is_top_party = data['Third Party'].isin(frequent_parties)
top_data = data[is_top_party]

# Per third party: transaction count, summed amount, and distinct accounts.
transaction_frequencies = top_data['Third Party'].value_counts()
grouped_by_party = top_data.groupby('Third Party')
total_spent = grouped_by_party['Amount'].sum()
unique_users = grouped_by_party['Account No'].nunique()

# One row per third party, one column per metric.
third_party_analysis = pd.DataFrame({
    'Frequency of Transactions': transaction_frequencies,
    'Total Amount Spent': total_spent,
    'Unique Users': unique_users
})

# Print the combined analysis
print(third_party_analysis)


### Analysis
Transaction Frequencies: This metric shows how many transactions each third party has, which helps identify the most frequented businesses by your customers.

Total Amount Spent: Knowing the total amount spent can indicate the financial significance of each third party in your customers' spending habits.

Unique Users: The number of unique users transacting with each third party highlights how widely used a third party is among your customer base, which can be crucial for targeted marketing strategies.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Requires `top_data` (transactions filtered to the top third parties).

# Per third party: transaction count, summed amount, and distinct accounts.
transaction_frequencies = top_data['Third Party'].value_counts()
grouped_by_party = top_data.groupby('Third Party')
total_spent = grouped_by_party['Amount'].sum()
unique_users = grouped_by_party['Account No'].nunique()

# Metrics table, busiest third party first so the bars read top-down.
third_party_metrics = pd.DataFrame({
    'Transaction Frequencies': transaction_frequencies,
    'Total Amount Spent': total_spent,
    'Unique Users': unique_users
}).sort_values(by='Transaction Frequencies', ascending=False)

# One panel per metric: (column and title, x-axis label, y-axis label).
panels = [
    ('Transaction Frequencies', 'Number of Transactions', 'Third Party'),
    ('Total Amount Spent', 'Total Spent ($)', ''),
    ('Unique Users', 'Number of Unique Users', ''),
]

plt.figure(figsize=(14, 7))
for position, (metric, x_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=third_party_metrics[metric], y=third_party_metrics.index, palette='viridis')
    plt.title(metric)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Requires `top_data` (transactions filtered to the top third parties).

# Per third party: transaction count, summed amount, and distinct accounts.
transaction_frequencies = top_data['Third Party'].value_counts()
grouped_by_party = top_data.groupby('Third Party')
total_spent = grouped_by_party['Amount'].sum()
unique_users = grouped_by_party['Account No'].nunique()

analysis_df = pd.DataFrame({
    'Transaction Frequencies': transaction_frequencies,
    'Total Amount Spent': total_spent,
    'Unique Users': unique_users
})

# Keep at most the ten third parties with the most transactions.
top_ten_third_parties = analysis_df.nlargest(n=10, columns='Transaction Frequencies')

# Pairwise scatter plots of the three metrics, with KDEs on the diagonal.
scatter_style = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'}
sns.pairplot(top_ten_third_parties, kind='scatter', diag_kind='kde', plot_kws=scatter_style)
plt.suptitle('Pairplot of Transaction Metrics for Top Ten Third Parties')
plt.show()

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Features used for clustering. .copy() gives an independent frame so the cluster
# labels can be attached below without writing into a slice of `data`.
feature_cols = ['Balance', 'Amount']
X = data[feature_cols].dropna().copy()

# NOTE(review): the features are used unscaled. describe() reports a much larger spread
# for Balance than for Amount, so Balance will dominate the distance computation -
# consider standardising both columns before fitting.

# Initialize KMeans with three clusters (fixed seed for reproducible labels)
kmeans = KMeans(n_clusters=3, random_state=0)
# Fit on the feature columns only and store each row's cluster index
X['Cluster'] = kmeans.fit_predict(X[feature_cols])

# Attach the labels by index-aligned assignment instead of join(..., rsuffix=...):
# re-running the cell now overwrites 'Cluster' rather than adding a stale
# 'Cluster_cluster' column while the summary keeps reading the old labels.
data['Cluster'] = X['Cluster']

# Analyze the characteristics of each cluster
cluster_summary = data.groupby('Cluster').agg({
    'Balance': ['mean', 'std'],
    'Amount': ['mean', 'std', 'count']
})
print(cluster_summary)

# Visualize the clusters
sns.scatterplot(x='Balance', y='Amount', hue='Cluster', data=X, palette='viridis')
plt.title('Customer Segmentation by Balance and Transaction Amount')
plt.xlabel('Balance')
plt.ylabel('Amount')
plt.legend(title='Cluster')
plt.show()


In [None]:
# Inspect the frame after the 'Cluster' labels were attached.
data

## Transactions that are several standard deviations away from the mean might be considered suspicious:
- Here a transaction is flagged as an outlier when its amount is more than 3 standard deviations away from the mean amount.

In [None]:
# Calculate the mean and standard deviation of the transaction amounts
mean_amount = data['Amount'].mean()
std_amount = data['Amount'].std()

# Identify outliers as transactions that are more than 3 standard deviations from the mean
outliers = data[np.abs(data['Amount'] - mean_amount) > 3 * std_amount]

# Display the key fields. 'Datetime' and 'Transaction Type' exist only if an earlier
# cell derived them, so show whichever of the wanted columns are present instead of
# raising KeyError.
wanted_cols = ['Datetime', 'Amount', 'Account No', 'Third Party', 'Transaction Type']
outliers[[col for col in wanted_cols if col in outliers.columns]]

## Transactions occurring repeatedly at the same time to the same third party, which could indicate automated or scripted fraud
- Flagged when the same account, third party and timestamp combination occurs more than 5 times

In [None]:
# Count how often each (account, third party, time-of-day) combination occurs.
# 'Timestamp' is the HH:MM value, so matches are to the minute.
pattern_keys = ['Account No', 'Third Party', 'Timestamp']
repetitive_patterns = (
    data.groupby(pattern_keys)
    .size()
    .rename('Count')
    .reset_index()
)

# Keep combinations seen more than 5 times.
suspicious_patterns = repetitive_patterns.loc[repetitive_patterns['Count'].gt(5)]

# Display suspicious repetitive patterns
suspicious_patterns


## Visualisations

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 'Transaction Direction' is not created by any earlier cell; use it if present,
# otherwise derive it from the sign of 'Amount' (negative = debit, otherwise credit).
direction = outliers.get('Transaction Direction')
if direction is None:
    direction = pd.Series(
        np.where(outliers['Amount'] < 0, 'Debit', 'Credit'),
        index=outliers.index,
        name='Transaction Direction',
    )

# 'Date' is dd/mm/yyyy text; parse it so the x-axis is a chronological time axis
# rather than one categorical tick per distinct string.
outlier_dates = pd.to_datetime(outliers['Date'], dayfirst=True)

# Plot transaction amounts for outliers
plt.figure(figsize=(10, 6))
sns.scatterplot(x=outlier_dates, y=outliers['Amount'], hue=direction)
plt.title('Outlier Transactions Over Time')
plt.xlabel('Date')
plt.ylabel('Transaction Amount')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 'Date' is dd/mm/yyyy text; parse it for a chronological x-axis and to get the weekday.
txn_dates = pd.to_datetime(data['Date'], dayfirst=True)
weekday = txn_dates.dt.day_name().rename('Day of Week')

# 'Transaction Direction' is not created by any earlier cell; use it if present,
# otherwise derive it from the sign of 'Amount' (negative = debit, otherwise credit).
direction = data.get('Transaction Direction')
if direction is None:
    direction = pd.Series(
        np.where(data['Amount'] < 0, 'Debit', 'Credit'),
        index=data.index,
        name='Transaction Direction',
    )

# Plot transaction amounts for potential outliers
plt.figure(figsize=(10, 6))
sns.scatterplot(x=txn_dates, y=data['Amount'], hue=direction, style=weekday)
plt.title('Transaction Amounts Over Time')
plt.xlabel('Date')
plt.ylabel('Transaction Amount')
plt.show()


In [None]:
# 'Transaction Direction' is not created by any earlier cell; use it if present,
# otherwise derive it from the sign of 'Amount' (negative = debit, otherwise credit).
direction = data.get('Transaction Direction')
if direction is None:
    direction = pd.Series(
        np.where(data['Amount'] < 0, 'Debit', 'Credit'),
        index=data.index,
        name='Transaction Direction',
    )

# Create pairplot for selected columns to visualize relationships and distributions
pairplot_data = data[['Amount', 'Balance']].assign(**{'Transaction Direction': direction})
sns.pairplot(pairplot_data, hue='Transaction Direction', plot_kws={'alpha': 0.5})
plt.suptitle('Pairplot of Transactions', size=16, y=1.02)
plt.show()

In [None]:
##

In [None]:
# Individual scatter plots instead of a pairplot, to avoid issues with KDE and
# non-numeric data types.

# 'Transaction Direction' is not created by any earlier cell; use it if present,
# otherwise derive it from the sign of 'Amount' (negative = debit, otherwise credit).
direction = data.get('Transaction Direction')
if direction is None:
    direction = pd.Series(
        np.where(data['Amount'] < 0, 'Debit', 'Credit'),
        index=data.index,
        name='Transaction Direction',
    )

# The second panel is labelled "Hour of Transaction", so plot the numeric hour parsed
# from the HH:MM 'Timestamp' text instead of the raw strings.
txn_hour = pd.to_datetime(data['Timestamp'], format='%H:%M').dt.hour

fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# Scatter plot of Amount vs. Balance
sns.scatterplot(x=data['Amount'], y=data['Balance'], hue=direction, ax=axs[0], alpha=0.5)
axs[0].set_title('Scatter Plot of Amount vs. Balance')
axs[0].set_xlabel('Amount')
axs[0].set_ylabel('Balance')

# Scatter plot of Amount vs. Hour of Transaction
sns.scatterplot(x=data['Amount'], y=txn_hour, hue=direction, ax=axs[1], alpha=0.5)
axs[1].set_title('Scatter Plot of Amount vs. Hour of Transaction')
axs[1].set_xlabel('Amount')
axs[1].set_ylabel('Hour of Transaction')

plt.tight_layout()
plt.show()

## Monthly Banking Activity: Insights from Total Spent, Total Credited, and Transaction Count

In [None]:

# Monthly statistics per account.
# 'Date' is dd/mm/yyyy text, which pd.Grouper(freq=...) cannot bin (TypeError), so parse
# it here and group on the resulting monthly period directly. to_datetime leaves an
# already-parsed datetime column unchanged, so the cell works either way.
txn_month = pd.to_datetime(data['Date'], dayfirst=True).dt.to_period('M')

monthly_data = (
    data.assign(
        Date=txn_month,                              # 'Year-Month' period of the transaction
        _spent=data['Amount'].clip(upper=0),         # debits only (credits contribute 0)
        _credited=data['Amount'].clip(lower=0),      # credits only (debits contribute 0)
    )
    .groupby(['Account No', 'Date'])
    .agg(
        Total_Spent=pd.NamedAgg(column='_spent', aggfunc='sum'),
        Total_Credited=pd.NamedAgg(column='_credited', aggfunc='sum'),
        Transaction_Count=pd.NamedAgg(column='Amount', aggfunc='count'),
    )
    .reset_index()
)

# Display the first few rows of the monthly statistics
monthly_data.head(5)


In [None]:
# (account-month rows, columns) of the monthly statistics.
monthly_data.shape

In [None]:
# Number of distinct accounts, as a one-element shape tuple.
data['Account No'].unique().shape

## Checking whether every user transacts in every month

In [None]:
# For every account, list the calendar months (within the dataset's overall span) in
# which it has no transactions.

# 'Date' is dd/mm/yyyy text: min()/max() on the strings would compare alphabetically
# and `.dt` would fail, so work from a parsed monthly period instead.
txn_month = pd.to_datetime(data['Date'], dayfirst=True).dt.to_period('M')

# Full range of months covered by the dataset
full_date_range = pd.period_range(txn_month.min(), txn_month.max(), freq='M')

# Missing months for each account (only accounts with at least one gap are stored)
missing_months_dict = {}
for account, account_months in txn_month.groupby(data['Account No']):
    present_months = account_months.unique()
    missing_months = full_date_range[~full_date_range.isin(present_months)]
    if len(missing_months) > 0:
        missing_months_dict[account] = missing_months

# Flatten to one (account, missing month) pair per row
missing_months_list = [
    (account, month)
    for account, missing_months in missing_months_dict.items()
    for month in missing_months
]

# Create a DataFrame from the list
missing_months_df = pd.DataFrame(missing_months_list, columns=['Account No', 'Missing Month'])

# Display the DataFrame
print(missing_months_df)


In [None]:
## TODO: number of transactions on each day of the week (Sun-Sat)
# TODO: look for trends across days
## TODO: each account's status at the end of the month - whether it is in debt or in credit

In [None]:
# Inspect the current state of the frame before the fraud-detection rules.
data

## fraud detection using transactional data

In [None]:
# Full transaction datetime. 'Timestamp' is HH:MM text unless it has already been
# converted, and `.dt.floor` needs real datetimes, so build a local datetime series
# without overwriting the original columns.
if pd.api.types.is_datetime64_any_dtype(data['Timestamp']):
    txn_time = data['Timestamp']
else:
    txn_day = pd.to_datetime(data['Date'], dayfirst=True).dt.strftime('%Y-%m-%d')
    txn_time = pd.to_datetime(txn_day + ' ' + data['Timestamp'].astype(str), errors='coerce')

# 'Transaction Direction' is not created by any earlier cell; use it if present,
# otherwise derive it from the sign of 'Amount' (negative = debit, otherwise credit).
direction = data.get('Transaction Direction')
if direction is None:
    direction = pd.Series(
        np.where(data['Amount'] < 0, 'Debit', 'Credit'),
        index=data.index,
        name='Transaction Direction',
    )

# Calculate mean and standard deviation for each account without storing them as columns
mean_amounts = data.groupby('Account No')['Amount'].transform('mean')
std_amounts = data.groupby('Account No')['Amount'].transform('std')

# Identify high-value transactions: more than 3 standard deviations above the account's mean
high_value = data['Amount'] > (mean_amounts + 3 * std_amounts)

# Detect frequent transactions in short times: more than 3 by the same account in the same minute
frequent_transactions = data.groupby(['Account No', txn_time.dt.floor('min')])['Amount'].transform('size') > 3

# Check for transactions that are round figures (non-zero multiples of 1000);
# zero is excluded because 0 % 1000 == 0 would otherwise flag every zero-amount row.
round_figure_transaction = (data['Amount'].mod(1000) == 0) & (data['Amount'] != 0)

# Combine the criteria to filter potential fraudulent transactions. assign() returns a
# new frame carrying the datetime 'Timestamp' and the direction label, which the
# plotting cells that follow read.
suspicious_transactions = data[high_value | frequent_transactions | round_figure_transaction].assign(
    **{'Timestamp': txn_time, 'Transaction Direction': direction}
)

# Display the suspicious transactions
print(suspicious_transactions[['Date', 'Timestamp', 'Account No', 'Amount', 'Third Party', 'Transaction Direction']])


In [None]:
# Rich display of the flagged transactions.
suspicious_transactions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming suspicious_transactions is already defined
# and data['Timestamp'] has been created and is the datetime combination of 'Date' and 'Timestamp'

# 1. Flagged amounts against time, coloured and marked by direction.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=suspicious_transactions,
    x='Timestamp',
    y='Amount',
    hue='Transaction Direction',
    style='Transaction Direction',
    ax=ax,
)
ax.set_title('Suspicious Transactions Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Transaction Amount')
plt.xticks(rotation=45)
ax.legend(title='Transaction Type')
plt.tight_layout()
plt.show()

# 2. Distribution of the flagged amounts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(suspicious_transactions['Amount'], bins=30, kde=False, color='red', ax=ax)
ax.set_title('Distribution of Suspicious Transaction Amounts')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming suspicious_transactions is already defined
# and data['Timestamp'] has been created and is the datetime combination of 'Date' and 'Timestamp'

# Extract hour and weekday (0 = Monday) from the datetime 'Timestamp'.
# assign() builds a new frame instead of writing columns into what may be a filtered
# slice of `data`, which would trigger SettingWithCopyWarning.
suspicious_transactions = suspicious_transactions.assign(**{
    'Hour of Day': suspicious_transactions['Timestamp'].dt.hour,
    'Day of Week': suspicious_transactions['Timestamp'].dt.dayofweek,
})

# Select columns to include in the pairplot; 'Transaction Direction' is used as the hue
plot_data = suspicious_transactions[['Amount', 'Hour of Day', 'Day of Week', 'Transaction Direction']]

# Create the pairplot
sns.pairplot(plot_data, hue='Transaction Direction', diag_kind='kde', plot_kws={'alpha': 0.6, 's': 60, 'edgecolor': 'k'}, height=3)
# y > 1 lifts the title above the grid so it does not overlap the top row of panels
plt.suptitle('Pairplot of Suspicious Transactions', y=1.02)
plt.show()
