<a href="https://colab.research.google.com/github/SinnottKayleigh/B2B-Sales-Algos/blob/main/Micro_Macro_Segmenting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Uses ML to segment customers, examine attributes that may lead to sales
- Industry, region, size
-  Buying patterns - volume, deal size, fx turnover, transaction timing
- To personalise value proposition and prioritise leads
- Pyvis is a python library used to map relationships between variables and data, in a neural network type chart
- 4 ways to segment and understand relationships between client and prospect attributes.

1. Demographic Criteria
2. Operating Variables
3. Purchasing Approaches
4. Situational Factors

- Eliminates the time wasted on low-yield opportunities, provides more accurate information for better decision making; optimising resources and time.

- Cohort Analysis: involves grouping users based on behavioural characteristics, such as churn rate; the identification of patterns among segments.



In [20]:
from pyvis.network import Network
import networkx as nx

Synthetic dataset

In [21]:
import pandas as pd
import numpy as np

np.random.seed(42)

n_customers = 50
customer_ids = [f"CUST_{i+1}" for i in range(n_customers)]
average_trade_sizes = np.random.randint(10000, 1000000, size=n_customers)
credit_ratings = np.random.choice(["AAA", "AA", "A", "BBB"], size=n_customers)
churn_likelihoods = np.round(np.random.uniform(0, 1, size=n_customers), 2)
decision_makers = np.random.choice(["CFO", "Treasury Manager", "CEO", "Head of FX"], size=n_customers)

hq_locations = ["UK"] * n_customers
subsidiary_locations = np.random.choice(["USA", "Germany", "France", "Singapore", "Japan"], size=n_customers)

data = pd.DataFrame({
    "Customer ID": customer_ids,
    "Average Trade Size": average_trade_sizes,
    "HQ Location": hq_locations,
    "Subsidiary Location": subsidiary_locations,
    "Credit Rating": credit_ratings,
    "Churn Likelihood": churn_likelihoods,
    "Key Decision Maker": decision_makers
})

print(data.head())

  Customer ID  Average Trade Size HQ Location Subsidiary Location  \
0      CUST_1              131958          UK                 USA   
1      CUST_2              681155          UK           Singapore   
2      CUST_3              141932          UK               Japan   
3      CUST_4              375838          UK                 USA   
4      CUST_5              269178          UK              France   

  Credit Rating  Churn Likelihood Key Decision Maker  
0            AA              0.84         Head of FX  
1             A              0.45                CEO  
2           BBB              0.40                CFO  
3             A              0.93         Head of FX  
4           BBB              0.73                CFO  


In [22]:
from pyvis.network import Network
import pandas as pd

net = Network(notebook=True, height="750px", width="100%")

for _, row in data.iterrows():
    node_id = row["Customer ID"]
    node_label = f"{node_id}\n({row['Key Decision Maker']})"
    net.add_node(
        node_id,
        label=node_label,
        title=f"""
        Customer ID: {node_id}
        Average Trade Size: ${row['Average Trade Size']:,}
        HQ Location: {row['HQ Location']}
        Subsidiary Location: {row['Subsidiary Location']}
        Credit Rating: {row['Credit Rating']}
        Churn Likelihood: {row['Churn Likelihood']}
        Key Decision Maker: {row['Key Decision Maker']}
        """,
        size=row["Average Trade Size"] / 10000
    )

for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if data.loc[i, "Key Decision Maker"] == data.loc[j, "Key Decision Maker"]:
            net.add_edge(data.loc[i, "Customer ID"], data.loc[j, "Customer ID"], title="Same Decision Maker")

net.toggle_hide_edges_on_drag(True)
net.show_buttons(filter_=['physics'])
net.show("customer_network.html")

customer_network.html


In [23]:
from pyvis.network import Network

net = Network(notebook=True, height="750px", width="100%", cdn_resources='remote')

for _, row in data.iterrows():
    node_id = row["Customer ID"]
    node_label = f"{node_id}\n({row['Key Decision Maker']})"
    net.add_node(
        node_id,
        label=node_label,
        title=f"""
        Customer ID: {node_id}
        Average Trade Size: ${row['Average Trade Size']:,}
        HQ Location: {row['HQ Location']}
        Subsidiary Location: {row['Subsidiary Location']}
        Credit Rating: {row['Credit Rating']}
        Churn Likelihood: {row['Churn Likelihood']}
        Key Decision Maker: {row['Key Decision Maker']}
        """,
        size=row["Average Trade Size"] / 10000
    )

for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if data.loc[i, "Key Decision Maker"] == data.loc[j, "Key Decision Maker"]:
            net.add_edge(data.loc[i, "Customer ID"], data.loc[j, "Customer ID"], title="Same Decision Maker")

net.toggle_hide_edges_on_drag(True)
net.show_buttons(filter_=['physics'])
net.show("customer_network.html")

customer_network.html


In [27]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_bubble_chart(data):
    fig = px.scatter(data,
                    x='Average Trade Size',
                    y='Churn Likelihood',
                    size='Average Trade Size',
                    color='Credit Rating',
                    hover_name='Customer ID',
                    text='Key Decision Maker',
                    title='Customer Analysis - Trade Size vs Churn Risk')

    fig.update_traces(textposition='top center')
    return fig

def create_treemap(data):
    fig = px.treemap(data,
                    path=['HQ Location', 'Subsidiary Location', 'Key Decision Maker', 'Customer ID'],
                    values='Average Trade Size',
                    color='Credit Rating',
                    title='Customer Hierarchy by Location and Decision Maker')
    return fig

def create_dashboard(data):
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Trade Size Distribution', 'Decision Maker Distribution',
                       'Credit Rating Distribution', 'Churn Risk Distribution')
    )

    fig.add_trace(
        go.Histogram(x=data['Average Trade Size'], name='Trade Size'),
        row=1, col=1
    )

    decision_maker_counts = data['Key Decision Maker'].value_counts()
    fig.add_trace(
        go.Bar(x=decision_maker_counts.index, y=decision_maker_counts.values,
               name='Decision Makers'),
        row=1, col=2
    )

    credit_rating_counts = data['Credit Rating'].value_counts()
    fig.add_trace(
        go.Bar(x=credit_rating_counts.index, y=credit_rating_counts.values,
               name='Credit Ratings'),
        row=2, col=1
    )

    fig.add_trace(
        go.Histogram(x=data['Churn Likelihood'], name='Churn Risk'),
        row=2, col=2
    )

    fig.update_layout(height=800,
                     showlegend=True,
                     title_text="Customer Analysis Dashboard")

    fig.update_xaxes(title_text="Trade Size", row=1, col=1)
    fig.update_xaxes(title_text="Decision Maker", row=1, col=2)
    fig.update_xaxes(title_text="Credit Rating", row=2, col=1)
    fig.update_xaxes(title_text="Churn Likelihood", row=2, col=2)

    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=2, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=2)

    return fig

def create_decision_maker_pie(data):
    decision_maker_counts = data['Key Decision Maker'].value_counts()
    fig = go.Figure(data=[go.Pie(labels=decision_maker_counts.index,
                                values=decision_maker_counts.values,
                                title='Decision Maker Distribution')])
    return fig

bubble_chart = create_bubble_chart(data)
treemap = create_treemap(data)
dashboard = create_dashboard(data)
decision_maker_pie = create_decision_maker_pie(data)
bubble_chart.show()
treemap.show()
dashboard.show()
decision_maker_pie.show()

1. Demographic Criteria (Who) (Static/Slow-changing)

- HQ location (Must be UK)
- FX-Exposure level; high, medium, low
- Industry sector; manufacturing, engineering, technology, travel
- Buy Side vs Sell Side


2. Operating Variables (How) (Dynamic/Process-based)

- Trade size distribution
- Trading window requirements
- Trading frequency by currency pair
- Confirmation methods
- Settlement Types
- Straight through processing rate
- Execution handling capacity
-

3. Purchasing Approaches
-

1. Demographic Criteria

- Feature importance will show which variables influence the sales cycle the most effectively, and those which do not.

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

np.random.seed(42)
n_samples = 100

data = {
    'Industry_Sector': np.random.choice(['Financial Services', 'Corporate', 'Institutional', 'Government'], n_samples),
    'Annual_FX_Volume': np.random.uniform(1e6, 1e9, n_samples),
    'Geographic_Complexity': np.random.randint(1, 10, n_samples),  # Scale 1-10
    'Treasury_Structure': np.random.choice(['Centralized', 'Decentralized'], n_samples),
    'Market_Position': np.random.choice(['Market Maker', 'Market User'], n_samples),
    'Credit_Rating': np.random.choice(['AAA', 'AA', 'A', 'BBB', 'BB'], n_samples),
    'Cross_Border_Revenue_Pct': np.random.uniform(0, 100, n_samples),
    'Regulatory_Complexity': np.random.randint(1, 10, n_samples),  # Scale 1-10
    'Balance_Sheet_Size': np.random.uniform(1e7, 1e10, n_samples)
}

df = pd.DataFrame(data)

feature_importance = {
    'Annual_FX_Volume': 0.85,
    'Geographic_Complexity': 0.76,
    'Cross_Border_Revenue_Pct': 0.72,
    'Balance_Sheet_Size': 0.68,
    'Regulatory_Complexity': 0.65,
    'Credit_Rating': 0.58,
    'Industry_Sector': 0.52,
    'Treasury_Structure': 0.45,
    'Market_Position': 0.41
}

importance_df = pd.DataFrame({
    'Feature': list(feature_importance.keys()),
    'Importance': list(feature_importance.values())
}).sort_values('Importance', ascending=True)

colors = px.colors.sequential.Viridis

fig = go.Figure(go.Bar(
    x=importance_df['Importance'],
    y=importance_df['Feature'],
    orientation='h',
    marker=dict(
        color=importance_df['Importance'],
        colorscale='Viridis',
        showscale=True
    )
))

fig.update_layout(
    title={
        'text': 'Demographic Feature Importance in Sales Cycle Length',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title='Relative Importance',
    yaxis_title='Feature',
    height=600,
    template='plotly_white',
    xaxis=dict(range=[0, 1]),
    showlegend=False
)

for i in range(len(importance_df)):
    fig.add_annotation(
        x=importance_df['Importance'].iloc[i],
        y=importance_df['Feature'].iloc[i],
        text=f"{importance_df['Importance'].iloc[i]:.2f}",
        showarrow=False,
        xshift=10,
        font=dict(color='black')
    )

fig.show()

print("\nKey Insights:")
print("-------------")
top_features = importance_df.nlargest(3, 'Importance')
print(f"Top 3 most influential features:")
for idx, row in top_features.iterrows():
    print(f"- {row['Feature']}: {row['Importance']:.2f}")

print(f"\nAverage feature importance: {importance_df['Importance'].mean():.2f}")
print(f"Feature importance standard deviation: {importance_df['Importance'].std():.2f}")


Key Insights:
-------------
Top 3 most influential features:
- Annual_FX_Volume: 0.85
- Geographic_Complexity: 0.76
- Cross_Border_Revenue_Pct: 0.72

Average feature importance: 0.62
Feature importance standard deviation: 0.15


In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import seaborn as sns
from plotly.subplots import make_subplots

np.random.seed(42)
n_samples = 200

credit_ratings = np.random.choice(
    ['AAA', 'AA', 'A', 'BBB', 'BB'],
    n_samples,
    p=[0.1, 0.2, 0.3, 0.25, 0.15]
)

base_volumes = np.random.lognormal(mean=20, sigma=1.5, size=n_samples)
rating_multipliers = {
    'AAA': 2.0,
    'AA': 1.6,
    'A': 1.3,
    'BBB': 1.0,
    'BB': 0.7
}
fx_volumes = [base_volumes[i] * rating_multipliers[rating] for i, rating in enumerate(credit_ratings)]

df = pd.DataFrame({
    'Credit_Rating': credit_ratings,
    'Annual_FX_Volume': fx_volumes
})

summary_stats = df.groupby('Credit_Rating')['Annual_FX_Volume'].agg([
    'count',
    'mean',
    'std',
    'min',
    'max'
]).round(2)

def create_analysis_dashboard():
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Box Plot by Credit Rating',
                       'Average FX Volume by Rating',
                       'Volume Distribution by Rating',
                       'Volume Density by Rating'),
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )

    box_data = go.Box(
        x=df['Credit_Rating'],
        y=df['Annual_FX_Volume'],
        name='FX Volume',
        marker_color='blue'
    )
    fig.add_trace(box_data, row=1, col=1)

    avg_by_rating = df.groupby('Credit_Rating')['Annual_FX_Volume'].mean()
    bar_data = go.Bar(
        x=avg_by_rating.index,
        y=avg_by_rating.values,
        marker_color='green',
        name='Average Volume'
    )
    fig.add_trace(bar_data, row=1, col=2)

    for rating in df['Credit_Rating'].unique():
        violin_data = go.Violin(
            x=df[df['Credit_Rating'] == rating]['Credit_Rating'],
            y=df[df['Credit_Rating'] == rating]['Annual_FX_Volume'],
            name=rating,
            box_visible=True,
            meanline_visible=True
        )
        fig.add_trace(violin_data, row=2, col=1)

    for rating in df['Credit_Rating'].unique():
        hist_data = go.Histogram(
            x=df[df['Credit_Rating'] == rating]['Annual_FX_Volume'],
            name=rating,
            opacity=0.7,
            nbinsx=30
        )
        fig.add_trace(hist_data, row=2, col=2)

    fig.update_layout(
        height=800,
        width=1200,
        title_text="Credit Rating vs Annual FX Volume Analysis",
        showlegend=True
    )

    fig.update_xaxes(title_text="Credit Rating", row=1, col=1)
    fig.update_xaxes(title_text="Credit Rating", row=1, col=2)
    fig.update_xaxes(title_text="Credit Rating", row=2, col=1)
    fig.update_xaxes(title_text="FX Volume", row=2, col=2)

    fig.update_yaxes(title_text="Annual FX Volume", row=1, col=1)
    fig.update_yaxes(title_text="Average FX Volume", row=1, col=2)
    fig.update_yaxes(title_text="Annual FX Volume", row=2, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=2)

    return fig

dashboard = create_analysis_dashboard()
dashboard.show()

print("\nSummary Statistics by Credit Rating:")
print("====================================")
print(summary_stats)

ratings_groups = [group['Annual_FX_Volume'].values for name, group in df.groupby('Credit_Rating')]
f_statistic, p_value = stats.f_oneway(*ratings_groups)

print("\nStatistical Analysis:")
print("====================")
print(f"One-way ANOVA p-value: {p_value:.4f}")
print(f"F-statistic: {f_statistic:.4f}")

def correlation_ratio(categories, values):
    categories = pd.Categorical(categories)
    counts = pd.value_counts(categories)
    overall_mean = np.mean(values)
    category_means = pd.Series([np.mean(values[categories == cat]) for cat in counts.index])
    n = len(values)
    numerator = np.sum(counts * (category_means - overall_mean) ** 2)
    denominator = np.sum((values - overall_mean) ** 2)
    return numerator / denominator

correlation_ratio = correlation_ratio(df['Credit_Rating'], df['Annual_FX_Volume'])
print(f"Correlation Ratio (η²): {correlation_ratio:.4f}")

print("\nKey Insights:")
print("=============")
print(f"1. Highest average FX volume: {summary_stats['mean'].idxmax()} rating")
print(f"2. Lowest average FX volume: {summary_stats['mean'].idxmin()} rating")
print(f"3. Largest volume spread (std dev): {summary_stats['std'].idxmax()} rating")
print("4. Statistical significance:", "Strong" if p_value < 0.01 else "Moderate" if p_value < 0.05 else "Weak")


Summary Statistics by Credit Rating:
               count          mean           std           min           max
Credit_Rating                                                               
A                 56  1.982588e+09  5.413558e+09  3.023922e+07  3.731339e+10
AA                44  7.670117e+09  3.773780e+10  6.004998e+06  2.510959e+11
AAA               22  4.742255e+09  8.733458e+09  1.000196e+08  3.124531e+10
BB                31  6.447662e+08  7.532401e+08  2.063352e+07  2.950484e+09
BBB               47  1.225211e+09  1.311451e+09  7.465031e+07  5.614492e+09

Statistical Analysis:
One-way ANOVA p-value: 0.3790
F-statistic: 1.0572
Correlation Ratio (η²): 0.0000

Key Insights:
1. Highest average FX volume: AA rating
2. Lowest average FX volume: BB rating
3. Largest volume spread (std dev): AA rating
4. Statistical significance: Weak



pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.



More advanced analysis:

- Predictive modelling: random forest classifier, XGBoost, ordinal logistic regression as credit ratings are ordinal.
- Clustering analysis; k-means, DBSCAN, hierachical clustering
- Anomaly detection; one-class SVM, LOF local outlier factor


Predictive modelling

In [5]:
# SECTION 1: PREDICTIVE MODELING
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import plotly.graph_objects as go
import plotly.express as px

# Enhance our synthetic dataset
np.random.seed(42)
n_samples = 500

# Create more features for better prediction
data = {
    'Annual_FX_Volume': np.random.lognormal(mean=20, sigma=1.5, size=n_samples),
    'Trading_Frequency': np.random.randint(1, 100, n_samples),
    'Years_Operating': np.random.randint(1, 50, n_samples),
    'Balance_Sheet_Size': np.random.lognormal(mean=25, sigma=2, size=n_samples),
    'Credit_Rating': np.random.choice(['AAA', 'AA', 'A', 'BBB', 'BB'], n_samples)
}

df = pd.DataFrame(data)

# Prepare data for modeling
X = df.drop('Credit_Rating', axis=1)
y = df['Credit_Rating']

# Encode target variable
encoder = OrdinalEncoder()
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded.ravel(),
                                                    test_size=0.2, random_state=42)

# 1. Ordinal Logistic Regression
lr_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
lr_model.fit(X_train, y_train)
lr_score = lr_model.score(X_test, y_test)

# 2. Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_score = rf_model.score(X_test, y_test)

# 3. XGBoost
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=5)
xgb_model.fit(X_train, y_train)
xgb_score = xgb_model.score(X_test, y_test)

# Visualize model comparison
def plot_model_comparison():
    models = ['Logistic Regression', 'Random Forest', 'XGBoost']
    scores = [lr_score, rf_score, xgb_score]

    fig = go.Figure(data=[
        go.Bar(name='Model Accuracy',
               x=models,
               y=scores,
               text=[f'{score:.3f}' for score in scores],
               textposition='auto')
    ])

    fig.update_layout(
        title='Model Performance Comparison',
        yaxis_title='Accuracy Score',
        yaxis=dict(range=[0, 1])
    )
    return fig

# Feature importance plot (using Random Forest)
def plot_feature_importance():
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=True)

    fig = go.Figure(go.Bar(
        x=importance['importance'],
        y=importance['feature'],
        orientation='h'
    ))

    fig.update_layout(
        title='Feature Importance (Random Forest)',
        xaxis_title='Importance Score',
        yaxis_title='Feature'
    )
    return fig

# Display results
print("\nModel Performance Summary:")
print("==========================")
print(f"Logistic Regression Accuracy: {lr_score:.3f}")
print(f"Random Forest Accuracy: {rf_score:.3f}")
print(f"XGBoost Accuracy: {xgb_score:.3f}")

# Show confusion matrix for best model
best_model = rf_model  # assuming RF performs best
y_pred = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nConfusion Matrix (Random Forest):")
print("=================================")
print(conf_matrix)

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_scaled, y_encoded.ravel(), cv=5)
print("\nCross-validation Scores:")
print("========================")
print(f"Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Display visualizations
plot_model_comparison().show()
plot_feature_importance().show()






Model Performance Summary:
Logistic Regression Accuracy: 0.260
Random Forest Accuracy: 0.230
XGBoost Accuracy: 0.210

Confusion Matrix (Random Forest):
[[ 3  2  3  3  5]
 [ 3  2  3  3  4]
 [ 5  2  5  3  9]
 [ 4  3  7  3  7]
 [ 3  0  8  0 10]]

Cross-validation Scores:
Mean CV Score: 0.228 (+/- 0.067)


Clustering Analysis

In [7]:
# SECTION 2: CLUSTERING ANALYSIS
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
import plotly.figure_factory as ff

# Prepare data for clustering
X_cluster = df.drop('Credit_Rating', axis=1)
X_scaled_cluster = StandardScaler().fit_transform(X_cluster)

# K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled_cluster)

# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled_cluster)

# Hierarchical clustering
linkage_matrix = linkage(X_scaled_cluster, method='ward')

# Visualization functions
def plot_kmeans_clusters():
    fig = px.scatter(
        df, x='Annual_FX_Volume', y='Balance_Sheet_Size',
        color=kmeans_labels.astype(str),
        title='K-means Clustering Results',
        labels={'color': 'Cluster'}
    )
    return fig

def plot_dbscan_clusters():
    fig = px.scatter(
        df, x='Annual_FX_Volume', y='Balance_Sheet_Size',
        color=dbscan_labels.astype(str),
        title='DBSCAN Clustering Results',
        labels={'color': 'Cluster'}
    )
    return fig

def plot_dendrogram():
    fig = ff.create_dendrogram(X_scaled_cluster)
    fig.update_layout(title='Hierarchical Clustering Dendrogram')
    return fig

print("\nClustering Analysis Results:")
print("============================")
print(f"Number of K-means clusters: {len(np.unique(kmeans_labels))}")
print(f"Number of DBSCAN clusters: {len(np.unique(dbscan_labels))}")

plot_kmeans_clusters().show()
plot_dbscan_clusters().show()
plot_dendrogram().show()


Clustering Analysis Results:
Number of K-means clusters: 5
Number of DBSCAN clusters: 2


Anomaly Detection

In [8]:
# SECTION 3: ANOMALY DETECTION
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
import plotly.graph_objects as go

# Prepare data for anomaly detection
X_anomaly = df.drop('Credit_Rating', axis=1)
X_scaled_anomaly = StandardScaler().fit_transform(X_anomaly)

# Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest_labels = iso_forest.fit_predict(X_scaled_anomaly)

# Local Outlier Factor
lof = LocalOutlierFactor(contamination=0.1)
lof_labels = lof.fit_predict(X_scaled_anomaly)

# One-class SVM
ocsvm = OneClassSVM(kernel='rbf', nu=0.1)
ocsvm_labels = ocsvm.fit_predict(X_scaled_anomaly)

# Create risk flagging system
def create_risk_flags(iso_forest_pred, lof_pred, ocsvm_pred):
    # Combine predictions from all models
    risk_score = (iso_forest_pred == -1).astype(int) + \
                 (lof_pred == -1).astype(int) + \
                 (ocsvm_pred == -1).astype(int)

    risk_categories = pd.cut(risk_score,
                           bins=[-np.inf, 0, 1, 2, np.inf],
                           labels=['No Risk', 'Low Risk', 'Medium Risk', 'High Risk'])
    return risk_categories

risk_flags = create_risk_flags(iso_forest_labels, lof_labels, ocsvm_labels)

# Visualization functions
def plot_anomaly_detection(method_name, labels):
    fig = px.scatter(
        df, x='Annual_FX_Volume', y='Balance_Sheet_Size',
        color=(labels == -1).astype(str),
        title=f'Anomaly Detection: {method_name}',
        labels={'color': 'Is Anomaly'}
    )
    return fig

def plot_risk_distribution():
    risk_counts = risk_flags.value_counts()
    fig = go.Figure(data=[go.Pie(labels=risk_counts.index,
                                values=risk_counts.values,
                                title='Risk Distribution')])
    return fig

# Display results
print("\nAnomaly Detection Results:")
print("==========================")
print(f"Isolation Forest anomalies: {sum(iso_forest_labels == -1)}")
print(f"LOF anomalies: {sum(lof_labels == -1)}")
print(f"One-class SVM anomalies: {sum(ocsvm_labels == -1)}")

print("\nRisk Distribution:")
print("=================")
print(risk_flags.value_counts())

# Show visualizations
plot_anomaly_detection('Isolation Forest', iso_forest_labels).show()
plot_anomaly_detection('Local Outlier Factor', lof_labels).show()
plot_anomaly_detection('One-class SVM', ocsvm_labels).show()
plot_risk_distribution().show()

# Export high-risk cases
high_risk_cases = df[risk_flags == 'High Risk']
print("\nHigh Risk Cases:")
print("===============")
print(high_risk_cases)


Anomaly Detection Results:
Isolation Forest anomalies: 50
LOF anomalies: 50
One-class SVM anomalies: 51

Risk Distribution:
No Risk        421
Low Risk        38
Medium Risk     10
High Risk       31
Name: count, dtype: int64



High Risk Cases:
     Annual_FX_Volume  Trading_Frequency  Years_Operating  Balance_Sheet_Size  \
14       3.649272e+07                 94               40        4.955462e+12   
16       1.061914e+08                 62                1        5.369575e+12   
24       2.144154e+08                 31                3        8.216112e+12   
35       7.772865e+07                 79               49        3.506056e+13   
49       3.446448e+07                  5               37        2.608615e+13   
105      8.894166e+08                  1               15        7.275862e+12   
106      8.215421e+09                 40               49        3.784458e+10   
113      1.952308e+10                 11               33        3.371400e+11   
125      1.296713e+10                  4               18        3.063561e+10   
156      7.967700e+09                 72                1        5.107703e+11   
161      1.579899e+09                 33               49        4.160741e+12   
165      9