# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Generate sample data
# Seed NumPy's global RNG so the simulated returns are reproducible.
np.random.seed(42)

# Number of daily return observations simulated for every asset
n_days = 252

# Define asset classes and their characteristics:
#   'n'           - number of assets simulated for the class
#   'volatility'  - standard deviation used directly for each simulated daily draw
#   'correlation' - common pairwise correlation between assets of the same class
asset_classes = {
    'Tech Stocks': {'n': 20, 'volatility': 0.25, 'correlation': 0.7},
    'Banks': {'n': 15, 'volatility': 0.30, 'correlation': 0.8},
    'Utilities': {'n': 15, 'volatility': 0.15, 'correlation': 0.5},
    'Consumer': {'n': 20, 'volatility': 0.20, 'correlation': 0.6},
    'Real Estate': {'n': 15, 'volatility': 0.22, 'correlation': 0.65},
    'Energy': {'n': 15, 'volatility': 0.28, 'correlation': 0.75}
}

# Create a sample dataset of 100 assets with 252 daily returns.
# The asset count is derived from the class table (100 for the values above)
# so the column labels below can never drift out of sync with the data.
n_assets = sum(params['n'] for params in asset_classes.values())

# Generate returns for each asset class
returns_data = []
labels = []

for asset_class, params in asset_classes.items():
    n_class_assets = params['n']

    # Equicorrelation matrix: ones on the diagonal, the class correlation
    # everywhere else. Its Cholesky factor L satisfies L @ L.T == cov_matrix.
    cov_matrix = np.full((n_class_assets, n_class_assets), params['correlation'], dtype=float)
    np.fill_diagonal(cov_matrix, 1)
    L = np.linalg.cholesky(cov_matrix)

    # Independent normal draws (one row per day, one column per asset) are
    # mixed through L.T so the columns pick up the target correlation.
    uncorrelated_returns = np.random.normal(0, params['volatility'], (n_days, n_class_assets))
    correlated_returns = np.dot(uncorrelated_returns, L.T)

    returns_data.append(correlated_returns)
    labels.extend([asset_class] * n_class_assets)

# Combine all returns: classes are stacked side by side, so the column order
# matches the order of `labels`.
returns = np.hstack(returns_data)
returns_df = pd.DataFrame(returns, columns=[f'Asset_{i}' for i in range(n_assets)])

# Standardize the data: the scaler is fitted on the (days x assets) matrix,
# so every asset column is rescaled using its own statistics.
scaler = StandardScaler()
scaler.fit(returns)
returns_scaled = scaler.transform(returns)

# Apply t-SNE: transpose first so each asset (not each day) is one sample
# described by its full series of standardized returns.
asset_features = returns_scaled.T
tsne = TSNE(perplexity=30, n_components=2, random_state=42)
tsne_results = tsne.fit_transform(asset_features)

# Create visualization
plt.figure(figsize=(12, 8))

# Create color palette: one distinct husl colour per asset class, keyed by name
palette = sns.color_palette("husl", len(asset_classes))
color_dict = dict(zip(asset_classes.keys(), palette))

# Comparing a label array against a class name yields a boolean mask that
# selects the embedding rows belonging to that class.
label_array = np.asarray(labels)

# Plot each asset class in its palette colour
for asset_class in asset_classes:
    mask = label_array == asset_class
    plt.scatter(tsne_results[mask, 0],
                tsne_results[mask, 1],
                color=color_dict[asset_class],
                label=asset_class,
                alpha=0.7)

plt.title('t-SNE Visualization of Asset Returns\nMarket Structure Analysis', pad=20)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Add annotations for interesting clusters
for i, txt in enumerate(labels):
    if i % 5 == 0:  # Annotate every 5th point to avoid overcrowding
        plt.annotate(f'{txt}_{i}',
                     (tsne_results[i, 0], tsne_results[i, 1]),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8, alpha=0.7)

# Plot analysis text
# NOTE(review): these observations are a fixed caption, not computed from the
# embedding above - confirm they still hold whenever the data changes.
analysis_text = """Key Observations:
1. Tech stocks cluster tightly (high correlation)
2. Utilities show dispersion (lower correlation)
3. Banks and Energy show overlap (sector correlation)
4. Clear separation of defensive sectors"""

plt.figtext(0.02, 0.02, analysis_text, fontsize=8, bbox=dict(facecolor='white', alpha=0.8))

# Lay out only after every artist exists, and keep a bottom margin free so the
# figure-level text box does not sit on top of the axes and tick labels.
plt.tight_layout(rect=(0, 0.13, 1, 1))

# Additional analysis: Calculate and plot centroids
# A label array lets each class be selected with a boolean mask.
label_array = np.asarray(labels)

# Centroid = mean 2-D embedding position of the assets in a class.
centroids = {}
for asset_class in asset_classes:
    mask = label_array == asset_class
    centroids[asset_class] = tsne_results[mask].mean(axis=0)
    # '_nolegend_' keeps the centroid markers out of the legend.
    plt.scatter(centroids[asset_class][0], centroids[asset_class][1],
                marker='*', s=200, c='black', label='_nolegend_')

# Create distance matrix between centroids
# Built as a float array so the frame has a numeric dtype: an empty
# DataFrame filled cell by cell is object-typed, and DataFrame.round()
# leaves such columns unrounded.
# NOTE(review): t-SNE mainly preserves local neighbourhoods, so these
# distances describe the embedding rather than the returns - confirm before
# drawing conclusions from them.
class_names = list(asset_classes)
centroid_matrix = np.vstack([centroids[name] for name in class_names])
pairwise_diffs = centroid_matrix[:, np.newaxis, :] - centroid_matrix[np.newaxis, :, :]
centroid_distances = pd.DataFrame(
    np.linalg.norm(pairwise_diffs, axis=-1),
    index=class_names,
    columns=class_names,
)

print("\nCentroid Distance Matrix:")
print(centroid_distances.round(2))