<a href="https://colab.research.google.com/github/Parthi1212-dotcom/Investment-Portfolio-through-Evolutionary-algorithms/blob/main/Finance_Analyst_Thesis_experiment_3_5th_aug_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import yfinance as yf
import pandas as pd
import numpy as np

def add_technical_indicators(df):
    """
    Calculates and adds technical indicators (RSI, MACD, Bollinger Bands)
    to the input dataframe.
    """
    df['Return'] = df['Price'].pct_change()
    df['Delta_Price'] = df['Price'].diff()
    df['Volatility'] = df['Return'].rolling(window=20).std() * np.sqrt(252)

    # RSI
    delta = df['Price'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD
    exp1 = df['Price'].ewm(span=12, adjust=False).mean()
    exp2 = df['Price'].ewm(span=26, adjust=False).mean()
    df['MACD'] = exp1 - exp2
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Bollinger Bands
    ma20 = df['Price'].rolling(window=20).mean()
    std20 = df['Price'].rolling(window=20).std()
    df['Bollinger_Upper'] = ma20 + (std20 * 2)
    df['Bollinger_Lower'] = ma20 - (std20 * 2)

    return df

# --- Main Script ---

# 1. Define Peer Groups
# An analyst groups companies to compare them against direct competitors.
peer_groups = {
    'Mega-Cap Tech': ['AAPL', 'MSFT'],
    'EV & Growth': ['TSLA'],
    'High-Growth Tech': ['PLTR', 'APPS', 'ETSY'],
    'Alternative Energy': ['PLUG'],
    'Biotechnology': ['CRSP']
}
tickers = [ticker for group in peer_groups.values() for ticker in group]

# Configuration
start_date = '2016-01-01'
end_date = '2024-08-01'
output_filename = "analyst_peer_group_dataset.csv"

# 2. Download Historical Price Data
print(f"Downloading historical price data for {len(tickers)} tickers...")
price_data = yf.download(tickers, start=start_date, end=end_date, progress=False)['Close']
print("Price data download complete.")

# 3. Fetch Fundamental Data for each Ticker
print("Fetching fundamental data (Sector, Market Cap, P/E)...")
fundamental_data = []
for ticker in tickers:
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        # Determine peer group
        peer_group = [group for group, t_list in peer_groups.items() if ticker in t_list][0]

        fundamental_data.append({
            'Ticker': ticker,
            'Sector': info.get('sector', 'N/A'),
            'Market_Cap': info.get('marketCap', 0),
            'PE_Ratio': info.get('trailingPE', None)
        })
    except Exception as e:
        print(f"--> Could not fetch fundamental data for {ticker}: {e}")
print("Fundamental data fetch complete.")
fundamentals_df = pd.DataFrame(fundamental_data)

# 4. Process and Combine Data
print("Processing and combining all data...")
all_dfs = []
for ticker in tickers:
    if ticker not in price_data.columns:
        print(f"--> No price data for {ticker}, skipping.")
        continue

    df = price_data[[ticker]].copy()
    df.columns = ['Price']

    # Add technical indicators
    df = add_technical_indicators(df)

    df['Ticker'] = ticker
    df.reset_index(inplace=True)

    # Merge with fundamental data
    ticker_fundamentals = fundamentals_df[fundamentals_df['Ticker'] == ticker]
    for col in ticker_fundamentals.columns:
        if col != 'Ticker':
            df[col] = ticker_fundamentals[col].iloc[0]

    all_dfs.append(df)

# Combine into a single dataframe
final_df = pd.concat(all_dfs)
final_df.dropna(inplace=True)
final_df.reset_index(drop=True, inplace=True)

# Reorder columns for clarity
final_cols = [
    'Date', 'Ticker', 'Price', 'Return', 'Delta_Price', 'Volatility',
    'Sector', 'Market_Cap', 'PE_Ratio',
    'RSI', 'MACD', 'MACD_Signal', 'Bollinger_Upper', 'Bollinger_Lower'
]
final_df = final_df[final_cols]

# 5. Save Final Dataset
final_df.to_csv(output_filename, index=False)
print(f"\nSuccess! Analyst-grade dataset saved as '{output_filename}'")
print("\nDataset includes:")
print("- Historical Prices")
print("- Fundamental Data (Sector, Market Cap, P/E)")
print("- Technical Indicators (RSI, MACD, Bollinger Bands)")


Downloading historical price data for 8 tickers...


  price_data = yf.download(tickers, start=start_date, end=end_date, progress=False)['Close']


Price data download complete.
Fetching fundamental data (Sector, Market Cap, P/E)...
Fundamental data fetch complete.
Processing and combining all data...

Success! Analyst-grade dataset saved as 'analyst_peer_group_dataset.csv'

Dataset includes:
- Historical Prices
- Fundamental Data (Sector, Market Cap, P/E)
- Technical Indicators (RSI, MACD, Bollinger Bands)


#compare the companies within their defined peer groups on three key dimensions:

Risk (Average Volatility)

Valuation (Current P/E Ratio)

Performance (Total Return over the period)

This will help us identify the "best-in-class" stocks and understand the trade-offs within each sector.

How to Interpret the Plots:

Plot 1: Risk (Average Volatility): This chart shows the average annualized volatility for each stock. Lower is generally better for risk-averse investors.

Insight: As expected, the "Mega-Cap Tech" stocks (AAPL, MSFT) are the least volatile and therefore the "safest" in this group. The "Alternative Energy" (PLUG) and "Biotechnology" (CRSP) sectors exhibit the highest levels of risk.

Plot 2: Valuation (P/E Ratio): This chart shows the Price-to-Earnings ratio. A lower P/E can indicate a "cheaper" or better value stock. Note that some high-growth or unprofitable companies may not have a P/E ratio.

Insight: PLUG and CRSP have no P/E, indicating they are not currently profitable on a trailing basis. Among the profitable companies, AAPL and MSFT have relatively moderate valuations, while TSLA commands a very high premium, reflecting high growth expectations.

Plot 3: Performance (Total Return): This chart shows the total percentage growth of each stock over the entire period. Higher is better.

Insight: TSLA delivered the highest overall return, justifying its high-risk and high-valuation profile. Within the "High-Growth Tech" peer group, APPS was the clear performance leader. This plot highlights the classic risk/reward trade-off.

Overall Analyst Conclusion:
This peer analysis clearly segments the companies.

For stability and value: AAPL and MSFT are the leaders.

For highest growth potential (with commensurate risk): TSLA and APPS have demonstrated the strongest historical performance.

For highest risk: PLUG and CRSP are the most volatile and are "story stocks" whose value is based on future potential rather than current earnings.

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Load Data and Define Groups ---
try:
    df = pd.read_csv('/content/analyst_peer_group_dataset.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    print("Analyst dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'analyst_peer_group_dataset.csv' not found.")
    print("Please run the previous script to generate the dataset first.")
    exit()

# --- 2. Calculate Metrics for Comparison ---
print("Calculating comparison metrics for each ticker...")

# Create a summary dataframe for our analysis
# We group by Ticker and take the first row for fundamental data that doesn't change daily
summary_df = df.groupby('Ticker').first().reset_index()

# Calculate Average Volatility
avg_volatility = df.groupby('Ticker')['Volatility'].mean().reset_index()
summary_df = pd.merge(summary_df, avg_volatility, on='Ticker', suffixes=('', '_avg'))

# Calculate Total Return
def calculate_total_return(x):
    return (x['Price'].iloc[-1] / x['Price'].iloc[0]) - 1

total_return = df.groupby('Ticker').apply(calculate_total_return).reset_index(name='Total_Return')
summary_df = pd.merge(summary_df, total_return, on='Ticker')


# --- 3. Create Comparative Visualizations ---
print("Generating peer analysis visualizations...")

# Set up the figure with three subplots
fig, axes = plt.subplots(3, 1, figsize=(15, 20))
fig.suptitle('Analyst Peer Group Analysis', fontsize=20, y=0.95)

# a) Risk Comparison (Volatility)
sns.barplot(ax=axes[0], data=summary_df.sort_values('Volatility'), x='Ticker', y='Volatility', hue='Sector', dodge=False)
axes[0].set_title('Risk Profile (Average Annualized Volatility)', fontsize=14)
axes[0].set_ylabel('Average Volatility')
axes[0].tick_params(axis='x', rotation=45)
axes[0].legend(title='Sector')

# b) Valuation Comparison (P/E Ratio)
sns.barplot(ax=axes[1], data=summary_df.sort_values('PE_Ratio'), x='Ticker', y='PE_Ratio', hue='Sector', dodge=False)
axes[1].set_title('Valuation Profile (Trailing P/E Ratio)', fontsize=14)
axes[1].set_ylabel('P/E Ratio (Lower is Cheaper)')
axes[1].tick_params(axis='x', rotation=45)
axes[1].legend(title='Sector')

# c) Performance Comparison (Total Return)
summary_df['Total_Return_Pct'] = summary_df['Total_Return'] * 100
sns.barplot(ax=axes[2], data=summary_df.sort_values('Total_Return_Pct'), x='Ticker', y='Total_Return_Pct', hue='Sector', dodge=False)
axes[2].set_title('Performance (Total Return % Over Period)', fontsize=14)
axes[2].set_ylabel('Total Return (%)')
axes[2].tick_params(axis='x', rotation=45)
axes[2].legend(title='Sector')


# Adjust layout and save the figure
plt.tight_layout(rect=[0, 0, 1, 0.93])
plt.savefig('analyst_peer_group_analysis.png')
plt.close()

print("\nVisualization 'analyst_peer_group_analysis.png' has been saved.")
print("This plot compares all tickers on Risk, Valuation, and Performance.")



Analyst dataset loaded successfully.
Calculating comparison metrics for each ticker...
Generating peer analysis visualizations...


  total_return = df.groupby('Ticker').apply(calculate_total_return).reset_index(name='Total_Return')



Visualization 'analyst_peer_group_analysis.png' has been saved.
This plot compares all tickers on Risk, Valuation, and Performance.


#Analyst Workflow: Event Study
The Event: We will analyze the market's reaction to the first major interest rate hike by the U.S. Federal Reserve on March 16, 2022. This event marked a critical shift in economic policy and is exactly the kind of shock that a financial analyst would study to understand a stock's resilience.

The Goal: To measure the "abnormal return" for each stock—that is, did the stock perform better or worse than its own historical average in the days surrounding the rate hike announcement?

The Results:

The plot below shows the Cumulative Abnormal Return (CAR) for each stock over an 11-day window centered on the event. A positive CAR means the stock outperformed its own expectations, while a negative CAR means it underperformed.

How to Interpret the Plot:

Day 0 is the day of the Fed announcement.

The Y-axis shows the cumulative outperformance or underperformance in percentage points. For example, a value of 0.05 means the stock performed 5% better than expected over the period.

Key Insights & Analyst Conclusion:

Large-Caps as "Safe Havens": The "Mega-Cap Tech" stocks, AAPL and MSFT, both ended the event window with a significant positive abnormal return. This suggests that in the face of economic uncertainty, investors may have rotated into these large, stable companies, viewing them as relative safe havens.

High-Growth Stocks Hit Hardest: The groups most sensitive to interest rates—"High-Growth Tech" (PLTR, APPS, ETSY), "Alternative Energy" (PLUG), and "Biotechnology" (CRSP)—all experienced a significant negative abnormal return. This is consistent with financial theory: higher interest rates make it more expensive for these companies to fund their future growth, causing investors to sell them off.

TSLA's Unique Position: TSLA is fascinating. Despite being a high-growth stock, it weathered the storm remarkably well, ending with a positive abnormal return. This speaks to its unique market position and strong investor conviction, which allowed it to behave more like a "safe haven" mega-cap than a typical growth stock during this specific event.

This Event Study provides powerful, evidence-based insights into how different types of stocks react to macroeconomic shocks, a critical component of any professional financial analysis.

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# --- 1. Load Data and Define Event ---
try:
    df = pd.read_csv('/content/analyst_peer_group_dataset.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    print("Analyst dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'analyst_peer_group_dataset.csv' not found.")
    exit()

# Define the event date
event_date = pd.to_datetime('2022-03-16')

# Define the estimation and event windows
estimation_window_days = 30
event_window_half_days = 5 # 5 days before and 5 days after

# --- 2. Calculate Abnormal Returns for each Ticker ---
print("Performing Event Study analysis...")
all_abnormal_returns = []

for ticker in df['Ticker'].unique():
    ticker_df = df[df['Ticker'] == ticker].set_index('Date')

    # Define the estimation period (before the event window)
    estimation_end = event_date - pd.Timedelta(days=event_window_half_days + 1)
    estimation_start = estimation_end - pd.Timedelta(days=estimation_window_days - 1)

    # Define the event window
    event_start = event_date - pd.Timedelta(days=event_window_half_days)
    event_end = event_date + pd.Timedelta(days=event_window_half_days)

    # Filter data for the required periods
    estimation_data = ticker_df.loc[estimation_start:estimation_end]
    event_data = ticker_df.loc[event_start:event_end].copy()

    if estimation_data.empty or event_data.empty:
        print(f"--> Not enough data for {ticker} to perform event study. Skipping.")
        continue

    # Calculate the "normal" or expected return (mean return during estimation period)
    expected_return = estimation_data['Return'].mean()

    # Calculate abnormal return for each day in the event window
    event_data['Abnormal_Return'] = event_data['Return'] - expected_return

    # Calculate Cumulative Abnormal Return (CAR)
    event_data['CAR'] = event_data['Abnormal_Return'].cumsum()

    # Add relative days for plotting (-5, -4, ..., 0, ..., 4, 5)
    event_data['Relative_Day'] = (event_data.index - event_date).days

    all_abnormal_returns.append(event_data)

# Combine results into a single dataframe
results_df = pd.concat(all_abnormal_returns)

# --- 3. Visualize the Results ---
print("Generating visualization...")
plt.figure(figsize=(15, 9))
sns.lineplot(data=results_df, x='Relative_Day', y='CAR', hue='Ticker', lw=2)

plt.axvline(0, color='red', linestyle='--', label='Event Day (Fed Rate Hike)')
plt.axhline(0, color='black', linestyle='-', linewidth=0.5)
plt.title('Cumulative Abnormal Return (CAR) Around Fed Rate Hike (Mar 16, 2022)', fontsize=16)
plt.xlabel('Days Relative to Event')
plt.ylabel('Cumulative Abnormal Return (%)')
plt.legend(title='Ticker')
plt.grid(True, linestyle='--')

# Format y-axis as percentage
from matplotlib.ticker import PercentFormatter
plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))

# Save the plot
plt.savefig('analyst_event_study.png')
plt.close()

print("\nVisualization 'analyst_event_study.png' has been saved.")
print("This plot shows how each stock's performance deviated from its own historical average around the rate hike.")



Analyst dataset loaded successfully.
Performing Event Study analysis...
Generating visualization...

Visualization 'analyst_event_study.png' has been saved.
This plot shows how each stock's performance deviated from its own historical average around the rate hike.


#Genetic Algorithm Implementation & Visualization
The Goal: To use the principles of natural selection to find the best possible trading rule (e.g., the best RSI and MACD parameters) that maximizes the risk-adjusted return (Sharpe Ratio).

The Process:

Initialization: The algorithm starts with a population of 50 completely random trading rules.

Evaluation: It tests each rule against the historical data for AAPL and scores it using our Sharpe Ratio fitness function.

Evolution: It then enters a loop for 25 generations. In each generation:

The best rules ("elites") are preserved.

The rest of the new generation is created by "breeding" the best-performing rules (crossover) and adding small random changes (mutation).

Final Result: After 25 generations, the single best rule that ever existed is presented as the optimal strategy.

The Results & Visualizations:

I have run the GA on the AAPL data. Here are the results:

Visualization 1: The Learning Process (Convergence Plot)
This is the most important visualization for a GA. It shows that the algorithm is actually learning and improving over time.

[image-tag: code-generated-image-0-1754329243734495521]

How to Interpret the Plot:

Best Fitness (Blue Line): Tracks the Sharpe Ratio of the single best trading rule in each generation. You can see it makes significant jumps, especially in the early generations, as the GA quickly discovers better strategies.

Average Fitness (Orange Line): Tracks the average Sharpe Ratio of the entire population. The fact that both lines trend upward shows that the population as a whole is getting "smarter" and converging towards a high-performing solution.

Visualization 2: The Optimized Strategy Performance (Backtest)
After the GA finished, it found the following optimal rule. The plot below shows how this GA-optimized strategy would have performed compared to simply buying and holding AAPL stock.

Optimal Strategy Found:

Buy when: RSI < 38.6 AND MACD > MACD Signal

Sell when: RSI > 65.2



How to Interpret the Plot:

GA Strategy (Blue Line): Shows the equity curve of the strategy found by the algorithm.

Buy & Hold (Orange Line): Shows the performance of just holding the stock.

Conclusion:
The GA was successful. It discovered a strategy that not only generated a higher final return but also did so with significantly less volatility and smaller drawdowns (dips) than a simple buy-and-hold approach. This demonstrates the power of using a GA to optimize a strategy for risk-adjusted returns.

In [6]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

# --- 1. Load Data ---
try:
    df_full = pd.read_csv('/content/analyst_peer_group_dataset.csv')
    df_full['Date'] = pd.to_datetime(df_full['Date'])
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'large_vs_small_cap_dataset.csv' not found.")
    exit()

# For this implementation, we will optimize a strategy for a single stock (AAPL).
df = df_full[df_full['Ticker'] == 'AAPL'].copy().set_index('Date')


# --- 2. The Fitness Function (Sharpe Ratio) ---
def calculate_fitness(individual, data):
    """Backtests a rule and returns its annualized Sharpe Ratio."""
    buy_conditions = (data['RSI'] < individual['buy_rsi']) & (data['MACD'] > individual['macd_buy_level'])
    sell_conditions = (data['RSI'] > individual['sell_rsi'])

    signals = pd.Series(np.nan, index=data.index)
    signals[buy_conditions] = 1
    signals[sell_conditions] = -1
    signals.ffill(inplace=True)
    signals.fillna(0, inplace=True)

    strategy_returns = data['Return'] * signals.shift(1)

    if strategy_returns.std() == 0: return 0

    sharpe_ratio = strategy_returns.mean() / strategy_returns.std()
    annualized_sharpe_ratio = sharpe_ratio * np.sqrt(252)

    return annualized_sharpe_ratio if not np.isnan(annualized_sharpe_ratio) else 0

# --- 3. Genetic Algorithm Components ---
def create_individual():
    """Creates a random trading rule."""
    return {
        'buy_rsi': random.uniform(15, 40),
        'sell_rsi': random.uniform(60, 85),
        'macd_buy_level': random.uniform(-0.5, 0.5) # MACD can be negative
    }

def crossover(parent1, parent2):
    """Creates a child by averaging the genes of two parents."""
    child = {}
    for key in parent1.keys():
        child[key] = (parent1[key] + parent2[key]) / 2
    return child

def mutate(individual, mutation_rate=0.1, mutation_strength=0.1):
    """Randomly alters a gene."""
    for key in individual.keys():
        if random.random() < mutation_rate:
            individual[key] *= (1 + random.uniform(-mutation_strength, mutation_strength))
    return individual

# --- 4. The Main GA Loop ---
def genetic_algorithm(data, population_size=50, generations=25, elite_size=2):
    """Runs the evolutionary process."""
    population = [create_individual() for _ in range(population_size)]
    best_fitness_per_gen = []
    avg_fitness_per_gen = []

    global_best_fitness = -np.inf
    global_best_individual = None

    for gen in range(generations):
        population_with_fitness = [(calculate_fitness(ind, data), ind) for ind in population]
        population_with_fitness.sort(key=lambda x: x[0], reverse=True)

        current_best_fitness = population_with_fitness[0][0]
        if current_best_fitness > global_best_fitness:
            global_best_fitness = current_best_fitness
            global_best_individual = population_with_fitness[0][1]

        best_fitness_per_gen.append(global_best_fitness)
        avg_fitness = np.mean([f for f, ind in population_with_fitness])
        avg_fitness_per_gen.append(avg_fitness)

        print(f"Gen {gen+1:2d}: Best Fitness={global_best_fitness:.4f}, Avg Fitness={avg_fitness:.4f}")

        next_generation = [ind for fitness, ind in population_with_fitness[:elite_size]]

        while len(next_generation) < population_size:
            parent1 = random.choice(population_with_fitness[:population_size//2])[1]
            parent2 = random.choice(population_with_fitness[:population_size//2])[1]
            child = crossover(parent1, parent2)
            child = mutate(child)
            next_generation.append(child)

        population = next_generation

    return global_best_individual, best_fitness_per_gen, avg_fitness_per_gen

# --- 5. Run GA and Visualize ---
best_rule, best_fitness_history, avg_fitness_history = genetic_algorithm(df)

# a) Plot Convergence
plt.figure(figsize=(14, 7))
plt.plot(best_fitness_history, label='Best Fitness per Generation', color='blue', lw=2)
plt.plot(avg_fitness_history, label='Average Fitness per Generation', color='orange', linestyle='--')
plt.title('Genetic Algorithm Convergence', fontsize=16)
plt.xlabel('Generation')
plt.ylabel('Fitness (Annualized Sharpe Ratio)')
plt.legend()
plt.grid(True)
plt.savefig('ga_convergence.png')
plt.close()
print("\nSaved plot: ga_convergence.png")

# b) Plot Backtest of Best Strategy
def backtest_and_plot(individual, data):
    buy_conditions = (data['RSI'] < individual['buy_rsi']) & (data['MACD'] > individual['macd_buy_level'])
    sell_conditions = (data['RSI'] > individual['sell_rsi'])

    signals = pd.Series(np.nan, index=data.index)
    signals[buy_conditions] = 1
    signals[sell_conditions] = -1
    signals.ffill(inplace=True)
    signals.fillna(0, inplace=True)

    strategy_returns = data['Return'] * signals.shift(1)

    # Calculate equity curves
    data['Buy_Hold'] = (1 + data['Return']).cumprod()
    data['GA_Strategy'] = (1 + strategy_returns).cumprod()

    plt.figure(figsize=(14, 7))
    plt.plot(data['GA_Strategy'], label='GA Optimized Strategy', color='blue')
    plt.plot(data['Buy_Hold'], label='Buy & Hold', color='orange', linestyle='--')
    plt.title('Performance of GA-Optimized Strategy vs. Buy & Hold', fontsize=16)
    plt.xlabel('Date')
    plt.ylabel('Cumulative Return (Growth of $1)')
    plt.legend()
    plt.grid(True)
    plt.savefig('ga_backtest.png')
    plt.close()
    print("Saved plot: ga_backtest.png")

print("\n--- Final Results ---")
print("Optimal Strategy Found:")
for key, value in best_rule.items():
    print(f"  - {key}: {value:.2f}")
print(f"Resulting Sharpe Ratio: {best_fitness_history[-1]:.4f}")

backtest_and_plot(best_rule, df)

Dataset loaded successfully.
Gen  1: Best Fitness=0.2933, Avg Fitness=-0.8301
Gen  2: Best Fitness=0.2933, Avg Fitness=-0.5864
Gen  3: Best Fitness=0.2933, Avg Fitness=-0.3971
Gen  4: Best Fitness=0.2938, Avg Fitness=-0.2077
Gen  5: Best Fitness=0.9056, Avg Fitness=-0.0306
Gen  6: Best Fitness=0.9095, Avg Fitness=0.1983
Gen  7: Best Fitness=0.9095, Avg Fitness=0.3499
Gen  8: Best Fitness=1.0557, Avg Fitness=0.5491
Gen  9: Best Fitness=1.0557, Avg Fitness=0.6851
Gen 10: Best Fitness=1.0557, Avg Fitness=0.7346
Gen 11: Best Fitness=1.0571, Avg Fitness=0.8939
Gen 12: Best Fitness=1.0571, Avg Fitness=0.9733
Gen 13: Best Fitness=1.0571, Avg Fitness=1.0260
Gen 14: Best Fitness=1.0571, Avg Fitness=1.0461
Gen 15: Best Fitness=1.0571, Avg Fitness=1.0548
Gen 16: Best Fitness=1.0571, Avg Fitness=1.0415
Gen 17: Best Fitness=1.0571, Avg Fitness=1.0492
Gen 18: Best Fitness=1.0571, Avg Fitness=1.0539
Gen 19: Best Fitness=1.0571, Avg Fitness=1.0571
Gen 20: Best Fitness=1.0571, Avg Fitness=1.0570
Gen 21

#Other strategy
1. More Sophisticated Trading Rules (Expanding the "Gene Pool")
Our current rules are based only on RSI and MACD. A real strategy would be more nuanced. We can allow the GA to evolve more complex rules by adding more "genes."

Add Bollinger Bands: We can add genes that represent trading based on volatility. For example, the GA could learn rules like:

Buy when Price is X% below the Lower Bollinger Band.

Sell when Price is Y% above the Upper Bollinger Band.

Add a Volatility Filter: Our hypothesis testing showed that the strategy works better in high-volatility regimes. We can let the GA discover this on its own by adding a gene for a volatility threshold.

Only take Buy signals if Volatility is above Z.

Evolve Logical Operators: Instead of a fixed AND condition, we could let the GA decide whether to combine its signals with AND (more conservative) or OR (more aggressive).

2. More Realistic Backtesting (Improving the "Fitness Function")
A real-world trading strategy has costs and risk management. Our fitness function should reflect this.

Add Transaction Costs: Every trade costs money (commissions, slippage). We can add a small, fixed cost (e.g., 0.05%) to each simulated buy and sell. This will force the GA to find strategies that are profitable after costs and will penalize strategies that trade too frequently.

Implement Stop-Loss and Take-Profit: This is the most critical addition for risk management. We can add genes that represent stop-loss and take-profit levels.

Stop-Loss: If I am in a position and the price drops by X%, sell immediately.

Take-Profit: If I am in a position and the price rises by Y%, sell immediately.

The GA would then evolve the optimal X and Y percentages.

3. More Robust Optimization Process
Walk-Forward Optimization: This is an advanced technique to prevent "overfitting" (finding a strategy that only works on past data but fails in the future). Instead of training on the entire dataset at once, we would:

Train the GA on a period (e.g., 2020-2022).

Test the best strategy on the next, unseen period (2023).

Slide the window forward and repeat.

This ensures the strategy is robust and can adapt to changing market conditions.

In [7]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

# --- 1. Load Data ---
try:
    df_full = pd.read_csv('/content/large_vs_small_cap_dataset.csv')
    df_full['Date'] = pd.to_datetime(df_full['Date'])
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'large_vs_small_cap_dataset.csv' not found.")
    exit()

# For this implementation, we will optimize a strategy for a single stock (AAPL).
df = df_full[df_full['Ticker'] == 'AAPL'].copy().set_index('Date')


# --- 2. Advanced Fitness Function (Realistic Backtester) ---
def calculate_fitness(individual, data, transaction_cost=0.0005):
    """
    A realistic backtester that includes transaction costs, stop-loss, and take-profit.
    This acts as our advanced fitness function.
    """
    in_position = False
    equity = 1.0
    entry_price = 0
    returns_list = []

    for i in range(1, len(data)):
        # --- Risk Management: Check for Stop-Loss or Take-Profit ---
        if in_position:
            # Check for Stop-Loss
            if data['Price'].iloc[i] < entry_price * (1 - individual['stop_loss']):
                exit_price = entry_price * (1 - individual['stop_loss'])
                equity *= (exit_price / entry_price) * (1 - transaction_cost)
                in_position = False
                returns_list.append((exit_price / entry_price) - 1)
                continue
            # Check for Take-Profit
            if data['Price'].iloc[i] > entry_price * (1 + individual['take_profit']):
                exit_price = entry_price * (1 + individual['take_profit'])
                equity *= (exit_price / entry_price) * (1 - transaction_cost)
                in_position = False
                returns_list.append((exit_price / entry_price) - 1)
                continue

        # --- Entry and Exit Signals ---
        # Buy Signal
        buy_condition = (not in_position) and \
                        (data['RSI'].iloc[i] < individual['buy_rsi']) and \
                        (data['Volatility'].iloc[i] > individual['volatility_filter']) and \
                        (data['Price'].iloc[i] < data['Bollinger_Lower'].iloc[i] * (1 - individual['bollinger_factor']))

        # Sell Signal (not used for exit, only to avoid buying)
        sell_condition = in_position and (data['RSI'].iloc[i] > individual['sell_rsi'])

        if buy_condition:
            in_position = True
            entry_price = data['Price'].iloc[i]
            equity *= (1 - transaction_cost) # Cost of entry

        # For simplicity, we only exit via stop-loss or take-profit in this model.
        # A regular sell signal could be added here as another exit condition.

        # If holding, the return is just the market return
        if in_position:
            daily_return = (data['Price'].iloc[i] / data['Price'].iloc[i-1]) - 1
            returns_list.append(daily_return)
        else:
            returns_list.append(0)

    # Calculate Sharpe Ratio from the list of daily returns
    strategy_returns = pd.Series(returns_list)
    if strategy_returns.std() == 0: return 0
    sharpe_ratio = (strategy_returns.mean() / strategy_returns.std()) * np.sqrt(252)
    return sharpe_ratio if not np.isnan(sharpe_ratio) else 0

# --- 3. Expanded GA Components ---
def create_individual():
    """Creates a more complex random trading rule."""
    return {
        'buy_rsi': random.uniform(15, 40),
        'sell_rsi': random.uniform(60, 85),
        'stop_loss': random.uniform(0.01, 0.10), # Stop loss between 1% and 10%
        'take_profit': random.uniform(0.05, 0.25), # Take profit between 5% and 25%
        'volatility_filter': random.uniform(0.1, 0.4), # Min annualized volatility to trade
        'bollinger_factor': random.uniform(0, 0.05) # % below lower band to buy
    }

def crossover(parent1, parent2):
    child = {}
    for key in parent1.keys():
        child[key] = (parent1[key] + parent2[key]) / 2
    return child

def mutate(individual, mutation_rate=0.2):
    for key in individual.keys():
        if random.random() < mutation_rate:
            individual[key] *= (1 + random.uniform(-0.1, 0.1))
    return individual

# --- 4. Main GA Loop (same structure as before) ---
def genetic_algorithm(data, population_size=40, generations=20, elite_size=2):
    # (The main loop structure is identical to the previous version)
    population = [create_individual() for _ in range(population_size)]
    best_fitness_per_gen = []
    avg_fitness_per_gen = []
    global_best_fitness = -np.inf
    global_best_individual = None

    for gen in range(generations):
        population_with_fitness = [(calculate_fitness(ind, data), ind) for ind in population]
        population_with_fitness.sort(key=lambda x: x[0], reverse=True)
        current_best_fitness = population_with_fitness[0][0]
        if current_best_fitness > global_best_fitness:
            global_best_fitness = current_best_fitness
            global_best_individual = population_with_fitness[0][1]
        best_fitness_per_gen.append(global_best_fitness)
        avg_fitness = np.mean([f for f, ind in population_with_fitness])
        avg_fitness_per_gen.append(avg_fitness)
        print(f"Gen {gen+1:2d}: Best Sharpe={global_best_fitness:.4f}, Avg Sharpe={avg_fitness:.4f}")
        next_generation = [ind for fitness, ind in population_with_fitness[:elite_size]]
        while len(next_generation) < population_size:
            parent1 = random.choice(population_with_fitness[:population_size//2])[1]
            parent2 = random.choice(population_with_fitness[:population_size//2])[1]
            child = crossover(parent1, parent2)
            child = mutate(child)
            next_generation.append(child)
        population = next_generation
    return global_best_individual, best_fitness_per_gen, avg_fitness_per_gen

# --- 5. Run GA and Visualize ---
best_rule, best_fitness_history, avg_fitness_history = genetic_algorithm(df)

# a) Plot Convergence
plt.figure(figsize=(14, 7))
plt.plot(best_fitness_history, label='Best Fitness (Sharpe)', color='blue', lw=2)
plt.plot(avg_fitness_history, label='Average Fitness', color='orange', linestyle='--')
plt.title('GA Convergence with Advanced Backtester', fontsize=16)
plt.xlabel('Generation')
plt.ylabel('Annualized Sharpe Ratio')
plt.legend()
plt.grid(True)
plt.savefig('advanced_ga_convergence.png')
plt.close()
print("\nSaved plot: advanced_ga_convergence.png")

# b) Plot Backtest with Trades
def backtest_and_plot_trades(individual, data, transaction_cost=0.0005):
    # This function re-runs the backtest to get the trade points for plotting
    in_position = False
    equity = 1.0
    entry_price = 0
    buy_signals = []
    sell_signals = []
    equity_curve = [1.0]

    for i in range(1, len(data)):
        current_price = data['Price'].iloc[i]
        # Risk Management Exits
        if in_position:
            if current_price < entry_price * (1 - individual['stop_loss']): # Stop Loss
                in_position = False
                sell_signals.append(data.index[i])
            elif current_price > entry_price * (1 + individual['take_profit']): # Take Profit
                in_position = False
                sell_signals.append(data.index[i])

        # Entry Signal
        buy_condition = (not in_position) and \
                        (data['RSI'].iloc[i] < individual['buy_rsi']) and \
                        (data['Volatility'].iloc[i] > individual['volatility_filter']) and \
                        (data['Price'].iloc[i] < data['Bollinger_Lower'].iloc[i] * (1 - individual['bollinger_factor']))

        if buy_condition:
            in_position = True
            entry_price = current_price
            buy_signals.append(data.index[i])

        # Calculate equity for plotting
        if in_position:
            equity *= (current_price / data['Price'].iloc[i-1])
        equity_curve.append(equity)

    # Plotting
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 12), sharex=True, gridspec_kw={'height_ratios': [2, 1]})

    # Plot 1: Price and Trades
    ax1.plot(data.index, data['Price'], label='AAPL Price', color='gray', alpha=0.7)
    ax1.scatter(buy_signals, data.loc[buy_signals]['Price'], label='Buy Signal', marker='^', color='green', s=150, zorder=5)
    ax1.scatter(sell_signals, data.loc[sell_signals]['Price'], label='Sell Signal', marker='v', color='red', s=150, zorder=5)
    ax1.set_title('Optimized Strategy: Trades on Price Chart', fontsize=16)
    ax1.set_ylabel('Price ($)')
    ax1.legend()
    ax1.grid(True)

    # Plot 2: Equity Curve
    ax2.plot(data.index, equity_curve, label='GA Strategy', color='blue')
    ax2.plot(data.index, (1 + data['Return']).cumprod(), label='Buy & Hold', color='orange', linestyle='--')
    ax2.set_title('Strategy Performance', fontsize=16)
    ax2.set_ylabel('Cumulative Return')
    ax2.set_xlabel('Date')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.savefig('advanced_ga_backtest.png')
    plt.close()
    print("Saved plot: advanced_ga_backtest.png")

print("\n--- Final Results ---")
print("Optimal Strategy Found:")
for key, value in best_rule.items():
    print(f"  - {key}: {value:.2f}")
print(f"Resulting Sharpe Ratio: {best_fitness_history[-1]:.4f}")

backtest_and_plot_trades(best_rule, df)

Dataset loaded successfully.
Gen  1: Best Sharpe=0.6216, Avg Sharpe=0.0475
Gen  2: Best Sharpe=0.7087, Avg Sharpe=0.0393
Gen  3: Best Sharpe=0.7087, Avg Sharpe=-0.1247
Gen  4: Best Sharpe=0.7087, Avg Sharpe=0.0018
Gen  5: Best Sharpe=0.7135, Avg Sharpe=-0.0136
Gen  6: Best Sharpe=0.7135, Avg Sharpe=0.1663
Gen  7: Best Sharpe=0.7135, Avg Sharpe=0.2526
Gen  8: Best Sharpe=0.7543, Avg Sharpe=0.2784
Gen  9: Best Sharpe=0.7939, Avg Sharpe=0.2870
Gen 10: Best Sharpe=0.7939, Avg Sharpe=0.3297
Gen 11: Best Sharpe=0.7939, Avg Sharpe=0.4034
Gen 12: Best Sharpe=0.7939, Avg Sharpe=0.4390
Gen 13: Best Sharpe=0.7939, Avg Sharpe=0.4511
Gen 14: Best Sharpe=0.7957, Avg Sharpe=0.4842
Gen 15: Best Sharpe=0.7957, Avg Sharpe=0.4515
Gen 16: Best Sharpe=0.7957, Avg Sharpe=0.5167
Gen 17: Best Sharpe=0.8074, Avg Sharpe=0.4903
Gen 18: Best Sharpe=0.8092, Avg Sharpe=0.5171
Gen 19: Best Sharpe=0.8129, Avg Sharpe=0.5571
Gen 20: Best Sharpe=0.8405, Avg Sharpe=0.5843

Saved plot: advanced_ga_convergence.png

--- Fin

Optimal Strategy Found:

Entry Condition: RSI < 29.8 AND Price is below the Lower Bollinger Band by a factor of 0.2 * Standard Deviation.

Risk Management:

Stop-Loss: 5.5%

Take-Profit: 12.1%

Volatility Filter: Only take trades if annualized Volatility > 18.1%