In [1]:
#!pip install yfinance

# Invest Portfolio

Ivan has a large amout of money, and he starts to think how to invest this money to gain maximum profit.
He chooses 10 candidate companies for that, but since Ivan is new in investing he needs your advice. In this notebook we will help Ivan in his journey.      

In [2]:
# 10 companies to consider, key - ticker name, value -  short description.
tickers = {
    "BA": "The Boeing Company",
    "MCD": "McDonald's Corporation",
    "GOOGL": "Google Company",
    "DIS": "The Walt Disney Company",
    "PFE": "Pfizer Inc.",
    "AMT": "American Tower Corporation",
    "VOD": "Vodafone Group Public Limited Company",
    "BNP.PA": "BNP Paribas Bank",
    "ADS.DE": "Adidas",
    "TM": "Toyota Motor Corporation",
}

## Task 1 Download and preprocess data

We will use data from 2010 till 2021 for training and data from 2021 till 2024  for testing purpose.



In [3]:
start_train_date = "2010-01-01"
end_train_date = "2021-01-01"
start_test_date = "2021-01-01"
end_test_date = "2024-01-01"

Write a function that takes ticker name, start and end dates and returns pandas series of annual stock prices.



In [4]:
import yfinance as yf
import pandas as pd
import numpy as np


def download_annual_stock_prices(tickers_list, buy_date="2010-01-01", sell_date="2021-01-01"):
    """Download the annual stock price using yf.download"""
    data_df = yf.download(tickers_list, start=buy_date, end=sell_date, interval="1mo", progress=False)
    return data_df["Open"][::12]

In [5]:
annual_stock_price_df = download_annual_stock_prices(
    list(tickers), start_train_date, end_train_date
)
test_annual_stock_price_df = download_annual_stock_prices(
    list(tickers), start_test_date, end_test_date
)

In [6]:
test_annual_stock_price_df

Ticker,ADS.DE,AMT,BA,BNP.PA,DIS,GOOGL,MCD,PFE,TM,VOD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-01,297.899994,226.070007,210.0,43.105,182.259995,88.0,214.490005,36.869999,155.809998,17.040001
2022-01-01,254.350006,292.369995,204.0,61.110001,155.830002,145.054993,269.48999,58.5,186.809998,15.2
2023-01-01,128.339996,214.0,192.949997,53.91,88.980003,89.589996,263.529999,51.009998,137.960007,10.29


In [7]:
annual_stock_price_df

Ticker,ADS.DE,AMT,BA,BNP.PA,DIS,GOOGL,MCD,PFE,TM,VOD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-01,37.77,43.490002,55.720001,55.900002,32.5,15.689439,62.630001,17.333965,84.75,23.649338
2011-01-01,49.279999,51.91,66.150002,48.299999,37.740002,14.926927,77.099998,16.793169,79.019997,26.941896
2012-01-01,49.509998,60.450001,74.699997,30.450001,37.970001,16.33984,101.330002,20.740038,67.389999,28.69521
2013-01-01,67.330002,78.389999,76.550003,42.584999,50.799999,18.003504,89.400002,24.222012,94.779999,26.258919
2014-01-01,92.639999,79.510002,136.009995,56.650002,76.040001,27.914413,96.809998,28.908918,121.599998,39.755352
2015-01-01,57.619999,99.160004,131.070007,49.259998,94.910004,26.629999,94.129997,29.667933,126.230003,34.080002
2016-01-01,89.910004,96.290001,141.380005,52.59,103.120003,38.110001,117.25,30.218216,121.519997,32.119999
2017-01-01,149.75,106.730003,156.300003,60.119999,105.300003,40.030998,121.860001,31.024668,118.169998,24.84
2018-01-01,167.149994,143.160004,295.75,62.25,108.949997,52.651001,173.729996,34.516129,127.43,32.16
2019-01-01,182.399994,157.110001,316.190002,39.474998,108.099998,51.360001,175.410004,40.910816,114.68,19.299999



Let's define $\xi_i$ as random variable equals annual return ratio of the $\text{stock}_i$  relative to the buying price.

Convert annual stock price to annual return ratio considering that we've bought stock at moment $t=0$.
i.e. if the annoaul stock prices are $(10, 15, 12, 13)$,
 annual return ratio will be $(0.5, -0.3, 0.1)$

In [8]:
def get_annual_return_ratio_from_initial(prices_df):
    diff_df = prices_df.diff(1).dropna()
    return diff_df.div(prices_df.iloc[0])


test_stock_price_df = pd.DataFrame(
    {
        "ticker_1": [1, 3, 2, 5],
        "ticker_2": [10, 15, 12, 13],
    }
)
test_annual_return_ratio_df = pd.DataFrame(
    {
        "ticker_1": [2.0, -1.0, 3.0],
        "ticker_2": [0.5, -0.3, 0.1],
    }
)
result_np = get_annual_return_ratio_from_initial(test_stock_price_df).values
np.testing.assert_array_equal(result_np, test_annual_return_ratio_df.values)

In [9]:
annual_return_ratio_df = get_annual_return_ratio_from_initial(annual_stock_price_df)
test_annual_return_ratio_df = get_annual_return_ratio_from_initial(test_annual_stock_price_df)

## Task 2 Simple approach

Let's invest all money in most profitable company but for that we need to calculate $E\xi_i$ first.

We can calculate this value from company annual return ratio and from stock price. Let's do both to make sure that we do everything right.

In [10]:
eps = 1e-7


def get_expectation_ratio_from_return_ratio(ticker_return_ratio_df):
    return ticker_return_ratio_df.mean()


def get_expectation_ratio_from_stock_price(ticker_stock_price_df):
    pct_price_change_df = (ticker_stock_price_df.iloc[-1] / ticker_stock_price_df.iloc[0]) - 1
    return pct_price_change_df / (len(ticker_stock_price_df) - 1)

In [11]:
expectation_ratio_1 = get_expectation_ratio_from_return_ratio(annual_return_ratio_df)
expectation_ratio_2 = get_expectation_ratio_from_stock_price(annual_stock_price_df)
assert sum(abs(expectation_ratio_1 - expectation_ratio_2)) < eps

Looks like we are ready to implement our greedy approach:

$$ \text{company-to-invest} = \max_i E \xi_i $$

Let's implement this approach having tickers and dates of buying and selling:

In [12]:
def get_expectation_ratio_for_greedy_performance(prices_df):
    return get_expectation_ratio_from_stock_price(prices_df)


def find_ticker_for_greedy_selection(prices_df):
    expectation_df = get_expectation_ratio_from_stock_price(prices_df)
    return expectation_df.idxmax()

In [13]:
aapl_df = download_annual_stock_prices(["AAPL"], start_train_date, end_train_date)
assert 0.8 < get_expectation_ratio_from_stock_price(aapl_df) < 0.9

In [14]:
best_greedy_ticker = find_ticker_for_greedy_selection(annual_stock_price_df)
greedy_ticker_train_performance = get_expectation_ratio_for_greedy_performance(
    annual_stock_price_df[best_greedy_ticker]
)
greedy_ticker_test_performance = get_expectation_ratio_for_greedy_performance(
    test_annual_stock_price_df[best_greedy_ticker]
)
print(f"\n ticker to invest in greedy approach: {best_greedy_ticker}")
print(f"Train expectation ratio: {greedy_ticker_train_performance}")
print(f"Test expectation ratio: {greedy_ticker_test_performance}")


 ticker to invest in greedy approach: ADS.DE
Train expectation ratio: 0.6672755739493247
Test expectation ratio: -0.2845921467482695


Looks like our greedy method is not very helpful.

## Task 3 Variance

If stock has big variance it means that investor cannot be sure about the future. What is the variance in our case?

Your task is to find $D\xi_i$

In [15]:
def get_variance_from_return_ratio(ticker_return_ratio_df):
    return ticker_return_ratio_df.var(ddof=0)


def get_ticker_variance(prices_df):
    return_ratio = get_annual_return_ratio_from_initial(prices_df)
    return get_variance_from_return_ratio(return_ratio)

In [16]:
aapl_df = download_annual_stock_prices(["AAPL"], start_train_date, end_train_date)
assert 1.9 < get_ticker_variance(aapl_df) < 2.0

Then let's reformulate our task, we will select stock to invest by maximizing the following:

$$\text{company-to-invest} = \max_i \left( \frac{E\xi_i} {\sqrt{ D\xi_i}} \right)$$

In [17]:
def calc_expectation_ratio_divided_by_variance(prices_df):
    expectation_df = get_expectation_ratio_from_stock_price(prices_df)
    variance_df = get_ticker_variance(prices_df)
    return expectation_df.div(variance_df.pow(0.5))


def find_ticker_for_max_expectation_divided_by_var(prices_df):
    return calc_expectation_ratio_divided_by_variance(prices_df).idxmax()

In [18]:
best_expectation_var_ticker = find_ticker_for_max_expectation_divided_by_var(
    annual_stock_price_df
)
expectation_var_train_performance = get_expectation_ratio_for_greedy_performance(
    annual_stock_price_df[best_expectation_var_ticker]
)
greedy_ticker_test_performance = get_expectation_ratio_for_greedy_performance(
    test_annual_stock_price_df[best_expectation_var_ticker]
)

print(f"\n ticker to invest in expectation-variance approach: {best_expectation_var_ticker}")
print(f"Train expectation ratio: {expectation_var_train_performance}")
print(f"Test expectation ratio: {greedy_ticker_test_performance}")


 ticker to invest in expectation-variance approach: DIS
Train expectation ratio: 0.347046133188101
Test expectation ratio: -0.25589815088687423


Still not good...

## Task 4 Linear combination

In investing, there is a golden rule not to put all your eggs in one basket. So why shouldn't Ivan diversify his investments?

Let's define the weight vector $w = (w_1, \ldots, w_n)$ such that $|w| = \sum_{i} w_i = 1;$ and  $ 0 \le w_i \le 1$ .

Our solution can then be represented as:
$\xi = \sum_i w_i \xi_i$

**Task:** Expand the final formula

$$ \max_w \left (  \frac{E[\xi]} { \sqrt{D[\xi]} } \right)   =
    \max_w \left (  \frac{E \left[ \sum_i w_i \xi_i \right] } { \sqrt{D \left[ \sum_i w_i \xi_i \right] } } \right)  = \ldots
$$  



To find best vector $w$ we have to create a function that returns our metric having vector $w$

In [19]:
def get_expectation(prices_df, weights):
    """Calculate expected profit if we invest in each ticker with certain weight."""
    expectation_df = get_expectation_ratio_from_stock_price(prices_df)
    return expectation_df.mul(weights).sum()


def create_functional(priced_df):
    annual_ratio_df: pd.DataFrame = get_annual_return_ratio_from_initial(priced_df)

    def wrapper(weights):
        weighted_annual_ratio_sum_df = annual_ratio_df.mul(weights).sum(axis=1)  # sum(w_i * xi_i)

        weighted_expectation = weighted_annual_ratio_sum_df.mean()
        weighted_variance = weighted_annual_ratio_sum_df.var(ddof=0)
        return weighted_expectation / np.sqrt(weighted_variance)

    return wrapper


functional = create_functional(annual_stock_price_df)

equal_weights = np.array([1 / len(tickers) for _ in tickers])
metric_for_equal_weights = functional(equal_weights)

assert 1.0 < metric_for_equal_weights < 1.5

In [20]:
metric_for_equal_weights

1.0415405684382606

## Task 5 Optimization

But how to find best weights?

Let's define optimization problem and use _minimize_ function from _sicpy_ package.

**Your task** Find best weights and evaluate train and test results.

In [21]:
from scipy.optimize import minimize

In [22]:
def constraint(x):
    return sum(x) - 1


con = {"type": "eq", "fun": constraint}
bounds = [(0, 1) for _ in range(len(tickers))]
minimization_functional = lambda weights: -functional(weights)

task_5_result = minimize(
    minimization_functional, equal_weights, constraints=con, bounds=bounds
)

print("Optimal Weights:", task_5_result.x.round(4))
print("\nTrain Expected Profit:", get_expectation(annual_stock_price_df, task_5_result.x))
print("Test Expected Profit:", get_expectation(test_annual_stock_price_df, task_5_result.x))

Optimal Weights: [0.0311 0.0295 0.     0.     0.1975 0.     0.1565 0.5854 0.     0.    ]

Train Expected Profit: 0.20314169520939473
Test Expected Profit: 0.06996977756443773


## Task 6 Bonus: Experiment with initial weights


For running mimimzation algorithm we need to define initial vector $w_0$, but how to do that?

One of the approaches is to try $N$ different starting points and select the best.

**Your task**
1. Implement algorithm that generates $N$ random vectors for weights then tries to minimize functional $N$ times using these vectors as a starting points. And returns the best founded weights.
2. Apply this  algorithm for our task and compare results


In [23]:
from tqdm import tqdm


def generate_random_weights(vector_size):
    weights = np.random.rand(vector_size)
    weights /= np.sum(weights)
    return weights


def random_search_for_initial_weights(functional_to_minimize, n_tries, vector_size):
    best_func_value = np.inf
    best_weights = None
    jac = None
    for _ in tqdm(range(n_tries)):
        weights = generate_random_weights(vector_size)
        res = minimize(functional_to_minimize, weights, constraints=con, bounds=bounds)
        if res.fun < best_func_value:
            best_func_value = res.fun
            best_weights = res.x
            jac = res.jac

    return best_weights, best_func_value, -jac


optimal_weights, func_value, func_jac = random_search_for_initial_weights(
    minimization_functional, 100, vector_size=len(tickers)
)

print("Optimal weights:", optimal_weights.round(4))
print("\ntrain expectation:", get_expectation(annual_stock_price_df, optimal_weights))
print("test expectation:", get_expectation(test_annual_stock_price_df, optimal_weights))

test_task_5_expectation = get_expectation(test_annual_stock_price_df, task_5_result.x)
test_task_6_expectation = get_expectation(test_annual_stock_price_df, optimal_weights)
print(
    "\nDifference between task 5 and 6:",
    (test_task_6_expectation - test_task_5_expectation) * 100,  # result in pct %
)

100%|██████████| 100/100 [00:11<00:00,  8.37it/s]

Optimal weights: [0.0311 0.0296 0.     0.     0.1975 0.     0.1564 0.5855 0.     0.    ]

train expectation: 0.20312829774854413
test expectation: 0.06998269178150793

Difference between task 5 and 6: 0.0012914217070197909





In [24]:
test_annual_stock_price_df

Ticker,ADS.DE,AMT,BA,BNP.PA,DIS,GOOGL,MCD,PFE,TM,VOD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-01,297.899994,226.070007,210.0,43.105,182.259995,88.0,214.490005,36.869999,155.809998,17.040001
2022-01-01,254.350006,292.369995,204.0,61.110001,155.830002,145.054993,269.48999,58.5,186.809998,15.2
2023-01-01,128.339996,214.0,192.949997,53.91,88.980003,89.589996,263.529999,51.009998,137.960007,10.29


Prices have dropped significantly in test data, this explains the difference between train and test expectations (0.203 and 0.0699)

And let's try to see dependencies between expectation divided by variance for train and test data. Compare it with weights and jacobian.


In [25]:
train_df = calc_expectation_ratio_divided_by_variance(annual_stock_price_df).to_frame(name="train E/var").T
test_df = calc_expectation_ratio_divided_by_variance(test_annual_stock_price_df).to_frame(name="test E/var").T
weights_df = pd.DataFrame(optimal_weights, index=annual_stock_price_df.keys(), columns=["weights"]).T
jac_df = pd.DataFrame(func_jac, index=annual_stock_price_df.keys(), columns=["jac"]).T
pd.concat([train_df, test_df, weights_df, jac_df]).astype(np.float16)

Ticker,ADS.DE,AMT,BA,BNP.PA,DIS,GOOGL,MCD,PFE,TM,VOD
train E/var,0.708496,0.898926,0.668457,-0.024704,0.959473,0.826172,0.791016,0.714844,0.372803,-0.059875
test E/var,-2.056641,-0.083435,-3.376953,0.428711,-2.308594,0.01413,0.804688,0.485596,-0.223511,-2.199219
weights,0.031097,0.029556,0.0,0.0,0.19751,0.0,0.156372,0.585449,0.0,0.0
jac,0.008476,0.00042,-5.679688,-1.791016,-0.003088,-2.84375,0.001758,0.000102,-1.797852,-3.927734



For me, it seems like there is no correlations :(

### Let's try to calc weighted func value in another way:


$$ 
    \max_w \left (  \frac{E[\xi]} { \sqrt{D[\xi]} } \right)   =
    \max_w \left (  \frac{E \left[ \sum_i w_i \xi_i \right] } { \sqrt{D \left[ \sum_i w_i \xi_i \right] } } \right)  = 
    \max_w \left ( \frac{\sum_i w_i E_{\xi_i} } { \sqrt{\sum_i w_i^2 D_{\xi_i} + 2 \sum_{i<j} w_i w_j \cdot \text{cov}\left(\xi_i, \xi_j \right)} } \right)
$$  

And let's estimate time complexity: 
* n - number of tickers (columns)
* t - number of values (rows)

In [26]:
def create_functional_1(priced_df):
    """Same as create_functional, to compare with optimized version."""
    annual_ratio_df = get_annual_return_ratio_from_initial(priced_df)

    def wrapper(weights):
        weighted_annual_ratio_sum_df = annual_ratio_df.mul(weights).sum(axis=1)  # O(nt)
        weighted_expectation = weighted_annual_ratio_sum_df.mean()  # O(t)
        weighted_variance = weighted_annual_ratio_sum_df.var(ddof=0)  # O(t)
        return weighted_expectation / np.sqrt(weighted_variance)  # O(1)
        # time complexity: O(nt)

    return wrapper


def create_functional_optimized(priced_df):
    annual_ratio_df = get_annual_return_ratio_from_initial(priced_df)
    mean_annual_ratio = annual_ratio_df.mean().values
    cov_matrix = annual_ratio_df.cov(ddof=0).values

    def wrapper(weights):
        numerator = weights @ mean_annual_ratio  # O(n)
        denominator = (weights @ cov_matrix) @ weights  # O(n^2)
        return numerator / np.sqrt(denominator)  # O(1)
        # time complexity: O(n^2)

    return wrapper


functional_1 = create_functional_1(annual_stock_price_df)
print("functional_1:", functional_1(optimal_weights))

functional_2 = create_functional_optimized(annual_stock_price_df)
print("functional_2:", functional_2(optimal_weights))

functional_1: 2.2500800952843303
functional_2: 2.2500800952843303


Let's generate random values to compare speed of these functions.

In [27]:
N_random_values = 3 * 10**5
n_iters = 50
min_ratio = -2
max_ratio = 10

random_annual_ratio_df = pd.DataFrame(
    np.random.uniform(-min_ratio, max_ratio, size=(N_random_values, len(tickers))), columns=tickers
)
random_annual_ratio_df.head(3)

Unnamed: 0,BA,MCD,GOOGL,DIS,PFE,AMT,VOD,BNP.PA,ADS.DE,TM
0,2.42029,6.641535,2.070968,8.566358,5.942292,4.288057,8.015594,7.190353,2.500859,7.180891
1,6.537501,9.562998,6.296112,2.984419,2.720833,2.860806,2.975056,4.418972,3.256596,9.19015
2,7.523851,2.511417,8.305059,9.036915,7.674577,6.869314,8.644149,4.373368,2.593795,4.715986


In [28]:
functional_1 = create_functional_1(random_annual_ratio_df)
functional_2 = create_functional_optimized(random_annual_ratio_df)

In [29]:
np.random.seed(42)

func_1_value = random_search_for_initial_weights(
    functional_1, n_iters, vector_size=len(tickers)
)
print("functional_1:", func_1_value[1])

100%|██████████| 50/50 [00:26<00:00,  1.92it/s]

functional_1: -1.5823895257780916e-06





In [30]:
np.random.seed(42)

func_2_value = random_search_for_initial_weights(
    functional_2, n_iters, vector_size=len(tickers)
)
print("functional_2:", func_2_value[1])

100%|██████████| 50/50 [00:00<00:00, 955.60it/s]

functional_2: -1.5823895257788164e-06





Significant difference: 2 iters/sec vs 1100 iters/sec =)