In [None]:
import qichang
import requests
import pandas as pd
from tqdm import tqdm

In [35]:
stock_lst = ["AAPL", "GOOGL", "AMZN", "MSFT", "TSLA", "META", "NVDA", "PYPL", "NFLX"]
start_date = "2023-06-01"
end_date = "2024-03-20"

In [None]:
llm = qichang.LLM_API()

In [38]:
prompt0 = """
You are a stock analyst, and your task is to predict the stock price of {}. You can only request the stock price from {} to {}, then you need to predict the moving direction of this stock in next week based on the information provided. You MUST select and return your answer among [up, down], DO NOT return any other answer. For example: down
"""

In [None]:
prompt1 = """
You are a stock analyst, and your task is to predict the stock price of {}. You can request the stock price and stock news data from {} to {}, then you need to predict the moving direction of this stock in next week based on the information provided. You MUST select and return your answer among [up, down], DO NOT return any other answer. For example: down
"""

In [39]:
yf_address = 'http://api.qichangzheng.net/yfinance_api'

In [40]:
res = requests.post(yf_address, json={'tickers': stock_lst, 'start_date': start_date, 'end_date': end_date})

In [41]:
data = res.json()['price_data']

# Iterate through the dictionary to transform the data
rows = [{**{'date': date}, **{ticker: prices[date] for ticker, prices in data.items()}} for date in data['AAPL']]

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(rows)

In [42]:
# resample the date to weekly
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df = df.resample('W').mean()
df.reset_index(inplace=True)

In [43]:
date = df['date'].tolist()

In [44]:
result = {}
for company in stock_lst:
    result[company] = []
    for i in tqdm(range(len(date)-1)):
        start_date = date[i].strftime('%Y-%m-%d')
        end_date = date[i+1].strftime('%Y-%m-%d')
        pmt = prompt0.format(company, start_date, end_date)
        while True:
            res = llm.chat('stock_api', pmt, timeout=30)
            if res in ['up', 'down']:
                break
        result[company].append(res)

100%|██████████| 42/42 [06:23<00:00,  9.12s/it]
100%|██████████| 42/42 [05:41<00:00,  8.13s/it]
100%|██████████| 42/42 [04:04<00:00,  5.83s/it]
100%|██████████| 42/42 [06:15<00:00,  8.94s/it]
100%|██████████| 42/42 [05:09<00:00,  7.36s/it]
100%|██████████| 42/42 [07:05<00:00, 10.13s/it]
100%|██████████| 42/42 [05:32<00:00,  7.91s/it]
100%|██████████| 42/42 [05:51<00:00,  8.37s/it]
100%|██████████| 42/42 [05:03<00:00,  7.22s/it]


In [57]:
# create a dataframe to store the result
result_df = pd.DataFrame(result)
result_df['date'] = date[1:]
result_df.set_index('date', inplace=True)


In [58]:
result_df

Unnamed: 0_level_0,AAPL,GOOGL,AMZN,MSFT,TSLA,META,NVDA,PYPL,NFLX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-06-11,up,down,up,up,up,up,up,up,up
2023-06-18,up,up,up,up,up,up,up,up,up
2023-06-25,up,up,up,up,down,up,down,up,down
2023-07-02,up,up,up,up,up,up,up,up,up
2023-07-09,up,up,up,up,down,up,up,up,down
2023-07-16,up,up,up,up,up,up,up,up,up
2023-07-23,down,down,up,up,down,up,down,up,down
2023-07-30,up,up,up,up,up,up,up,up,up
2023-08-06,down,down,up,down,down,down,down,down,down
2023-08-13,up,down,up,down,down,down,down,down,down


In [59]:
result_df.to_csv('GPT4_0.csv')

In [113]:
result_df = pd.read_csv('GPT4_0.csv')

In [114]:
# remove the last row
result_df = result_df.iloc[1:]

In [None]:
len(result_df)

In [None]:
df

In [0]:
returns

In [117]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score

# Assuming df and result_df are already defined
# Calculate daily returns for the tickers
returns = df.set_index('date').pct_change().dropna()

# Define the function to categorize the movement based on percentage change
def categorize_movement(x):
    if x >= 0:
        return 'up'
    elif x < 0:
        return 'down'

# Apply categorization to returns to create the actual movement DataFrame
actual_movements = returns.applymap(categorize_movement)

# Align the actual_movements DataFrame with the result_df by dropping the first and last rows
actual_movements = actual_movements.iloc[:-1]

# Initialize dictionaries to hold the results
sharpe_ratios = {}
confusion_matrices = {}
accuracy_scores = {}
f1_scores = {}

for ticker in returns.columns:
    # Calculate Sharpe Ratio
    mean_return = returns[ticker].mean()
    sd_return = returns[ticker].std()
    sharpe_ratio = mean_return / sd_return
    sharpe_ratios[ticker] = sharpe_ratio
    
    # Calculate Confusion Matrix
    y_true = actual_movements[ticker]
    y_pred = result_df[ticker]
    cm = confusion_matrix(y_true, y_pred, labels=['up', 'down'])
    confusion_matrices[ticker] = cm
    
    # Calculate Accuracy Rate
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, labels=['up', 'down'], average='weighted')

    accuracy_scores[ticker] = accuracy
    f1_scores[ticker] = f1

# Initialize dictionaries to hold the accuracy and F1 score

# Print the results
print("Accuracy Rates:")
for ticker, acc in accuracy_scores.items():
    print(f"{ticker}: {acc:.4f}")

print("\nF1 Scores:")
for ticker, f1 in f1_scores.items():
    print(f"{ticker}: {f1:.4f}")

# Print the Confusion Matrices
print("\nConfusion Matrices:")
for ticker, cm in confusion_matrices.items():
    print(f"{ticker}:\n{cm}")


Accuracy Rates:
AAPL: 0.4634
AMZN: 0.6098
GOOGL: 0.3659
META: 0.6098
MSFT: 0.5854
NFLX: 0.5610
NVDA: 0.6585
PYPL: 0.6341
TSLA: 0.5854

F1 Scores:
AAPL: 0.4453
AMZN: 0.5474
GOOGL: 0.3301
META: 0.5761
MSFT: 0.5469
NFLX: 0.5548
NVDA: 0.6502
PYPL: 0.6084
TSLA: 0.5854

Confusion Matrices:
AAPL:
[[14  8]
 [14  5]]
AMZN:
[[24  4]
 [12  1]]
GOOGL:
[[13  9]
 [17  2]]
META:
[[22  5]
 [11  3]]
MSFT:
[[21  5]
 [12  3]]
NFLX:
[[17  8]
 [10  6]]
NVDA:
[[22  6]
 [ 8  5]]
PYPL:
[[19  3]
 [12  7]]
TSLA:
[[12  9]
 [ 8 12]]


  actual_movements = returns.applymap(categorize_movement)


In [116]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score

# Assuming df and result_df are already defined
# Calculate daily returns for the tickers
returns = df.set_index('date').pct_change().dropna()

# Define the function to categorize the movement based on percentage change
def categorize_movement(x):
    if x >= 0:
        return 'up'
    elif x < 0:
        return 'down'

# Apply categorization to returns to create the actual movement DataFrame
actual_movements = returns.applymap(categorize_movement)

# Align the actual_movements DataFrame with the result_df by dropping the first and last rows
actual_movements = actual_movements.iloc[:-1]

# Initialize dictionaries to hold the results
sharpe_ratios = {}
confusion_matrices = {}
accuracy_scores = {}
f1_scores = {}

for ticker in returns.columns:
    # Calculate Sharpe Ratio
    mean_return = returns[ticker].mean()
    sd_return = returns[ticker].std()
    sharpe_ratio = mean_return / sd_return
    sharpe_ratios[ticker] = sharpe_ratio
    
    # Calculate Confusion Matrix
    y_true = actual_movements[ticker]
    y_pred = result_df[ticker]
    cm = confusion_matrix(y_true, y_pred, labels=['up', 'down'])
    confusion_matrices[ticker] = cm
    
    # Calculate Accuracy Rate
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, labels=['up', 'down'], average='weighted')

    accuracy_scores[ticker] = accuracy
    f1_scores[ticker] = f1

# Initialize dictionaries to hold the accuracy and F1 score

# Print the results
print("Accuracy Rates:")
for ticker, acc in accuracy_scores.items():
    print(f"{ticker}: {acc:.4f}")

print("\nF1 Scores:")
for ticker, f1 in f1_scores.items():
    print(f"{ticker}: {f1:.4f}")

# Print the Confusion Matrices
print("\nConfusion Matrices:")
for ticker, cm in confusion_matrices.items():
    print(f"{ticker}:\n{cm}")


Accuracy Rates:
AAPL: 0.4634
AMZN: 0.6098
GOOGL: 0.3659
META: 0.6098
MSFT: 0.5854
NFLX: 0.5610
NVDA: 0.6585
PYPL: 0.6341
TSLA: 0.5854

F1 Scores:
AAPL: 0.4453
AMZN: 0.5474
GOOGL: 0.3301
META: 0.5761
MSFT: 0.5469
NFLX: 0.5548
NVDA: 0.6502
PYPL: 0.6084
TSLA: 0.5854

Confusion Matrices:
AAPL:
[[14  8]
 [14  5]]
AMZN:
[[24  4]
 [12  1]]
GOOGL:
[[13  9]
 [17  2]]
META:
[[22  5]
 [11  3]]
MSFT:
[[21  5]
 [12  3]]
NFLX:
[[17  8]
 [10  6]]
NVDA:
[[22  6]
 [ 8  5]]
PYPL:
[[19  3]
 [12  7]]
TSLA:
[[12  9]
 [ 8 12]]


  actual_movements = returns.applymap(categorize_movement)


In [None]:
len(y_true)

In [None]:
len(y_pred)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming result_df is defined and has 9 tickers
tickers = result_df.columns  # Skip 'date' column if it's the first column

# Set up the matplotlib figure
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Distribution of Price Movements for Each Ticker')

# Flatten the axes array for easy iteration
axes_flat = axes.flatten()

for i, ticker in enumerate(tickers):
    # Count the occurrences of each category for the current ticker
    category_counts = result_df[ticker].value_counts()
    
    # Plot a bar chart for the current ticker
    sns.barplot(x=category_counts.index, y=category_counts.values, ax=axes_flat[i])
    axes_flat[i].set_title(ticker)
    axes_flat[i].set_ylabel('Count')
    axes_flat[i].set_xlabel('Category')

# Adjust layout for better readability
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Show the plot
plt.show()


In [None]:
# plot the price
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Price Movement for Each Ticker')

# Flatten the axes array for easy iteration
axes_flat = axes.flatten()

for i, ticker in enumerate(tickers):
    # Count the occurrences of each category for the current ticker
    category_counts = df[ticker]
    
    # Plot a bar chart for the current ticker
    sns.lineplot(x=df['date'], y=category_counts, ax=axes_flat[i])
    axes_flat[i].set_title(ticker)
    axes_flat[i].set_ylabel('Price')
    axes_flat[i].set_xlabel('Date')
    
# Adjust layout for better readability, make the vertical space between subplots larger
plt.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=4)

# adjust the x-axis label to make it more readable
for ax in axes_flat:
    for label in ax.get_xticklabels():
        label.set_rotation(45)

# Show the plot
plt.show()

In [26]:
tickers

Index(['AAPL', 'GOOGL', 'AMZN', 'MSFT', 'TSLA', 'META', 'NVDA', 'PYPL',
       'NFLX'],
      dtype='object')

In [102]:
import yfinance as yf

# Fetch S&P 500 index data
sp500_data = yf.download('^GSPC', start=start_date, end=end_date)

# Resample S&P 500 data to weekly, assuming we align on Fridays
sp500_weekly_returns = sp500_data['Adj Close'].resample('W-FRI').last().pct_change().dropna()

# Function to calculate portfolio returns based on 'up'/'down'/'stable' signals
def portfolio_returns(returns, predictions):
    portfolio_returns = pd.Series(0, index=returns.index)
    for ticker in returns.columns:
        ticker_predictions = predictions[ticker].shift(-1)  # Adjusted as per previous fix
        weighted_returns = returns[ticker] * ticker_predictions.map({'up': 1, 'down': -1, 'stable': 0}).fillna(0)
        portfolio_returns += weighted_returns
    portfolio_returns /= len(returns.columns)
    return portfolio_returns

# Assuming df and result_df are already preprocessed and aligned correctly
# Convert the 'date' columns to datetime if they are not already
df['date'] = pd.to_datetime(df['date'])
result_df['date'] = pd.to_datetime(result_df['date'])

# Set 'date' as the index and calculate weekly returns for the stocks
weekly_returns = df.set_index('date').pct_change().dropna()

# Align and prepare the predictions in result_df
result_df.set_index('date', inplace=True)

# Calculate actual portfolio returns
actual_portfolio_returns = portfolio_returns(weekly_returns, result_df)

# Align the index of SP500 returns with the portfolio returns
sp500_weekly_returns = sp500_weekly_returns.reindex(actual_portfolio_returns.index).fillna(0)

# Assume a risk-free rate (annualized, convert to weekly)
risk_free_rate = 0.02 / 52

# Calculate Sharpe Ratio
portfolio_excess_returns_over_risk_free = actual_portfolio_returns - risk_free_rate
sharpe_ratio = portfolio_excess_returns_over_risk_free.mean() / portfolio_excess_returns_over_risk_free.std()

# Calculate Information Ratio
excess_returns_over_benchmark = actual_portfolio_returns - sp500_weekly_returns
information_ratio = excess_returns_over_benchmark.mean() / excess_returns_over_benchmark.std()

print(f"Sharpe Ratio: {sharpe_ratio}")
print(f"Information Ratio: {information_ratio}")

[*********************100%%**********************]  1 of 1 completed
Sharpe Ratio: 0.23607807214538062
Information Ratio: 0.25527200015020957


In [92]:
result_df['date'] = result_df.index
result_df.reset_index(drop=True, inplace=True)

In [97]:
import yfinance as yf

# Fetch S&P 500 index data
sp500_data = yf.download('^GSPC', start=start_date, end=end_date)

# Resample S&P 500 data to weekly, assuming we align on Fridays
sp500_weekly_returns = sp500_data['Adj Close'].resample('W-FRI').last().pct_change().dropna()

# Function to calculate portfolio returns based on 'up'/'down'/'stable' signals
def portfolio_returns(returns, predictions):
    portfolio_returns = pd.Series(0, index=returns.index)
    for ticker in returns.columns:
        ticker_predictions = predictions[ticker].shift(-1)  # Adjusted as per previous fix
        weighted_returns = returns[ticker] * ticker_predictions.map({'up': 1, 'down': -1, 'stable': 0}).fillna(0)
        portfolio_returns += weighted_returns
    portfolio_returns /= len(returns.columns)
    return portfolio_returns

# Assuming df and result_df are already preprocessed and aligned correctly
# Convert the 'date' columns to datetime if they are not already
df['date'] = pd.to_datetime(df['date'])
result_df['date'] = pd.to_datetime(result_df['date'])

# Set 'date' as the index and calculate weekly returns for the stocks
weekly_returns = df.set_index('date').pct_change().dropna()

# Align and prepare the predictions in result_df
result_df.set_index('date', inplace=True)

# Calculate actual portfolio returns
actual_portfolio_returns = portfolio_returns(weekly_returns, result_df)

# Align the index of SP500 returns with the portfolio returns
sp500_weekly_returns = sp500_weekly_returns.reindex(actual_portfolio_returns.index).fillna(0)

# Assume a risk-free rate (annualized, convert to weekly)
risk_free_rate = 0.02 / 52

# Calculate Sharpe Ratio
portfolio_excess_returns_over_risk_free = actual_portfolio_returns - risk_free_rate
sharpe_ratio = portfolio_excess_returns_over_risk_free.mean() / portfolio_excess_returns_over_risk_free.std()

# Calculate Information Ratio
excess_returns_over_benchmark = actual_portfolio_returns - sp500_weekly_returns
information_ratio = excess_returns_over_benchmark.mean() / excess_returns_over_benchmark.std()

print(f"Sharpe Ratio: {sharpe_ratio}")
print(f"Information Ratio: {information_ratio}")

[*********************100%%**********************]  1 of 1 completed
Sharpe Ratio: 0.2920021997986978
Information Ratio: 0.30903635418517006


In [31]:
raw_data = yf.download (tickers = "^GSPC", start = "1994-01-07", 
                              end = "2019-09-01", interval = "1d")

[*********************100%%**********************]  1 of 1 completed


In [32]:
raw_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1994-01-07,467.089996,470.260010,467.029999,469.899994,469.899994,324920000
1994-01-10,469.899994,475.269989,469.549988,475.269989,475.269989,319490000
1994-01-11,475.269989,475.279999,473.269989,474.130005,474.130005,305490000
1994-01-12,474.130005,475.059998,472.140015,474.170013,474.170013,310690000
1994-01-13,474.170013,474.170013,471.799988,472.470001,472.470001,277970000
...,...,...,...,...,...,...
2019-08-26,2866.699951,2879.270020,2856.000000,2878.379883,2878.379883,2859790000
2019-08-27,2893.139893,2898.790039,2860.590088,2869.159912,2869.159912,3537490000
2019-08-28,2861.280029,2890.030029,2853.050049,2887.939941,2887.939941,3102480000
2019-08-29,2910.370117,2930.500000,2905.669922,2924.580078,2924.580078,3177150000
