In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_stock_data(start_date, end_date, initial_price, num_records):
    dates = pd.date_range(start=start_date, end=end_date, freq='B')
    price = [initial_price]
    for _ in range(1, len(dates)):
        new_price = price[-1] + random.uniform(-1, 1)
        price.append(new_price)
    data = {'Date': dates, 'Price': price}
    df = pd.DataFrame(data)
    return df

# Example usage
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
initial_price = 100.0
num_records = 252  # Number of business days in a year (assuming no holidays)
stock_data = generate_stock_data(start_date, end_date, initial_price, num_records)

# Print the generated stock data
print(stock_data)


          Date       Price
0   2023-01-02  100.000000
1   2023-01-03  100.147438
2   2023-01-04  100.759346
3   2023-01-05  101.051248
4   2023-01-06  100.510627
..         ...         ...
255 2023-12-25  101.455499
256 2023-12-26  101.221032
257 2023-12-27  100.462950
258 2023-12-28   99.783508
259 2023-12-29  100.420023

[260 rows x 2 columns]


In [7]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_index_data(start_date, end_date, initial_price, num_records):
    dates = pd.date_range(start=start_date, end=end_date, freq='B')
    price = [initial_price]
    for _ in range(1, len(dates)):
        new_price = price[-1] + random.uniform(-1, 1)
        price.append(new_price)
    price = price[:len(dates)]  # Truncate the price list to match the length of dates
    data = {'Date': dates, 'Price': price}
    df = pd.DataFrame(data)
    return df
"""
def generate_stock_data(start_date, end_date, initial_price, num_records):
    dates = pd.date_range(start=start_date, end=end_date, freq='B')
    price = [initial_price]
    for _ in range(1, len(dates)):
        new_price = price[-1] + random.uniform(-1, 1)
        price.append(new_price)
    data = {'Date': dates, 'Price': price}
    df = pd.DataFrame(data)
    return df
"""

# Example usage
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
initial_price = 1000.0
num_records = 252  # Number of business days in a year (assuming no holidays)

index1_data = generate_index_data(start_date, end_date, initial_price, num_records)
index2_data = generate_index_data(start_date, end_date, initial_price, num_records)
index3_data = generate_index_data(start_date, end_date, initial_price, num_records)

# Print the generated index price data
print("Index 1:")
print(index1_data.head())

print("\nIndex 2:")
print(index2_data.head())

print("\nIndex 3:")
print(index3_data.head())


Index 1:
        Date        Price
0 2023-01-02  1000.000000
1 2023-01-03  1000.893872
2 2023-01-04  1000.943365
3 2023-01-05  1001.809529
4 2023-01-06  1001.765526

Index 2:
        Date        Price
0 2023-01-02  1000.000000
1 2023-01-03  1000.794060
2 2023-01-04  1001.297395
3 2023-01-05  1001.953689
4 2023-01-06  1002.936485

Index 3:
        Date        Price
0 2023-01-02  1000.000000
1 2023-01-03  1000.304409
2 2023-01-04  1000.376408
3 2023-01-05   999.483913
4 2023-01-06  1000.378147


In [10]:
def generate_macroeconomic_data(start_date, end_date, initial_value, num_records):
    dates = pd.date_range(start=start_date, end=end_date, freq='B')
    data = {'Date': dates}
    for series_name in ['Inflation', 'Unemployment']:
        series_data = [initial_value]
        for _ in range(1, len(dates)):
            new_value = series_data[-1] + random.uniform(-0.5, 0.5)
            series_data.append(new_value)
        series_data = series_data[:len(dates)]  # Truncate the series data to match the length of dates
        data[series_name] = series_data
    df = pd.DataFrame(data)
    return df

# Example usage
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
initial_inflation = 2.0
initial_unemployment = 5.0
num_records = 252  # Number of business days in a year (assuming no holidays)

macro_data = generate_macroeconomic_data(start_date, end_date, initial_inflation, num_records)

# Print the generated macroeconomic data
print(macro_data)


          Date  Inflation  Unemployment
0   2023-01-02   2.000000      2.000000
1   2023-01-03   1.739657      2.034125
2   2023-01-04   1.379861      1.869969
3   2023-01-05   1.166498      2.101371
4   2023-01-06   1.424351      2.521133
..         ...        ...           ...
255 2023-12-25  12.519147      0.455612
256 2023-12-26  12.286897      0.828066
257 2023-12-27  12.200301      1.322435
258 2023-12-28  12.276019      1.507405
259 2023-12-29  12.442782      1.729736

[260 rows x 3 columns]


In [24]:
import numpy as np
import pandas as pd

def generate_sentiment_probabilities(start_date, end_date, num_records):
    dates = pd.date_range(start=start_date, end=end_date, freq='B')
    num_samples = len(dates)
    data = {'Date': dates}
    for sentiment in ['Negative', 'Positive', 'Neutral']:
        if sentiment == 'Neutral':
            sentiment_data = np.random.uniform(0.4, 0.6, size=num_samples)
        else:
            dominant_prob = np.random.uniform(0.6, 0.7, size=num_samples)
            non_dominant_prob = (1 - dominant_prob) / 2
            if sentiment == 'Negative':
                sentiment_data = np.column_stack((dominant_prob, non_dominant_prob, non_dominant_prob)).flatten()
            else:
                sentiment_data = np.column_stack((non_dominant_prob, dominant_prob, non_dominant_prob)).flatten()
        data[sentiment] = sentiment_data[:num_samples]
    
    df = pd.DataFrame(data)
    return df

# Example usage
start_date = pd.to_datetime('2023-01-01')
end_date = pd.to_datetime('2023-12-31')
num_records = 252

sentiment_data = generate_sentiment_probabilities(start_date, end_date, num_records)

# Print the generated sentiment data
print(sentiment_data)


          Date  Negative  Positive   Neutral
0   2023-01-02  0.620936  0.163480  0.404099
1   2023-01-03  0.189532  0.673039  0.550648
2   2023-01-04  0.189532  0.163480  0.535775
3   2023-01-05  0.609956  0.188311  0.503220
4   2023-01-06  0.195022  0.623377  0.430421
..         ...       ...       ...       ...
255 2023-12-25  0.665339  0.179876  0.410703
256 2023-12-26  0.167331  0.640248  0.429177
257 2023-12-27  0.167331  0.179876  0.513836
258 2023-12-28  0.657253  0.155771  0.569793
259 2023-12-29  0.171374  0.688457  0.470962

[260 rows x 4 columns]


In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Merge the datasets based on the date column
merged_data = pd.merge(index1_data, index2_data, on='Date')
merged_data = pd.merge(merged_data, index3_data, on='Date')
merged_data = pd.merge(merged_data, macro_data, on='Date')
merged_data = pd.merge(merged_data, sentiment_data, on='Date')

# Split the dataset into training and testing sets
train_size = int(len(merged_data) * 0.8)
train_data = merged_data[:train_size]
test_data = merged_data[train_size:]



In [29]:
merged_data

Unnamed: 0,Date,Price_x,Price_y,Price,Inflation,Unemployment,Negative,Positive,Neutral
0,2023-01-02,1000.000000,1000.000000,1000.000000,2.000000,2.000000,0.620936,0.163480,0.404099
1,2023-01-03,1000.893872,1000.794060,1000.304409,1.739657,2.034125,0.189532,0.673039,0.550648
2,2023-01-04,1000.943365,1001.297395,1000.376408,1.379861,1.869969,0.189532,0.163480,0.535775
3,2023-01-05,1001.809529,1001.953689,999.483913,1.166498,2.101371,0.609956,0.188311,0.503220
4,2023-01-06,1001.765526,1002.936485,1000.378147,1.424351,2.521133,0.195022,0.623377,0.430421
...,...,...,...,...,...,...,...,...,...
255,2023-12-25,991.444547,1000.642415,994.417429,12.519147,0.455612,0.665339,0.179876,0.410703
256,2023-12-26,991.769384,1001.043238,995.392737,12.286897,0.828066,0.167331,0.640248,0.429177
257,2023-12-27,991.158673,1000.067359,995.522219,12.200301,1.322435,0.167331,0.179876,0.513836
258,2023-12-28,991.145991,1000.373667,994.591070,12.276019,1.507405,0.657253,0.155771,0.569793


In [34]:
from sklearn.preprocessing import MinMaxScaler

# Create a list of the numeric features
numeric_features = ['Price_x', 'Price_y', 'Price', 'Inflation', 'Unemployment', 'Negative', 'Positive', 'Neutral']

# Convert object features to numeric
train_data[numeric_features] = train_data[numeric_features].apply(pd.to_numeric)
test_data[numeric_features] = test_data[numeric_features].apply(pd.to_numeric)

# Preprocess the data
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data[numeric_features])
test_data_scaled = scaler.transform(test_data[numeric_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[numeric_features] = train_data[numeric_features].apply(pd.to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[numeric_features] = test_data[numeric_features].apply(pd.to_numeric)


In [35]:
train_data_scaled

array([[5.08310024e-01, 6.44080316e-02, 9.17613396e-01, ...,
        8.60608418e-01, 2.20133954e-02, 8.41159121e-06],
       [5.77366188e-01, 1.49054913e-01, 9.45104843e-01, ...,
        6.96957908e-02, 9.55973209e-01, 7.51369023e-01],
       [5.81189781e-01, 2.02710448e-01, 9.51607151e-01, ...,
        6.96957908e-02, 2.20133954e-02, 6.75114564e-01],
       ...,
       [4.85308085e-01, 3.31431195e-01, 3.02516006e-01, ...,
        2.55053493e-02, 8.55428111e-01, 7.13579691e-02],
       [5.03312256e-01, 2.69720851e-01, 2.54809639e-01, ...,
        2.55053493e-02, 7.22859443e-02, 7.50888105e-01],
       [4.42045016e-01, 2.39085879e-01, 2.36055288e-01, ...,
        9.66580827e-01, 8.59041878e-02, 4.85717617e-01]])

In [46]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

train_labels = train_data_scaled[:, 3]  # Index 3 corresponds to the 'Price' column
test_labels = test_data_scaled[:, 3]  # Index 3 corresponds to the 'Price' column

# Define the number of time steps and features
timesteps = 260
num_features = 9

# Reshape the input data
train_data_reshaped = train_data_scaled.reshape(timesteps, num_features)
test_data_reshaped = test_data_scaled.reshape(timesteps, num_features)

# Define the model architecture
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(timesteps, num_features)))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(train_data_reshaped, train_labels, epochs=10, batch_size=32)

# Evaluate the model
loss = model.evaluate(test_data_reshaped, test_labels)

# Make predictions
predictions = model.predict(test_data_reshaped)


ValueError: cannot reshape array of size 1664 into shape (260,9)

In [45]:
print(train_data_scaled.shape)



(208, 8)
