In [1]:
pip install pykeen scikit-learn torch pandas


Collecting pykeen
  Downloading pykeen-1.11.0-py3-none-any.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting tabulate (from pykeen)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.6.1-py3-none-any.whl.metadata (17 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.4-py3-none-any.whl.metadata (13 kB)
Collecting class-resolver>=0.5.1 (from pykeen)
  Downloading class_re

In [1]:
import json
import os

import numpy as np
import torch
from pykeen.losses import SoftplusLoss
from pykeen.models import TransE
# NOTE(review): recent PyKEEN releases may no longer re-export Adam from
# pykeen.optimizers — confirm against the installed version.
from pykeen.optimizers import Adam
from pykeen.training import TrainingLoop
from pykeen.training import SLCWATrainingLoop
from pykeen.triples import TriplesFactory

# Load entities, relations, and mapping
def load_entities(file_path):
    """Read entity IDs from a JSON-lines file.

    Every non-blank line must hold one JSON object with an ``id`` key.
    Blank lines (for example a trailing newline at the end of the file)
    are skipped instead of crashing the JSON parser.

    Args:
        file_path: Path of the JSON-lines file to read.

    Returns:
        list: The ``id`` value of each record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``id`` key.
    """
    entities = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of materialising the file with readlines().
        for line in file:
            line = line.strip()
            if not line:
                continue
            entities.append(json.loads(line)['id'])
    return entities

def load_relations(file_path):
    """Read (source, type, dest) relation triples from a JSON-lines file.

    Every non-blank line must hold one JSON object with the keys
    ``source``, ``type`` and ``dest``. Blank lines are skipped.

    Args:
        file_path: Path of the JSON-lines file to read.

    Returns:
        list[tuple]: ``(source, type, dest)`` for each record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the three keys.
    """
    relations = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            relation = json.loads(line)
            relations.append((relation['source'], relation['type'], relation['dest']))
    return relations

def load_values(file_path):
    """Read literal-value records from a JSON-lines file.

    Every non-blank line must hold one JSON object with an ``id`` key and a
    ``properties`` mapping. Blank lines are skipped.

    Args:
        file_path: Path of the JSON-lines file to read.

    Returns:
        list[tuple]: ``(id, amount, value)`` for each record, in file order,
        where ``amount`` and ``value`` are taken from ``properties`` and are
        ``None`` when the respective key is absent.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``id`` or ``properties`` key.
    """
    values = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            properties = record['properties']
            values.append((record['id'], properties.get('amount'), properties.get('value')))
    return values

def load_mapping(file_path):
    """Read a ``stock:entity_id`` mapping file into a dictionary.

    Each non-blank line is split at its first colon; the text before it
    becomes the key and the text after it the value. Blank lines are skipped.

    Args:
        file_path: Path of the mapping file to read.

    Returns:
        dict: Maps the text before the colon to the text after it. When a key
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    mapping = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue
            # partition() splits at the first colon only, so any further
            # colon stays in the value instead of breaking the unpacking.
            stock, separator, entity_id = line.partition(':')
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected 'stock:entity_id', got {line!r}"
                )
            mapping[stock] = entity_id
    return mapping

# Folder path
folder_path = 'WikiGraph'

# Load data
entities = load_entities(os.path.join(folder_path, 'entity.txt'))
relations = load_relations(os.path.join(folder_path, 'relation.txt'))
values = load_values(os.path.join(folder_path, 'Values.txt'))
mapping = load_mapping(os.path.join(folder_path, 'mapping.txt'))

# Entity label -> contiguous index, in first-seen order. setdefault() keeps the
# first index of a repeated entity, so duplicates cannot leave gaps that would
# push an index past len(entity_to_id).
entity_to_id = {}
for entity in entities:
    entity_to_id.setdefault(entity, len(entity_to_id))

# Relation label -> contiguous index, one entry per distinct relation *type*
# as it appears in the data (first-seen order keeps the mapping deterministic).
relation_to_id = {
    relation_type: idx
    for idx, relation_type in enumerate(dict.fromkeys(rel_type for _, rel_type, _ in relations))
}

# Prepare triples as (head index, relation index, tail index)
triples = []

# Relation triples: keep only those whose two endpoints are known entities
for source_id, relation_type, dest_id in relations:
    if source_id in entity_to_id and dest_id in entity_to_id:
        triples.append((entity_to_id[source_id], relation_to_id[relation_type], entity_to_id[dest_id]))

# Value triples: (entity, has_value, literal node).
# NOTE(review): each distinct amount becomes its own literal node so that the
# triple consists of indices only; this is a modelling choice to confirm.
# PyKEEN also ships literal-aware factories/models as an alternative.
for value_id, amount, _unused_value in values:
    if value_id not in entity_to_id or amount is None:
        continue
    # The relation and the literal node are registered on first use.
    has_value_id = relation_to_id.setdefault('has_value', len(relation_to_id))
    literal_id = entity_to_id.setdefault(f'literal:{amount}', len(entity_to_id))
    triples.append((entity_to_id[value_id], has_value_id, literal_id))

# Fail early with a clear message instead of building a wrongly shaped tensor
if not triples:
    raise ValueError(
        f"No triples could be built from '{folder_path}': "
        "no relation connects two known entities."
    )

# Convert triples to an (n, 3) long tensor
triples_tensor = torch.tensor(triples, dtype=torch.long)

# Wrap the index-based triples together with the label -> index mappings they
# were built from; the factory derives the entity/relation counts from them.
triples_factory = TriplesFactory(
    mapped_triples=triples_tensor,
    entity_to_id=entity_to_id,
    relation_to_id=relation_to_id,
)

# In PyKEEN the loss is owned by the model, not by the training loop
loss = SoftplusLoss()

# Initialize TransE model
model = TransE(
    triples_factory=triples_factory,
    embedding_dim=100,  # Adjust embedding dimension as needed
    loss=loss,
    random_seed=42,  # fixed seed so the embeddings are reproducible
)

# Optimise only the parameters that require gradients
optimizer = torch.optim.Adam(params=model.get_grad_params())

# TrainingLoop itself is abstract; sLCWA is the concrete negative-sampling loop
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=triples_factory,
    optimizer=optimizer,
)

# Train the model (epoch count and batch size are arguments of train())
training_loop.train(
    triples_factory=triples_factory,
    num_epochs=100,
    batch_size=512,  # Adjust batch size as needed
)

# Extract entity and relation embeddings. A representation is a module:
# calling it with indices=None returns the tensor for all entities/relations.
entity_embeddings = model.entity_representations[0](indices=None).detach().cpu().numpy()
relation_embeddings = model.relation_representations[0](indices=None).detach().cpu().numpy()

# Print embeddings
print("Entity Embeddings:", entity_embeddings[:5])
print("Relation Embeddings:", relation_embeddings[:5])


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fbf22789c60>>
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


ModuleNotFoundError: No module named 'pykeen'

# Random Forest with Embeddings

## Base TIs Model using WikiData


In [None]:
# Define DCG and nDCG functions
def dcg(scores, k):
    """Discounted cumulative gain of the first ``k`` scores.

    The score at 0-based position ``i`` is divided by ``log2(i + 2)`` and the
    discounted scores are summed.
    """
    discounted = []
    # Start the counter at 2 so it is directly the argument of the log discount.
    for log_argument, gain in enumerate(scores[:k], start=2):
        discounted.append(gain / np.log2(log_argument))
    return np.sum(discounted)

def ndcg(y_true, y_pred, k=10):
    """Normalised DCG at ``k``: true values ranked by predicted score.

    Returns the DCG of ``y_true`` ordered by descending ``y_pred`` divided by
    the DCG of ``y_true`` in descending order, or 0 when the latter is not
    positive.
    """
    # Positions ordered from highest to lowest predicted score
    ranking = np.flip(np.argsort(y_pred))
    gains_by_rank = np.array(y_true)[ranking]

    # Best achievable DCG: the true values themselves in descending order
    best_possible = dcg(sorted(y_true, reverse=True), k)
    if not best_possible > 0:
        return 0
    return dcg(gains_by_rank, k) / best_possible

# Load and prepare dataset
# Assuming `stock_data` is loaded as a pandas DataFrame
# Example: stock_data = pd.read_csv('your_stock_data.csv')
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# Feature engineering: one list is the single source of truth for the inputs.
# NOTE(review): 'entity_embeddings' and 'Relation Embeddings:' must exist in
# `stock_data` as numeric columns for the scaler to work — confirm how the
# TransE embeddings are joined onto the price data.
feature_columns = ['Open', 'High', 'Low', 'Volume', 'MA7', 'MA21','entity_embeddings','Relation Embeddings:']
X = stock_data[feature_columns]
y = stock_data['Close']

# Filter dates for training and testing
train_start_date = '2019-01-01'
train_end_date = '2022-05-31'
test_start_date = '2022-06-01'
test_end_date = '2023-12-31'

train_data = stock_data[(stock_data['Date'] >= train_start_date) & (stock_data['Date'] <= train_end_date)]
test_data = stock_data[(stock_data['Date'] >= test_start_date) & (stock_data['Date'] <= test_end_date)]

# Check for missing values and remove them
train_data = train_data.dropna()
test_data = test_data.dropna()

# Normalize features using StandardScaler (fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[feature_columns])
y_train = train_data['Close'].values
X_test = scaler.transform(test_data[feature_columns])
y_test = test_data['Close'].values

# Hyperparameter tuning using RandomizedSearchCV (faster than GridSearchCV)
# NOTE(review): cv=3 uses unshuffled K-fold, so some folds validate on data
# older than the training folds; consider a time-ordered splitter.
param_distributions = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions, n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

# Best model after tuning. With the default refit=True the search has already
# refitted it on the full training set, so no second fit() is needed.
rf_model = random_search.best_estimator_

# Predict on test data
predictions = rf_model.predict(X_test)

# Recompute Metrics
rmse = np.sqrt(mean_squared_error(y_test, predictions))
ndcg_score = ndcg(y_test, predictions, k=10)

# Output Results
print(f"nDCG@10: {ndcg_score}")
print(f"RMSE: {rmse}")

# Plot predicted vs actual for a small range of dates
date_range = test_data['Date'][:50]  # Select first 50 dates
plt.figure(figsize=(12, 6))
plt.plot(date_range, y_test[:50], label='Actual', marker='o')
plt.plot(date_range, predictions[:50], label='Predicted', marker='x')
plt.title('Predicted vs Actual Stock Prices')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.xticks(rotation=45)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.legend()
plt.grid()
plt.show()

## Advanced TIs Model using WikiData

In [None]:
# Define DCG and nDCG functions
def dcg(scores, k):
    """Discounted cumulative gain of the first ``k`` scores.

    The score at 0-based position ``i`` is divided by ``log2(i + 2)`` and the
    discounted scores are summed.
    """
    discounted = []
    # Start the counter at 2 so it is directly the argument of the log discount.
    for log_argument, gain in enumerate(scores[:k], start=2):
        discounted.append(gain / np.log2(log_argument))
    return np.sum(discounted)

def ndcg(y_true, y_pred, k=10):
    """Normalised DCG at ``k``: true values ranked by predicted score.

    Returns the DCG of ``y_true`` ordered by descending ``y_pred`` divided by
    the DCG of ``y_true`` in descending order, or 0 when the latter is not
    positive.
    """
    # Positions ordered from highest to lowest predicted score
    ranking = np.flip(np.argsort(y_pred))
    gains_by_rank = np.array(y_true)[ranking]

    # Best achievable DCG: the true values themselves in descending order
    best_possible = dcg(sorted(y_true, reverse=True), k)
    if not best_possible > 0:
        return 0
    return dcg(gains_by_rank, k) / best_possible

# Load and prepare dataset
# Assuming `stock_data` is loaded as a pandas DataFrame
# Example: stock_data = pd.read_csv('your_stock_data.csv')
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# Feature engineering: one list is the single source of truth for the inputs.
# NOTE(review): 'entity_embeddings' and 'Relation Embeddings:' must exist in
# `stock_data` as numeric columns for the scaler to work — confirm how the
# TransE embeddings are joined onto the price data.
feature_columns = ['Open', 'High', 'Low', 'Volume', 'MA7', 'MA21', 'Volatility', 'Close_Lag1', 'Close_Lag2', 'Return','entity_embeddings','Relation Embeddings:']
X = stock_data[feature_columns]
y = stock_data['Close']

# Filter dates for training and testing
train_start_date = '2019-01-01'
train_end_date = '2022-05-31'
test_start_date = '2022-06-01'
test_end_date = '2023-12-31'

train_data = stock_data[(stock_data['Date'] >= train_start_date) & (stock_data['Date'] <= train_end_date)]
test_data = stock_data[(stock_data['Date'] >= test_start_date) & (stock_data['Date'] <= test_end_date)]

# Check for missing values and remove them
train_data = train_data.dropna()
test_data = test_data.dropna()

# Normalize features using StandardScaler (fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[feature_columns])
y_train = train_data['Close'].values
X_test = scaler.transform(test_data[feature_columns])
y_test = test_data['Close'].values

# Hyperparameter tuning using RandomizedSearchCV (faster than GridSearchCV)
# NOTE(review): cv=3 uses unshuffled K-fold, so some folds validate on data
# older than the training folds; consider a time-ordered splitter.
param_distributions = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions, n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

# Best model after tuning. With the default refit=True the search has already
# refitted it on the full training set, so no second fit() is needed.
rf_model = random_search.best_estimator_

# Predict on test data
predictions = rf_model.predict(X_test)

# Recompute Metrics
rmse = np.sqrt(mean_squared_error(y_test, predictions))
ndcg_score = ndcg(y_test, predictions, k=10)

# Output Results
print(f"nDCG@10: {ndcg_score}")
print(f"RMSE: {rmse}")

# Plot predicted vs actual for a small range of dates
date_range = test_data['Date'][:50]  # Select first 50 dates
plt.figure(figsize=(12, 6))
plt.plot(date_range, y_test[:50], label='Actual', marker='o')
plt.plot(date_range, predictions[:50], label='Predicted', marker='x')
plt.title('Predicted vs Actual Stock Prices')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.xticks(rotation=45)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.legend()
plt.grid()
plt.show()

# LSTM with Embeddings

## Base TIs Model using WikiData

In [None]:
# Define features and target
# NOTE(review): 'entity_embeddings' and 'Relation Embeddings:' must exist in
# `stock_data` as numeric columns — confirm how the embeddings are joined.
features = ['Open', 'High', 'Low', 'Volume', 'MA7', 'MA21','entity_embeddings','Relation Embeddings:']
target = 'Close'
time_steps = 10


def _make_windows(X_scaled, y_scaled, window):
    """Stack sliding windows of ``window`` rows; each target is the row right after its window."""
    n_windows = len(X_scaled) - window
    if n_windows <= 0:
        raise ValueError(
            f"Need more than {window} rows to build LSTM windows, got {len(X_scaled)}"
        )
    X_windows = np.stack([X_scaled[i:i + window] for i in range(n_windows)])
    # Row i + window is the target of window i, i.e. every row from `window` on.
    y_windows = np.asarray(y_scaled[window:])
    return X_windows, y_windows


# Work on a copy so that running this cell never alters the shared
# `stock_data` frame that other cells read.
lstm_data = stock_data.copy()
lstm_data['Date'] = pd.to_datetime(lstm_data['Date'])

train_mask = (lstm_data['Date'] >= '2019-01-01') & (lstm_data['Date'] <= '2022-05-31')
test_mask = (lstm_data['Date'] >= '2022-06-01') & (lstm_data['Date'] <= '2023-12-31')

# Cap outliers in 'Return' to a reasonable range. The thresholds come from the
# training period only, so no test-period information leaks into preprocessing.
low = lstm_data.loc[train_mask, 'Return'].quantile(0.01)
high = lstm_data.loc[train_mask, 'Return'].quantile(0.99)
lstm_data['Return'] = lstm_data['Return'].clip(lower=low, upper=high)

# Split into training and testing sets; rows with missing inputs or target are
# dropped because NaNs pass through the scaler and turn the loss into NaN.
required_columns = features + [target]
train_data = lstm_data[train_mask].dropna(subset=required_columns)
test_data = lstm_data[test_mask].dropna(subset=required_columns)

# Define separate scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit scalers on training data
X_train_scaled = scaler_X.fit_transform(train_data[features])
y_train_scaled = scaler_y.fit_transform(train_data[[target]])

# Transform test data using the same scalers
X_test_scaled = scaler_X.transform(test_data[features])
y_test_scaled = scaler_y.transform(test_data[[target]])

# Convert to 3D arrays for LSTM: (samples, time_steps, n_features)
# NOTE(review): windows are taken over consecutive rows, which assumes the
# frame holds one ticker in chronological order — confirm.
X_train_lstm, y_train_lstm = _make_windows(X_train_scaled, y_train_scaled, time_steps)
X_test_lstm, y_test_lstm = _make_windows(X_test_scaled, y_test_scaled, time_steps)

# Debugging: Check shapes
print(f"Training data shape: {X_train_lstm.shape}, Training target shape: {y_train_lstm.shape}")
print(f"Testing data shape: {X_test_lstm.shape}, Testing target shape: {y_test_lstm.shape}")

# Build the LSTM model
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(time_steps, X_train_lstm.shape[2])),
    LSTM(32, activation='relu'),
    Dense(1)
])
# 'adam' selects Keras' Adam with its default learning rate of 0.001 and avoids
# relying on the name `Adam`, which the knowledge-graph cell imports from pykeen.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test split doubles as validation data here; it is only
# used for monitoring, but a separate validation split would be cleaner.
history = model.fit(
    X_train_lstm, y_train_lstm,
    validation_data=(X_test_lstm, y_test_lstm),
    epochs=20,
    batch_size=128,
    verbose=1
)

# Plot the training and validation loss over epochs
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Evaluate the model on the test set
loss = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f'Test Loss: {loss}')

# Make predictions using the LSTM model
predictions_scaled = model.predict(X_test_lstm)

# Inverse transform predictions and actual values
predictions = scaler_y.inverse_transform(predictions_scaled)
y_test_original = scaler_y.inverse_transform(y_test_lstm)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_original, predictions))
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Calculate nDCG@10
def ndcg_at_k(actual, predicted, k=10):
    """Normalised discounted cumulative gain at ``k``.

    Items are ranked by ``predicted`` (highest first); the gain of each ranked
    item is its ``actual`` value, discounted by ``log2(rank + 1)`` with ranks
    starting at 1. The sum over the top ``k`` is divided by the same sum for
    the ideal ranking (``actual`` in descending order).

    Args:
        actual: True values (gains), one per item.
        predicted: Predicted scores, one per item, aligned with ``actual``.
        k: Number of top-ranked items to score.

    Returns:
        The nDCG value, or 0 when the ideal DCG is not positive (including
        empty input).

    Raises:
        ValueError: If ``actual`` and ``predicted`` differ in length.
    """
    gains = np.asarray(actual, dtype=float).ravel()
    scores = np.asarray(predicted, dtype=float).ravel()
    if gains.size != scores.size:
        raise ValueError("actual and predicted must contain the same number of items")

    top = max(0, min(k, gains.size))
    discounts = np.log2(np.arange(2, top + 2))

    # Rank by prediction, then read off the TRUE gains in that order.
    ranking = np.argsort(scores, kind='stable')[::-1][:top]
    dcg = np.sum(gains[ranking] / discounts)
    idcg = np.sum(np.sort(gains)[::-1][:top] / discounts)
    return dcg / idcg if idcg > 0 else 0

# Score the whole test set with k=10, as the Random Forest cells do; slicing
# the first ten rows would only pick the earliest samples, not a top 10.
actual_values = y_test_original.flatten()
predicted_values = predictions.flatten()
# Stored as `ndcg_score` so the `ndcg` function from the Random Forest cells
# is not overwritten by a number.
ndcg_score = ndcg_at_k(actual_values, predicted_values, k=10)
print(f'nDCG@10: {ndcg_score}')

# Plot predictions vs actuals for a small range. The range is capped at the
# number of available samples so short test sets do not break the plot.
n_plot = min(100, len(y_test_original))
plt.figure(figsize=(10, 6))
plt.plot(range(n_plot), y_test_original[:n_plot], label='Actual', color='blue')
plt.plot(range(n_plot), predictions[:n_plot], label='Predicted', color='red', linestyle='--')
plt.xlabel('Sample Index')
plt.ylabel('Stock Price')
plt.title('Actual vs Predicted Stock Prices (Small Range)')
plt.legend()
plt.show()

## Advanced TIs Model using WikiData

In [None]:
# Define features and target (defined once: a second assignment further down
# used to replace this list and silently drop the embedding columns).
# NOTE(review): 'entity_embeddings' and 'Relation Embeddings:' must exist in
# `stock_data` as numeric columns — confirm how the embeddings are joined.
features = ['Open', 'High', 'Low', 'Volume', 'MA7', 'MA21', 'Return', 'Volatility', 'Close_Lag1', 'Close_Lag2','entity_embeddings','Relation Embeddings:']
target = 'Close'
time_steps = 10


def _make_windows(X_scaled, y_scaled, window):
    """Stack sliding windows of ``window`` rows; each target is the row right after its window."""
    n_windows = len(X_scaled) - window
    if n_windows <= 0:
        raise ValueError(
            f"Need more than {window} rows to build LSTM windows, got {len(X_scaled)}"
        )
    X_windows = np.stack([X_scaled[i:i + window] for i in range(n_windows)])
    # Row i + window is the target of window i, i.e. every row from `window` on.
    y_windows = np.asarray(y_scaled[window:])
    return X_windows, y_windows


# Work on a copy so that running this cell never alters the shared
# `stock_data` frame that other cells read.
lstm_data = stock_data.copy()
lstm_data['Date'] = pd.to_datetime(lstm_data['Date'])

train_mask = (lstm_data['Date'] >= '2019-01-01') & (lstm_data['Date'] <= '2022-05-31')
test_mask = (lstm_data['Date'] >= '2022-06-01') & (lstm_data['Date'] <= '2023-12-31')

# Cap outliers in 'Return' to a reasonable range. The thresholds come from the
# training period only, so no test-period information leaks into preprocessing.
low = lstm_data.loc[train_mask, 'Return'].quantile(0.01)
high = lstm_data.loc[train_mask, 'Return'].quantile(0.99)
lstm_data['Return'] = lstm_data['Return'].clip(lower=low, upper=high)

# Split into training and testing sets; rows with missing inputs or target are
# dropped because NaNs pass through the scaler and turn the loss into NaN.
required_columns = features + [target]
train_data = lstm_data[train_mask].dropna(subset=required_columns)
test_data = lstm_data[test_mask].dropna(subset=required_columns)

# Define separate scalers for features and target
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Fit scalers on training data
X_train_scaled = scaler_X.fit_transform(train_data[features])
y_train_scaled = scaler_y.fit_transform(train_data[[target]])

# Transform test data using the same scalers
X_test_scaled = scaler_X.transform(test_data[features])
y_test_scaled = scaler_y.transform(test_data[[target]])

# Convert to 3D arrays for LSTM: (samples, time_steps, n_features)
# NOTE(review): windows are taken over consecutive rows, which assumes the
# frame holds one ticker in chronological order — confirm.
X_train_lstm, y_train_lstm = _make_windows(X_train_scaled, y_train_scaled, time_steps)
X_test_lstm, y_test_lstm = _make_windows(X_test_scaled, y_test_scaled, time_steps)

# Debugging: Check shapes
print(f"Training data shape: {X_train_lstm.shape}, Training target shape: {y_train_lstm.shape}")
print(f"Testing data shape: {X_test_lstm.shape}, Testing target shape: {y_test_lstm.shape}")

# Build the LSTM model
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(time_steps, X_train_lstm.shape[2])),
    LSTM(32, activation='relu'),
    Dense(1)
])
# 'adam' selects Keras' Adam with its default learning rate of 0.001 and avoids
# relying on the name `Adam`, which the knowledge-graph cell imports from pykeen.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test split doubles as validation data here; it is only
# used for monitoring, but a separate validation split would be cleaner.
history = model.fit(
    X_train_lstm, y_train_lstm,
    validation_data=(X_test_lstm, y_test_lstm),
    epochs=20,
    batch_size=128,
    verbose=1
)

# Plot the training and validation loss over epochs
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Evaluate the model on the test set
loss = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f'Test Loss: {loss}')

# Make predictions using the LSTM model
predictions_scaled = model.predict(X_test_lstm)

# Inverse transform predictions and actual values
predictions = scaler_y.inverse_transform(predictions_scaled)
y_test_original = scaler_y.inverse_transform(y_test_lstm)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_original, predictions))
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Calculate nDCG@10
def ndcg_at_k(actual, predicted, k=10):
    """Normalised discounted cumulative gain at ``k``.

    Items are ranked by ``predicted`` (highest first); the gain of each ranked
    item is its ``actual`` value, discounted by ``log2(rank + 1)`` with ranks
    starting at 1. The sum over the top ``k`` is divided by the same sum for
    the ideal ranking (``actual`` in descending order).

    Args:
        actual: True values (gains), one per item.
        predicted: Predicted scores, one per item, aligned with ``actual``.
        k: Number of top-ranked items to score.

    Returns:
        The nDCG value, or 0 when the ideal DCG is not positive (including
        empty input).

    Raises:
        ValueError: If ``actual`` and ``predicted`` differ in length.
    """
    gains = np.asarray(actual, dtype=float).ravel()
    scores = np.asarray(predicted, dtype=float).ravel()
    if gains.size != scores.size:
        raise ValueError("actual and predicted must contain the same number of items")

    top = max(0, min(k, gains.size))
    discounts = np.log2(np.arange(2, top + 2))

    # Rank by prediction, then read off the TRUE gains in that order.
    ranking = np.argsort(scores, kind='stable')[::-1][:top]
    dcg = np.sum(gains[ranking] / discounts)
    idcg = np.sum(np.sort(gains)[::-1][:top] / discounts)
    return dcg / idcg if idcg > 0 else 0

# Score the whole test set with k=10, as the Random Forest cells do; slicing
# the first ten rows would only pick the earliest samples, not a top 10.
actual_values = y_test_original.flatten()
predicted_values = predictions.flatten()
# Stored as `ndcg_score` so the `ndcg` function from the Random Forest cells
# is not overwritten by a number.
ndcg_score = ndcg_at_k(actual_values, predicted_values, k=10)
print(f'nDCG@10: {ndcg_score}')

# Plot predictions vs actuals for a small range. The range is capped at the
# number of available samples so short test sets do not break the plot.
n_plot = min(100, len(y_test_original))
plt.figure(figsize=(10, 6))
plt.plot(range(n_plot), y_test_original[:n_plot], label='Actual', color='blue')
plt.plot(range(n_plot), predictions[:n_plot], label='Predicted', color='red', linestyle='--')
plt.xlabel('Sample Index')
plt.ylabel('Stock Price')
plt.title('Actual vs Predicted Stock Prices (Small Range)')
plt.legend()
plt.show()
