In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


In [11]:
def split_train_test(df, time_steps=30):
    """Turn a price history into scaled LSTM windows and split them 80/20 in time order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close/Last' column holding either numbers or text such
        as '$1,234.56'. Rows are put in ascending index order before the
        rolling statistics are computed. The frame passed in is not modified.
    time_steps : int, default 30
        Number of consecutive rows in each input window.

    Returns
    -------
    X_train, X_test : numpy.ndarray of shape (n, time_steps, 2)
        Windows over the scaled 'Close/Last' and 'MA_10' columns.
    y_train, y_test : numpy.ndarray of shape (n,)
        Scaled 'Close/Last' of the row that immediately follows each window.
    scaler : MinMaxScaler
        Fitted on the columns ['Close/Last', 'MA_10', 'Volatility_10'] using
        only the rows that the training windows and targets touch.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1, or if fewer than
        ``time_steps + 1`` rows remain once the rolling features are computed.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    # '$' on its own is the regex end-of-string anchor and would match nothing
    # visible; inside a character class it is a literal dollar sign.
    close = df['Close/Last'].replace(r'[$,]', '', regex=True).astype(float)

    # Rolling windows are positional, so the rows must be in time order.
    if not close.index.is_monotonic_increasing:
        close = close.sort_index()

    # Build the features in a fresh frame so the caller's data stays untouched.
    features = close.to_frame(name='Close/Last')
    features['MA_10'] = features['Close/Last'].rolling(window=10).mean()
    features['Volatility_10'] = features['Close/Last'].rolling(window=10).std()
    features = features.dropna()  # the first 9 rows have no complete 10-row window

    values = features.to_numpy(dtype=float)
    n_sequences = len(values) - time_steps
    if n_sequences < 1:
        raise ValueError(
            f"Need more than {time_steps} rows after feature engineering to build "
            f"a sequence, but only {len(values)} are available"
        )

    split = int(n_sequences * 0.8)

    # Fit the scaler only on rows reachable from the training windows (the last
    # training target sits at row split + time_steps - 1), so the held-out
    # period does not influence the scaling.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(values[:split + time_steps])
    data_scaled = scaler.transform(values)

    # `:-1` drops the last column (Volatility_10), so every window carries
    # Close/Last and MA_10.
    # NOTE(review): the earlier comment read "all features except the target",
    # but the target is column 0 - confirm which inputs are intended.
    X = np.stack([data_scaled[i:i + time_steps, :-1] for i in range(n_sequences)])
    # The target is the first column (Close/Last) of the row after each window.
    y = data_scaled[time_steps:, 0].copy()

    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    return X_train, X_test, y_train, y_test, scaler

In [None]:
def train_model(X_train, y_train, time_steps=30, epochs=20, batch_size=32):
    """Build, compile and fit a stacked two-layer LSTM regressor.

    Parameters
    ----------
    X_train : array-like of shape (samples, time_steps, features)
        Input windows.
    y_train : array-like of shape (samples,)
        One target value per window.
    time_steps : int, default 30
        Expected window length. The network is always sized from ``X_train``
        itself; a different value here only triggers a warning, because a
        mismatching input shape would make ``fit`` fail.
    epochs : int, default 20
        Number of passes over the training data.
    batch_size : int, default 32
        Samples per gradient update.

    Returns
    -------
    The fitted ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``X_train`` is not 3-dimensional.
    """
    import warnings

    shape = tuple(np.shape(X_train))
    if len(shape) != 3:
        raise ValueError(
            f"X_train must have shape (samples, time_steps, features), got {shape}"
        )

    n_steps, n_features = shape[1], shape[2]
    if time_steps != n_steps:
        warnings.warn(
            f"time_steps={time_steps} does not match the window length of "
            f"X_train ({n_steps}); using {n_steps}",
            UserWarning,
            stacklevel=2,
        )

    # Two stacked LSTM layers: the first returns the full sequence so the
    # second can consume it; the second returns only its final state.
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(n_steps, n_features)),
        LSTM(50, return_sequences=False),
        Dense(25, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mean_squared_error')

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)

    return model

In [None]:
def predict_and_evaluate(model, scaler, X_test, y_test):
    """Predict on the test windows, report RMSE in original units and plot the result.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(X_test)``; expected to give one value per window.
    scaler : fitted scaler
        Must expose ``n_features_in_`` and ``inverse_transform``. The target is
        taken to be its first column.
    X_test : array-like
        Input windows passed straight to ``model.predict``.
    y_test : array-like
        Scaled true targets, one per window.

    Returns
    -------
    float
        Root mean squared error between the rescaled targets and predictions.
    """
    def to_original_units(column):
        # The scaler was fitted on several columns at once, so the single
        # target column is padded with zeros to that width, inverted, and then
        # sliced back out. The padding width follows the scaler instead of
        # assuming a fixed number of features.
        column = np.asarray(column, dtype=float).reshape(-1, 1)
        padding = np.zeros((column.shape[0], scaler.n_features_in_ - 1))
        return scaler.inverse_transform(np.hstack((column, padding)))[:, 0]

    y_pred_rescaled = to_original_units(model.predict(X_test))
    y_test_rescaled = to_original_units(y_test)

    rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))
    print(f"RMSE: {rmse}")

    # Plot actual against predicted values over the test period.
    plt.figure(figsize=(10, 6))
    plt.plot(range(len(y_test_rescaled)), y_test_rescaled, label="Actual")
    plt.plot(range(len(y_pred_rescaled)), y_pred_rescaled, label="Predicted")
    plt.title("Actual vs Predicted Prices")
    plt.xlabel("Test sample")
    plt.ylabel("Close/Last")
    plt.legend()
    plt.show()

    return rmse

In [None]:
# Run configuration, kept in one place so that data preparation and the model
# cannot drift apart.
# NOTE(review): DATA_PATH is an absolute, machine-specific location; point it at
# your own copy of the CSV (ideally a path relative to the project folder).
DATA_PATH = 'C:/Users/biel_/OneDrive/Documentos/Faculdade/Fase 3/amazon_stock_ml/HistoricalData_1731547025648.csv'
TIME_STEPS = 30   # window length; must be the same for the windows and the network
EPOCHS = 40
BATCH_SIZE = 32

# Load the dataset for preprocessing
df = pd.read_csv(DATA_PATH)
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

# Split data into training and testing sets
X_train, X_test, y_train, y_test, scaler = split_train_test(df, time_steps=TIME_STEPS)

# Train the model with the same window length that was used to build the data
model = train_model(X_train, y_train, time_steps=TIME_STEPS, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Predict and evaluate
rmse = predict_and_evaluate(model, scaler, X_test, y_test)