In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [14]:
def load_data(filepath):
    """Read a CSV file into a DataFrame and parse its 'Date' column.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; it must contain a 'Date' column.

    Returns
    -------
    pandas.DataFrame
        The file's contents with 'Date' converted by ``pd.to_datetime``.
    """
    raw = pd.read_csv(filepath)
    return raw.assign(Date=lambda frame: pd.to_datetime(frame['Date']))

def create_sliding_window(data, window_size=5):
    """Build sliding-window samples from a 2-D array.

    Column 1 of `data` supplies the values and column 0 supplies the label
    returned alongside each target. For every start position `s`, the sample
    is the `window_size` consecutive values beginning at `s`, the target is
    the value right after that window, and the label is column 0 of the
    target's row.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array indexed as ``data[row, col]``.
    window_size : int, default 5
        Number of consecutive values in each sample.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, dates)``, each holding ``len(data) - window_size`` entries
        (all empty when `data` is not longer than the window).
    """
    starts = range(len(data) - window_size)
    windows = [data[start:start + window_size, 1] for start in starts]
    targets = [data[start + window_size, 1] for start in starts]
    labels = [data[start + window_size, 0] for start in starts]
    return np.array(windows), np.array(targets), np.array(labels)

def prepare_data(data, train_ratio=0.8):
    """Turn a price table into chronological train/test sliding-window sets.

    The 'Date' and 'Close' columns are copied, ordered by date, and the
    close series is shifted up by one row before windowing, so the target
    paired with a given date is the close of the row that follows that date.
    The frame passed in is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Date' column and a 'Close' column.
    train_ratio : float, default 0.8
        Value in [0, 1] locating the split date: the date of the sample at
        position ``int(n_samples * train_ratio)`` (clamped to the last
        sample). Samples dated on or before the split date form the training
        set, so the split sample itself is part of training.

    Returns
    -------
    tuple
        ``((X_train, X_test, y_train, y_test), train_dates, test_dates)``
        where the X/y entries are float arrays and the date entries are
        ``pandas.DatetimeIndex`` objects.

    Raises
    ------
    ValueError
        If `train_ratio` is outside [0, 1], or if there are too few rows to
        build a single sliding window.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Build an independent, date-ordered frame in one chain instead of
    # assigning into / dropping from a column slice in place. A stable sort
    # keeps rows sharing a date in their original order.
    prices = (
        data[['Date', 'Close']]
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values('Date', kind='mergesort')
        .reset_index(drop=True)
        .assign(Close=lambda frame: frame['Close'].shift(-1))
        .dropna()
    )

    X, y, dates = create_sliding_window(prices.values)
    dates = pd.to_datetime(dates)
    total_points = len(dates)
    if total_points == 0:
        raise ValueError("not enough rows to build a single sliding window")

    # `prices.values` mixes timestamps and floats, so the windows come back
    # with object dtype; hand the models plain float arrays instead.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Clamp so that train_ratio == 1.0 selects the last sample's date
    # (everything in train, empty test) rather than indexing past the end.
    split_index = min(int(total_points * train_ratio), total_points - 1)
    split_date = dates[split_index]

    train_mask = dates <= split_date
    test_mask = dates > split_date

    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    return (X_train, X_test, y_train, y_test), dates[train_mask], dates[test_mask]

def train_model(model, X_train, y_train):
    """Fit `model` on the training data and hand the same object back.

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)``; it is fitted in place.
    X_train, y_train
        Training features and targets, passed to ``fit`` unchanged.

    Returns
    -------
    object
        The `model` argument itself (not the return value of ``fit``).
    """
    fit = model.fit
    fit(X_train, y_train)
    return model

def predict(model, X_test):
    """Return whatever ``model.predict`` produces for `X_test`.

    Parameters
    ----------
    model : object
        Any estimator exposing ``predict(X)``.
    X_test
        Features to predict on, passed through unchanged.
    """
    forecasts = model.predict(X_test)
    return forecasts

def evaluate(predictions, y_test):
    """Score predictions against the true targets.

    Parameters
    ----------
    predictions
        Predicted values.
    y_test
        True values; passed to ``mean_squared_error`` as the first argument.

    Returns
    -------
    tuple
        ``(mse, rmse)`` where ``rmse`` is ``np.sqrt(mse)``.
    """
    squared_error = mean_squared_error(y_test, predictions)
    return squared_error, np.sqrt(squared_error)

def visualize(train_dates, y_train, test_dates, y_test, predictions, company_name,
              plots_dir='/Users/lisirui/Desktop/FPP/Stock_prediction'):
    """Save one "predictions vs actual" PNG per model and return the output folder.

    Parameters
    ----------
    train_dates, test_dates : array-like
        Dates of the training and test targets.
    y_train, y_test : array-like
        Actual prices matching `train_dates` and `test_dates`.
    predictions : dict
        Maps a model's display name to its predictions. Each prediction
        array is aligned with the last ``len(preds)`` entries of the
        train-then-test concatenation, i.e. with the test part.
    company_name : str
        Name of the sub-folder created under `plots_dir`.
    plots_dir : str, optional
        Root folder for the plots. The default is the path that used to be
        hard-coded in this function; pass your own location to override it.

    Returns
    -------
    str
        The folder the PNG files were written to.
    """
    # make sure the directory exists
    company_dir = os.path.join(plots_dir, company_name)
    os.makedirs(company_dir, exist_ok=True)

    # sort dates once; the ordering is the same for every model
    full_dates = np.concatenate([train_dates, test_dates])
    full_actuals = np.concatenate([y_train, y_test])
    sorted_indices = np.argsort(full_dates)
    sorted_dates = full_dates[sorted_indices]
    sorted_actuals = full_actuals[sorted_indices]
    n_points = len(full_actuals)

    # visualize and save each plot separately
    for name, preds in predictions.items():
        preds = np.asarray(preds, dtype=float)

        # A float NaN-filled array works whatever dtype the actual prices
        # have; NaN leaves the training span of the prediction line blank.
        extended_preds = np.full(n_points, np.nan)
        # Use an explicit start offset: `[-len(preds):]` would select the
        # whole array when `preds` is empty.
        extended_preds[n_points - len(preds):] = preds

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(sorted_dates, sorted_actuals, label='Actual Prices', color='blue')
        ax.plot(sorted_dates, extended_preds[sorted_indices],
                label=f'{name} Predicted Prices', color='yellow')
        ax.set_title(f'{name} Predictions vs Actual')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price')
        ax.tick_params(axis='x', labelrotation=45)
        ax.legend()
        ax.grid(True)
        file_name = f'{name.replace(" ", "_").lower()}_predictions.png'
        fig.savefig(os.path.join(company_dir, file_name))
        # close explicitly so figures do not accumulate across models
        plt.close(fig)

    return company_dir

In [3]:
# Candidate regressors, keyed by the display name used in the printed metrics
# and in the saved plot file names.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=1000, random_state=42),
    # `random_state` is the scikit-learn-style seed argument of XGBRegressor;
    # it replaces the deprecated `seed` alias and keeps the same seed value.
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, random_state=16)
}

In [12]:
# Location of the input CSV. Despite its name, `data_dir` holds a file path, not a directory.
# NOTE(review): this is an absolute path on the author's machine — point it at your own copy of the data.
data_dir = '/Users/lisirui/Desktop/FPP/Data/First_Solar/first_solar_stock_data.csv' # your stock data path
# Read the CSV and parse its 'Date' column.
data = load_data(data_dir)
# Build the sliding-window samples and split them by date into train and test parts.
(X_train, X_test, y_train, y_test), train_dates, test_dates = prepare_data(data)

In [13]:
# Per-model results, keyed by the display names used in `models`.
predictions = {}  # name -> predictions for X_test
evaluations = {}  # name -> [mse, rmse]

# Fit every candidate on the training split, then score it on the test split.
for name, model in models.items():
    # train_model fits the estimator stored in `models` in place and returns that same object.
    trained_model = train_model(model, X_train, y_train)
    preds = predict(trained_model, X_test)
    
    mse, rmse = evaluate(preds, y_test)
    print(f"{name} MSE: {mse:.4f}, RMSE: {rmse:.4f}")
    predictions[name] = preds
    evaluations[name] = [mse, rmse]

Linear Regression MSE: 45.9654, RMSE: 6.7798
Decision Tree MSE: 822.7664, RMSE: 28.6839
Random Forest MSE: 658.9817, RMSE: 25.6706
XGBoost MSE: 751.0569, RMSE: 27.4054


In [10]:
# Name of the sub-folder that visualize() creates for this company's plots.
company_name = 'First_Solar' # your company name
# Writes one PNG per model and returns the folder they were saved in.
company_dir = visualize(train_dates, y_train, test_dates, y_test, predictions, company_name)

# save evaluations to json file (one column per model, rows 'MSE' and 'RMSE'), next to the plots
evaluations_df = pd.DataFrame(evaluations, index=['MSE', 'RMSE'])
evaluations_df.to_json(f'{company_dir}/evaluations.json')