# **Machine Learning LSTM**
___

### **Load data**

In [None]:
import pandas as pd
import numpy as np


# Path of the raw stock-price CSV, relative to the notebook's working directory.
data_path = "../data/yahoo_massive_stock_data_2018-2023.csv"

df = pd.read_csv(data_path)

# display the first 5 rows
print(df.head())

# display data types of each column
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()


In [None]:
# Report, per column, how many entries are missing (header line first, then
# the per-column counts on the following lines).
print("is null sum:", df.isna().sum(), sep="\n")

In [None]:
# Count the rows that repeat an earlier row in every column, then report it.
duplicate_row_count = df.duplicated().sum()
print("duplicated sum:", duplicate_row_count)

### **Preprocessing data**

In [None]:
""" example csv dataset
Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
2018-11-29 00:00:00-05:00,43.829760572993,43.8633538041636,42.6395935832266,43.0835075378418,167080000,0,0,AAPL
2018-11-29 00:00:00-05:00,104.769074332185,105.519257086357,103.534594914971,104.636131286621,28123200,0,0,MSFT
2018-11-29 00:00:00-05:00,54.1764984130859,55.0074996948242,54.0999984741211,54.7290000915527,31004000,0,0,GOOGL
2018-11-29 00:00:00-05:00,83.7494964599609,84.4994964599609,82.6165008544922,83.6784973144531,132264000,0,0,AMZN
2018-11-29 00:00:00-05:00,39.6927840259795,40.0649038762231,38.7351954599368,39.0378532409668,54917200,0.04,0,NVDA
2018-11-29 00:00:00-05:00,135.919998168945,139.990005493164,135.660003662109,138.679992675781,24238700,0,0,META
2018-11-29 00:00:00-05:00,23.1333332061768,23.1666679382324,22.6366672515869,22.7446670532227,46210500,0,0,TSLA
"""

In [None]:
# Check for necessary columns before touching them, so that a malformed CSV
# fails with one clear message instead of a bare KeyError further down.
required_columns = ['Date', 'Close']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# convert `Date` column to datetime type and use it as the index
# NOTE(review): the sample rows carry a UTC offset (-05:00). If the file mixes
# offsets (e.g. across daylight-saving changes), pd.to_datetime may not yield
# a uniform datetime64 column -- confirm, and consider `utc=True` if so.
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

def calculate_simple_and_exponential_moving_average(df, periods, group_col="Company"):
    """Add moving averages of ``Close`` to ``df`` in place.

    For every ``period`` two columns are written:

    - ``MA_{period}``: simple rolling mean (NaN until ``period`` observations
      are available)
    - ``EMA_{period}``: exponentially weighted mean with ``span=period`` and
      ``adjust=False``

    When ``group_col`` names a column of ``df``, the averages are computed
    separately inside each group, so a window never mixes the prices of
    different companies stored in interleaved rows. Otherwise the whole
    ``Close`` column is treated as a single series.

    Parameters:
    - df: pandas.DataFrame with a ``Close`` column; modified in place
    - periods: iterable of int (window lengths)
    - group_col: str or None (column identifying independent price series)

    Returns:
    - None
    """
    if group_col is not None and group_col in df.columns:
        closes = df.groupby(group_col, sort=False)['Close']
        for period in periods:
            # transform() returns values in the original row order, so they are
            # assigned positionally; this also sidesteps label alignment when
            # the index contains repeated dates.
            df[f'MA_{period}'] = closes.transform(
                lambda s, p=period: s.rolling(window=p).mean()
            ).to_numpy()
            df[f'EMA_{period}'] = closes.transform(
                lambda s, p=period: s.ewm(span=p, adjust=False).mean()
            ).to_numpy()
        return

    for period in periods:
        # (MA) simple moving average
        df[f'MA_{period}'] = df['Close'].rolling(window=period).mean()
        # (EMA) exponential moving average
        df[f'EMA_{period}'] = df['Close'].ewm(span=period, adjust=False).mean()
    

def calculate_rsi(data, window=14):
    """
    Compute the RSI (Relative Strength Index) of a price series.

    Parameters:
    - data: pandas.Series (closing prices)
    - window: int (span of the exponential averages of gains and losses)

    Returns:
    - pandas.Series (RSI values on the same index as ``data``)
    """
    # Price change between consecutive observations; the first entry is NaN.
    price_change = data.diff()

    # Keep only the upward moves / only the downward moves (as positive
    # magnitudes); every other entry, including the leading NaN, becomes 0.
    upward_moves = price_change.where(price_change > 0, 0)
    downward_moves = -price_change.where(price_change < 0, 0)

    average_gain = upward_moves.ewm(span=window, adjust=False).mean()
    average_loss = downward_moves.ewm(span=window, adjust=False).mean()

    relative_strength = average_gain / average_loss
    return 100 - (100 / (1 + relative_strength))
        


ma_periods = [7, 14, 30, 100]

calculate_simple_and_exponential_moving_average(df, ma_periods)

# calculate the relative strength index (RSI)
# Rows of different companies are interleaved in the dataset, so the RSI is
# computed inside each company; otherwise every price change would compare
# the closing prices of two unrelated stocks.
if "Company" in df.columns:
    df["RSI_14"] = (
        df.groupby("Company", sort=False)["Close"]
        .transform(lambda close: calculate_rsi(close, window=14))
        .to_numpy()  # positional assignment: the Date index has repeated labels
    )
else:
    df["RSI_14"] = calculate_rsi(df['Close'], window=14)

# Fill NaN values with mean of numeric columns
# NOTE(review): this uses the mean over ALL rows (all companies, all dates),
# so e.g. the warm-up rows of the moving averages receive a value unrelated to
# that company's price level -- confirm this is intended, or drop those rows.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

print("Preprocessing Completed")
print(df.head())


# save the preprocessed dataset; `Date` is the index and is written as the
# first column
df.to_csv("../data/yahoo_massive_stock_data_2018-2023_preprocessed.csv", index=True)

In [None]:
""" example preprocessed csv dataset

Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company,MA_7,EMA_7,MA_14,EMA_14,MA_30,EMA_30,MA_100,EMA_100,RSI_14
2018-11-29 00:00:00-05:00,43.829760572993,43.8633538041636,42.6395935832266,43.0835075378418,167080000,0.0,0.0,AAPL,140.09596466303415,43.0835075378418,140.09241066496398,43.0835075378418,140.0905064445423,43.0835075378418,140.09118714755536,43.0835075378418,49.865364818081325
2018-11-29 00:00:00-05:00,104.769074332185,105.519257086357,103.534594914971,104.636131286621,28123200,0.0,0.0,MSFT,140.09596466303415,58.47166347503659,140.09241066496398,51.29052403767903,140.0905064445423,47.05464455389207,140.09118714755536,44.30237137445129,100.0
2018-11-29 00:00:00-05:00,54.1764984130859,55.0074996948242,54.0999984741211,54.7290000915527,31004000,0.0,0.0,GOOGL,140.09596466303415,57.53599762916562,140.09241066496398,51.74898751152885,140.0905064445423,47.54976426599921,140.09118714755536,44.508839269839434,51.66507741377641
2018-11-29 00:00:00-05:00,83.7494964599609,84.4994964599609,82.6165008544922,83.6784973144531,132264000,0.0,0.0,AMZN,140.09596466303415,64.07162255048749,140.09241066496398,56.00625548525208,140.0905064445423,49.88065026912527,140.09118714755536,45.28447606280208,63.47973776310287
2018-11-29 00:00:00-05:00,39.6927840259795,40.0649038762231,38.7351954599368,39.0378532409668,54917200,0.04,0.0,NVDA,140.09596466303415,57.81318022310732,140.09241066496398,53.74380185268072,140.0905064445423,49.18111497698602,140.09118714755536,45.160780561379596,44.23958268103335
2018-11-29 00:00:00-05:00,135.919998168945,139.990005493164,135.660003662109,138.679992675781,24238700,0.0,0.0,META,140.09596466303415,78.02988333627573,140.09241066496398,65.06862729576076,140.0905064445423,54.95523611884376,140.09118714755536,47.01264614780339,68.68464921784869
2018-11-29 00:00:00-05:00,23.1333332061768,23.1666679382324,22.6366672515869,22.7446670532227,46210500,0.0,0.0,TSLA,69.51280702863416,64.20857926551247,140.09241066496398,59.42543259675569,140.0905064445423,52.87713488880369,140.09118714755536,46.532092106326544,43.237230884116045

"""

### **Training model**
___

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout



# Model inputs (8 columns): the closing price, its 7/14/30-period simple and
# exponential moving averages, and the 14-period RSI. The value to predict is
# the `Close` of the row that follows each input window.
features = ["Close", "MA_7", "EMA_7", "MA_14", "EMA_14", "MA_30", "EMA_30", "RSI_14"]

# one group of rows per `Company`
company_groups = df.groupby("Company")
print("Number of companies:", len(company_groups))

# Min-max scale every numeric column into the [0, 1] range.
# NOTE(review): in the visible part of this file `scaled_data` is not consumed
# by the training loop, which reads unscaled values from `df` -- confirm
# whether that is intended.
numeric_cols = df.select_dtypes(include="number").columns
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit(df[numeric_cols]).transform(df[numeric_cols])

# create sequences of time steps
def create_dataset(data, time_steps=30):
    """Slice a feature table into overlapping windows and next-row targets.

    Window ``i`` holds rows ``i`` .. ``i + time_steps - 1`` of ``data`` (all
    columns); its target is the ``Close`` value of row ``i + time_steps``.

    Parameters:
    - data: pandas.DataFrame containing a ``Close`` column, rows in time order
    - time_steps: int (number of consecutive rows per window)

    Returns:
    - tuple ``(x, y)`` of numpy arrays: ``x`` has shape
      ``(n_windows, time_steps, n_columns)`` and ``y`` has shape
      ``(n_windows,)``, where ``n_windows = max(len(data) - time_steps, 0)``.
      With too few rows both arrays are empty but keep those shapes, so
      callers can still read ``x.shape[1]`` and ``x.shape[2]``.
    """
    # Convert once instead of slicing the DataFrame on every iteration.
    values = data.to_numpy()
    close_position = data.columns.get_loc('Close')
    n_windows = max(len(data) - time_steps, 0)

    if n_windows == 0:
        empty_x = np.empty((0, time_steps, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=values.dtype)
        return empty_x, empty_y

    x = np.stack([values[start:start + time_steps] for start in range(n_windows)])
    # Target of window i is the close of the row right after the window;
    # copy so the result does not alias the frame's underlying buffer.
    y = values[time_steps:, close_position].copy()
    return x, y


def train_lstm_model(company_input_shape) -> Sequential:
    """Build and compile the stacked-LSTM regressor; no fitting happens here.

    Parameters:
    - company_input_shape: shape of one input sample, passed to the first
      LSTM layer as ``input_shape``

    Returns:
    - Sequential: LSTM(128) -> Dropout(0.2) -> LSTM(32) -> Dropout(0.2) ->
      Dense(1), compiled with the Adam optimizer and mean-squared-error loss
    """
    network = Sequential([
        # First recurrent layer emits the full sequence for the next LSTM.
        LSTM(units=128, return_sequences=True, input_shape=company_input_shape),
        Dropout(0.2),
        # Second recurrent layer keeps only its final output.
        LSTM(units=32, return_sequences=False),
        Dropout(0.2),
        # Output layer: a single value, the predicted closing price.
        Dense(units=1),
    ])
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network


# Number of previous rows (days) used to predict the next `Close` price.
# Defined once: it is the same for every company.
time_steps = 30

# Build the training sequences of each company.
# NOTE: despite its name, `company_model` maps company -> (x, y) arrays; the
# models themselves are created in the training loop below.
company_model = {}
for company, group in company_groups:
    company_data = group[features]

    # create sequences of time steps
    x, y = create_dataset(company_data, time_steps)

    # A company with no more than `time_steps` rows yields no window at all;
    # skip it explicitly instead of failing later on `x_train.shape[1]`.
    if len(x) == 0:
        print(f"Skipping {company}: need more than {time_steps} rows, got {len(company_data)}")
        continue

    company_model[company] = (x, y)


# Train and save one model per company.
for count, (company, (x_train, y_train)) in enumerate(company_model.items(), start=1):
    print(f"\n#{count} Training model for {company}")
    model = train_lstm_model(company_input_shape=(x_train.shape[1], x_train.shape[2]))

    # training the model
    model.fit(x_train, y_train, epochs=20, batch_size=32, verbose=1)

    # save the model
    model.save(f"../models/lumina_{company}.h5")


print("Training Completed")