## Import dependencies

In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import streamlit as st

## Import dataset

In [None]:
# Ticker symbol used for every yfinance download in this notebook (Coca-Cola).
ticker = 'KO'
# Download the price history for `ticker` between the two dates.
# NOTE(review): yfinance documents `end` as exclusive, and recent releases
# return auto-adjusted prices by default — confirm against the installed version.
data = yf.download(ticker, start='2015-01-01', end='2023-12-31')
# Cell output: (rows, columns) of the downloaded frame.
data.shape

In [None]:
# Preview the first rows of the raw download.
data.head()

In [None]:
# Inspect the column index; a later cell drops level 1 from it.
data.columns

In [None]:
# Flatten the two-level column index returned by yfinance by dropping the
# second level (position 1, presumably the ticker level). The first level stays
# as the column labels; its leftover name 'Price' is cleared in a later cell.
# The isinstance guard keeps this cell safe to re-run and tolerant of yfinance
# versions that already return flat columns, where droplevel(1) would raise.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.droplevel(1)
data.columns

In [None]:
# Move the index into a regular column (in place) and replace it with a
# default integer index. Later cells read the resulting column as data['Date'].
data.reset_index(inplace=True)
data.columns

In [None]:
# Confirm the former index now appears as a column.
data.head()

In [None]:
# Clear the name attached to the column index (the leftover 'Price' label)
# so it no longer shows up when the frame is displayed.
data.columns.name = None
data.columns

In [None]:
# Preview the flattened frame.
data.head()

In [None]:
# # Get the current column names
# current_columns = data.columns.tolist()

# # Create a mapping for the new column names, skipping 'Date'
# new_columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']

# data.columns = new_columns

# data.columns

## Check for missing values

In [None]:
# Print dtypes and non-null counts for every column.
data.info()

In [None]:
# Count missing values per column.
data.isnull().sum()

## Fill missing values (forward fill, then 0 for any remaining gaps)

In [None]:
# Forward fill: carry the last known value forward into any gap so the
# series stays continuous.
data.ffill(inplace=True)

In [None]:
# Replace whatever is still NaN after the forward fill (i.e. leading rows with
# no earlier value to carry forward) with 0. This applies to every column.
# NOTE(review): a 0 in a price column would distort the moving averages and
# returns computed later — confirm no price NaNs actually reach this point.
data.fillna(0, inplace=True)

In [None]:
# Verify that no missing values remain.
data.isnull().sum()

In [None]:
# Preview the frame after filling.
data.head()

## Add moving averages

In [None]:
data['MA_20'] = data['Close'].rolling(window=20).mean()
data['MA_50'] = data['Close'].rolling(window=50).mean()

In [None]:
data.head()

## Add daily returns

In [None]:
# Fractional change of Close relative to the previous row; the first row is NaN
# because it has no predecessor.
data['Daily_Return'] = data['Close'].pct_change()

In [None]:
# Preview the frame with the new return column.
data.head()

## Add volatility (standard deviation of returns over a rolling window)

In [None]:
# Rolling standard deviation of Daily_Return over a 20-row window.
data['Volatility'] = data['Daily_Return'].rolling(window=20).std()

In [None]:
# Preview the frame with the new volatility column.
data.head()

## Drop rows with NA due to rolling calculations

In [None]:
# Shape before dropping, for comparison with the shape printed below.
data.shape

In [None]:
# Drop, in place, every row that contains at least one NaN — here the warm-up
# rows of the rolling / pct_change features. The index is not reset, so the
# remaining rows keep their original integer labels.
data.dropna(inplace=True)

In [None]:
# Shape after dropping.
data.shape

In [None]:
# Preview the cleaned frame.
data.head()

## Summary statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

## Line plot for stock prices

In [None]:
# Closing price with its two moving averages on a single chart.
# Each entry: (column in `data`, legend label, extra keyword args for plt.plot).
_price_series = (
    ('Close', 'Close Price', {}),
    ('MA_20', 'MA 20', {'linestyle': '--'}),
    ('MA_50', 'MA 50', {'linestyle': '--'}),
)

plt.figure(figsize=(12, 6))
for _column, _label, _line_kwargs in _price_series:
    plt.plot(data['Date'], data[_column], label=_label, **_line_kwargs)
plt.title('Coca-Cola Stock Prices with Moving Averages')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

## Correlation heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
# `data` still contains the datetime 'Date' column. pandas >= 2.0 no longer
# drops non-numeric columns from corr() by default, so restrict the
# computation to numeric columns explicitly.
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## Left-merge dividend and stock-split columns

In [None]:
# Load the local history CSV; later cells read its 'Date', 'Dividends' and
# 'Stock Splits' columns. The path is relative to the working directory.
KO_hist = pd.read_csv('Coca-Cola_stock_history.csv')
KO_hist.shape

In [None]:
# Preview the CSV contents.
KO_hist.head()

In [None]:
# Preview the price frame it will be merged into.
data.head()

In [None]:
# Convert 'Date' in KO_hist and then make it timezone-naive
KO_hist['Date'] = pd.to_datetime(KO_hist['Date'], format='mixed', utc=True).dt.tz_localize(None)

# Convert 'Date' in data (assuming the previous conversion was successful)
data['Date'] = pd.to_datetime(data['Date'])

# Print the data types of the 'Date' columns to verify
# print(f"\nData type of KO_hist['Date']: {KO_hist['Date'].dtype}")
# print(f"Data type of data['Date']: {data['Date'].dtype}")

# Merge the two dataframes based on the 'Date' column
data = pd.merge(data, KO_hist[['Date', 'Dividends', 'Stock Splits']], on='Date', how='left')

# Fill NaN values in the new columns with 0
data['Dividends'] = data['Dividends'].fillna(0)
data['Stock Splits'] = data['Stock Splits'].fillna(0)

In [None]:
data.head()

## Data Splitting

In [None]:
# Preview the frame the features are selected from.
data.head()

In [None]:
# Input columns for the model and the column it predicts.
# NOTE(review): Open/High/Low and Daily_Return come from the same row as the
# Close target — confirm this is intended rather than next-day forecasting.
features = ['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits', 'MA_20', 'MA_50', 'Daily_Return', 'Volatility']
target = 'Close'

In [None]:
# Feature matrix and target vector.
X = data[features]
y = data[target]

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Hold out 20% of the rows for testing. shuffle=False keeps the row order, so
# the test set is the final 20% of rows; random_state has no effect when
# shuffling is disabled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [None]:
# Sanity-check the sizes of the four splits.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

## Model Training

In [None]:
# Use Random Forest for initial predictions: a reasonable baseline model for tabular data.

In [None]:
# 100 trees; random_state fixed so repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict Close for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Error metrics on the held-out rows: mean squared error and mean absolute error.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

## Fetching Live Data

In [None]:
# Fetch the latest stock data: the most recent day at 1-minute intervals.
# NOTE(review): the frame may be empty when no intraday data is available
# (e.g. market closed) — later cells assume at least one row.

live_data = yf.download(ticker, period='1d', interval='1m')
live_data.shape

In [None]:
# Flatten the two-level column index returned by yfinance by dropping the
# second level (position 1). The guard tolerates yfinance versions that already
# return flat columns, where an unconditional droplevel(1) would raise.
if isinstance(live_data.columns, pd.MultiIndex):
    live_data.columns = live_data.columns.droplevel(1)
# Move the index into a regular column (read later as live_data['Datetime']).
live_data.reset_index(inplace=True)
# Clear the leftover name on the column index.
live_data.columns.name = None
live_data.head()

In [None]:
# Add the same derived feature columns the model was trained with.
# NOTE(review): live_data was downloaded at 1-minute intervals, so these
# windows span 20 / 50 one-minute rows and 'Daily_Return' is a per-row change,
# whereas the training frame came from the date-range download — confirm the
# model should be fed features on this different time scale.

live_data['MA_20'] = live_data['Close'].rolling(window=20).mean()
live_data['MA_50'] = live_data['Close'].rolling(window=50).mean()
live_data['Daily_Return'] = live_data['Close'].pct_change()
live_data['Volatility'] = live_data['Daily_Return'].rolling(window=20).std()

In [None]:
live_data.head()

In [None]:
# Ensure no missing values: the NaN warm-up rows of the rolling features are
# set to 0 here (the training frame dropped such rows instead).

live_data.fillna(0, inplace=True)

In [None]:
live_data.head()

In [None]:
# Derive a calendar-date column from the intraday timestamps; it is used as
# the join key against KO_hist in a later cell.
live_data['Datetime'] = pd.to_datetime(live_data['Datetime'])
# .dt.date yields plain datetime.date objects (object dtype).
live_data['Date'] = live_data['Datetime'].dt.date

In [None]:
live_data.head()

In [None]:
# Preview the history frame that supplies 'Dividends' and 'Stock Splits'.
KO_hist.head()

In [None]:
# Convert 'Date' in KO_hist and then make it timezone-naive
KO_hist['Date'] = pd.to_datetime(KO_hist['Date'], format='mixed', utc=True).dt.tz_localize(None)

# Convert 'Date' in live_data (assuming the previous conversion was successful)
live_data['Date'] = pd.to_datetime(live_data['Date'])

# Merge the two live_dataframes based on the 'Date' column
live_data = pd.merge(live_data, KO_hist[['Date', 'Dividends', 'Stock Splits']], on='Date', how='left')

# Fill NaN values in the new columns with 0
live_data['Dividends'] = live_data['Dividends'].fillna(0)
live_data['Stock Splits'] = live_data['Stock Splits'].fillna(0)

In [None]:
live_data.head()

In [None]:
# Latest data point: iloc[-1:] keeps the last row as a one-row DataFrame
# restricted to the model's feature columns.
# NOTE(review): NaNs in these columns were already filled with 0 in earlier
# cells, so dropna() should be a no-op here; if live_data is empty this frame
# is empty too and the predict call below will fail.

latest_features = live_data[features].iloc[-1:].dropna()
latest_features.shape

In [None]:
latest_features.head()

In [None]:
# Predict Close for the latest row with the trained model.
live_prediction = model.predict(latest_features)

In [None]:
print(f"Predicted Closing Price: {live_prediction[0]}")

In [None]:
# st.title('Coca-Cola Stock Price Prediction')
# # Upload visualization
# st.line_chart(data[['Close', 'MA_20', 'MA_50']])
# # Show prediction
# st.write(f"Predicted Closing Price: {live_prediction[0]}")