In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


#We chose linear regression for this problem because we want to predict the stock price, which is a number and not a category.
#Linear regression is simple, easy to interpret and works well with this kind of machine learning problem.
#The regression finds a relationship between the date and the stock price; it is quick and reliable.

#Load the GME stock price data straight from the course repository on GitHub
GME_df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/DAVE3625/Dave3625-Host-2025/refs/heads/main/Mandatory%20Assignments/MA2/data/GME_stock.csv'
)

#Show the first five rows to get a feel for the columns
GME_df.head(n=5)

In [None]:
#We need to preprocess the data first.
#The date is stored as text, so it has to be converted into a numerical feature before it can be used as model input.
#info() writes its summary to stdout itself and returns None, so it is called without print()
#(wrapping it in print() would add a stray "None" line to the output).
GME_df.info()

In [None]:
#Parse the date strings into timestamps, then derive an ordinal day number from each one.
#The ordinal is an integer count of days where January 1 of year 1 is day 1, which gives the model a numeric input.
GME_df["date"] = pd.to_datetime(GME_df["date"])
GME_df["date_ordinal"] = GME_df["date"].apply(pd.Timestamp.toordinal)

In [None]:
#Verifying conversion, all good
#info() prints the summary itself and returns None, so no print() wrapper is needed
GME_df.info()

In [None]:
#Count the missing values in every column; there are none
GME_df.isna().sum()

In [None]:
#The adjclose_price column appears to hold the same values as the close_price column.
#If that is true for every row, adjclose_price is redundant and can safely be dropped.
#Check whether the two columns are equal in all rows.
GME_df['close_price'].eq(GME_df['adjclose_price']).all()

In [None]:
#The check shows the columns are not identical, so both are kept.
#List the rows where close_price and adjclose_price differ.
GME_df.loc[GME_df['close_price'].ne(GME_df['adjclose_price'])]

In [None]:
#Select the feature and the target. The assignment says the input should be the date and
#the output should be the close price, so only those two columns are used to keep it simple.
X = GME_df[['date_ordinal']]   #2-D frame: scikit-learn estimators expect a feature matrix
y = GME_df['close_price']

#Split the data into training and testing sets; 20% of the rows are held out for testing.
#random_state is fixed so the same split is produced on every run.
#NOTE(review): train_test_split shuffles rows by default, so test dates are interleaved with
#training dates. For time-ordered data a chronological split (shuffle=False) may be a fairer test - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Train the model (LinearRegression from sklearn.linear_model, i.e. the linear regression
#this notebook sets out to use, not LogisticRegression, which is a classifier).
model = LinearRegression()
model.fit(X_train, y_train)

#Use the fitted model to predict the close price for the 20% test set
y_pred = model.predict(X_test)

#Evaluate the model. The stock price is a continuous number, so a confusion matrix does not apply here.
#Instead we use regression metrics: Mean Squared Error (MSE), which tells us how far off the predictions are,
#and the R² score, which tells us how well the model explains the variation in the data.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

#The model is not accurate at all; we need to add more features or change the regression model.