<a href="https://colab.research.google.com/github/SoukainaElidrissi/Atelier1/blob/main/AtelierEx1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the
# download loop below.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The single entry here is a signed Google Cloud Storage URL: its query string
# carries X-Goog-Date=20240310T144226Z and X-Goog-Expires=259200 (seconds), so
# the link is time-limited and has to be regenerated once it has expired.
DATA_SOURCE_MAPPING = 'nyse:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F854%2F1575%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240310%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240310T144226Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D647fe2f02e5b79b7e86222eb7871952645bf573796d2949877082b3d5032829081743ab9ed58bf144d9f196a927070de2166af1d3e03c538d8c352786484007393b0bf15e45eea6b0037db495410be7a118dad3c3537d06520ed1d7c1e3d980b591eabae74db5da9dea3d5fb8ec6127088775c5dda6ea464350cd7b8e0d8a69963c0efa4c5a7062ce4f7ccac8e10a27bded8f78cf6f3c291c0d2ec960c208a7e00d4e811dbb44cf7dc8f9b6586f39b3d5ecd7997ccf24d32091192c07b1c22ed6a1db796f644176746524ed90718cf239e614a89c6894f9c2632cfe708641d148ee66d85ed8963bda0676dc5fc74e0bc64a03bd786cbc03e030acafb660a2230'

# Absolute locations that mimic Kaggle's directory layout.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): unmount /kaggle/input if it is a
# mount point; stderr is discarded so a missing mount stays silent.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory; ignore_errors covers the case where the
# directory does not exist yet.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working as symlinks to the directories above so that
# relative paths of that form resolve too. An already existing link is kept.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and unpack it (zip, otherwise tar)
# under KAGGLE_INPUT_PATH/<directory>. A failed download is reported and the
# loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a URL that still contains a literal
    # colon cannot break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be missing (the lookup then yields None); in that
            # case the progress bar simply stays empty instead of crashing.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing "tarfile" here
                # would replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below /kaggle/input.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)



In [None]:
# Part 1: Exploratory Data Analysis (EDA)
# Prices-split-adjusted
# The first CSV column (the date) becomes the index; parsing it as datetimes
# gives the time-series plots a real time axis instead of string categories.
df = pd.read_csv("/kaggle/input/nyse/prices-split-adjusted.csv", index_col=0, parse_dates=True)
df.info()
# Only the last expression of a notebook cell is rendered, so intermediate
# results are printed explicitly.
print(df.head())
# Report missing values before the rows that contain them are dropped.
print(df.isnull().sum())
df.dropna(inplace=True)
df.describe()

In [None]:
import matplotlib.pyplot as plt

# Overlay the closing-price history of a few tickers on a single chart.
symbols_to_plot = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']
plt.figure(figsize=(10, 6))
for symbol in symbols_to_plot:
    # Select this ticker's rows once and reuse them for both axes.
    symbol_rows = df.loc[df['symbol'] == symbol]
    plt.plot(symbol_rows.index, symbol_rows['close'], label=symbol)
plt.title('Closing Prices of Select Symbols over Time')
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.legend()
plt.show()


In [None]:
# Trading volume of AAPL against the frame's index.
aapl_rows = df.loc[df['symbol'] == 'AAPL']
plt.figure(figsize=(10, 6))
plt.plot(aapl_rows.index, aapl_rows['volume'])
plt.title('Trading Volume of AAPL over Time')
plt.xlabel('Date')
plt.ylabel('Volume')
plt.show()

In [None]:
# Replace +/-infinity with NaN, modifying df in place.
# NOTE(review): this runs after the earlier df.dropna(), so any NaN introduced
# here is not removed again - confirm that is intended.
df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
# Draw seaborn's pair plot for the whole frame.
# NOTE(review): df is passed unsampled; if the frame is large this cell may be
# slow - consider plotting a sample instead.
sns.pairplot(df)
plt.tight_layout()
plt.show()

In [None]:
# The closing price is the target; every other column stays in the features.
y = df['close']
X = df.drop(columns=['close'])

In [None]:
# Prices
prices_dataframe = pd.read_csv('/kaggle/input/nyse/prices.csv')
prices_dataframe.info()
# Only the last expression of a notebook cell is rendered, so intermediate
# results are printed explicitly.
print(prices_dataframe.head())
# Report missing values before the rows that contain them are dropped.
print(prices_dataframe.isnull().sum())
prices_dataframe.dropna(inplace=True)
prices_dataframe.describe()


In [None]:
# Show the column labels of prices_dataframe.
print(prices_dataframe.columns)

In [None]:
# Keep only the AAPL rows and remove the now-constant 'symbol' column.
# drop() without inplace returns a new frame; editing the .loc selection in
# place can make pandas emit a SettingWithCopyWarning.
prices_dataframe = prices_dataframe.loc[prices_dataframe['symbol'] == 'AAPL'].drop(columns='symbol')
print(prices_dataframe.shape)
prices_dataframe.head()

In [None]:
# Convert the 'date' column to datetimes and promote it to the index.
prices_dataframe = prices_dataframe.assign(
    date=pd.to_datetime(prices_dataframe['date'])
).set_index('date')

prices_dataframe.head()

In [None]:
# Plot each numerical column over time, one panel per column, on a 3x2 grid
# (five panels are used, the sixth slot stays empty).
panels = [
    ('open', 'Open Price', 'Open Price over Time'),
    ('close', 'Close Price', 'Close Price over Time'),
    ('low', 'Low Price', 'Low Price over Time'),
    ('high', 'High Price', 'High Price over Time'),
    ('volume', 'Volume', 'Volume over Time'),
]

plt.figure(figsize=(10, 6))
for position, (series_name, y_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(3, 2, position)
    plt.plot(prices_dataframe.index, prices_dataframe[series_name])
    plt.xlabel('Date')
    plt.ylabel(y_label)
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Keep a reproducible 75% random sample of the rows (no replacement), then
# expose the calendar parts of the datetime index as ordinary columns.
sampled_prices = prices_dataframe.sample(frac=0.75, replace=False, random_state=7)
prices_dataframe = sampled_prices.assign(
    year=sampled_prices.index.year,
    month=sampled_prices.index.month,
    day=sampled_prices.index.day,
)

In [None]:
# Fundamentals
fundamentals_dataframe = pd.read_csv('/kaggle/input/nyse/fundamentals.csv', index_col=['Unnamed: 0'])
fundamentals_dataframe.info()
# Only the last expression of a notebook cell is rendered, so intermediate
# results are printed explicitly.
print(fundamentals_dataframe.head())
# Report missing values before the rows that contain them are dropped.
print(fundamentals_dataframe.isnull().sum())
fundamentals_dataframe.dropna(inplace=True)
fundamentals_dataframe.describe()

In [None]:
fundamentals_dataframe.isnull().sum(axis=0)

# Fill the gaps of the listed columns with each column's own median.
# NOTE(review): the loading cell already calls dropna() on this frame, so there
# are presumably no missing values left to fill here - confirm which of the
# two strategies (drop rows vs. impute) is intended.
missing_columns = ['Current Ratio', 'Cash Ratio', 'Quick Ratio', 'For Year', 'Earnings Per Share', 'Estimated Shares Outstanding']
column_medians = fundamentals_dataframe[missing_columns].median()
fundamentals_dataframe[missing_columns] = fundamentals_dataframe[missing_columns].fillna(column_medians)

In [None]:
# Securities
securities_dataframe = pd.read_csv('/kaggle/input/nyse/securities.csv')
securities_dataframe.info()
# Only the last expression of a notebook cell is rendered, so intermediate
# results are printed explicitly.
print(securities_dataframe.head())
# Report missing values before the rows that contain them are dropped.
print(securities_dataframe.isnull().sum())
securities_dataframe.dropna(inplace=True)
securities_dataframe.describe()

In [None]:
# Keep only the rows whose 'Date first added' value is present.
has_date_added = securities_dataframe['Date first added'].notna()
securities_dataframe = securities_dataframe[has_date_added]

In [None]:
# Print the frame summary again, now that rows have been dropped.
securities_dataframe.info()

In [None]:
from numpy import vstack,sqrt
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset,DataLoader,random_split
from torch import Tensor
from torch.nn import ReLU,Module,MSELoss,Linear
from torch.optim import SGD
from torch.nn.init import xavier_uniform_
from tqdm import tqdm

# dataset definition and preparation
class CSVDataset(Dataset):
    """Header-less CSV file exposed as a PyTorch dataset.

    Every column except the last is an input feature; the last column is the
    target. Both are stored as float32 NumPy arrays: ``X`` with one row per
    sample and ``y`` with shape ``(n_rows, 1)``.
    """

    def __init__(self, path):
        """Load the CSV file at *path*; its first line is read as data."""
        values = read_csv(path, header=None).values
        # Slice before converting so each array owns its own float32 buffer.
        self.X = values[:, :-1].astype('float32')
        # The -1: slice keeps the column axis, giving the (n_rows, 1) target.
        self.y = values[:, -1:].astype('float32')

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, target]`` for the row at *idx*."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        """Randomly split the dataset into train and test subsets.

        *n_test* is the fraction of rows assigned to the test subset (rounded
        to the nearest whole row). Returns what ``random_split`` returns for
        the sizes ``[train_size, test_size]``.
        """
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        return random_split(self, [train_size, test_size])

# model definition
class MLP(Module):
    """Three-layer perceptron: n_inputs -> 10 -> 8 -> 1 with ReLU in between.

    The weights of every linear layer are re-initialised with Xavier-uniform
    right after the layer is created.
    """

    def __init__(self, n_inputs):
        super().__init__()
        # Layers are built (and initialised) in network order.
        self.hidden1 = self._xavier_linear(n_inputs, 10)
        self.act1 = ReLU()
        self.hidden2 = self._xavier_linear(10, 8)
        self.act2 = ReLU()
        # Final layer produces the single output; no activation follows it.
        self.hidden3 = self._xavier_linear(8, 1)

    @staticmethod
    def _xavier_linear(n_in, n_out):
        """Create a Linear layer whose weight is Xavier-uniform initialised."""
        layer = Linear(n_in, n_out)
        xavier_uniform_(layer.weight)
        return layer

    def forward(self, X):
        """Propagate *X* through the three layers and return the output."""
        first = self.act1(self.hidden1(X))
        second = self.act2(self.hidden2(first))
        return self.hidden3(second)

# prepare the dataset
def prepare_data(path):
    """Load the CSV at *path* and return ``(train_dl, test_dl)`` data loaders.

    The dataset is split with ``CSVDataset.get_splits()``. Both loaders use
    batches of 32; only the training loader shuffles.
    """
    train_split, test_split = CSVDataset(path).get_splits()
    loaders = (
        DataLoader(train_split, batch_size=32, shuffle=True),
        DataLoader(test_split, batch_size=32, shuffle=False),
    )
    return loaders

# train the model
def train_model(train_dl, model, epochs=100, lr=0.01, momentum=0.9):
    """Train *model* on *train_dl* with MSE loss and SGD.

    Parameters:
        train_dl: data loader yielding ``(inputs, targets)`` batches.
        model: the module to optimise; it is updated in place.
        epochs: number of passes over *train_dl*.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        A list with one entry per epoch: the mean of the batch losses of that
        epoch (NaN for an epoch in which the loader yielded no batch).
    """
    size = len(train_dl.dataset)
    # define the optimization
    criterion = MSELoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    epoch_losses = []
    # enumerate epochs
    for epoch in tqdm(range(epochs), desc='Training Epochs'):
        print(f"Epoch {epoch+1}\n-------------------------------")
        running_loss, n_batches = 0.0, 0
        # enumerate mini batches
        for batch, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()

            # Keep the tensor and its Python value under separate names.
            batch_loss = loss.item()
            running_loss += batch_loss
            n_batches += 1
            if batch % 100 == 0:
                current = batch * len(inputs)
                print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")
        epoch_losses.append(running_loss / n_batches if n_batches else float('nan'))
    return epoch_losses

# evaluate the model
def evaluate_model(test_dl, model):
    """Return the mean squared error of *model* over all batches of *test_dl*.

    *test_dl* yields ``(inputs, targets)`` batches. Predictions and targets
    are collected as NumPy arrays, stacked row-wise and passed to
    ``mean_squared_error``.
    """
    # Imported here so the function does not depend on a file-level import.
    from torch import no_grad

    predictions, actuals = [], []
    # Evaluation needs no gradients; skipping autograd bookkeeping saves
    # memory and time without changing the predictions.
    with no_grad():
        for inputs, targets in test_dl:
            # evaluate the model on the test set
            yhat = model(inputs).detach().numpy()
            actual = targets.numpy()
            # ensure the targets are a column, like the predictions
            actual = actual.reshape((len(actual), 1))
            predictions.append(yhat)
            actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate mse
    return mean_squared_error(actuals, predictions)
# predict the output for one row of data
def predict(row, model):
    """Run *model* on a single *row* and return the result as a NumPy array.

    The row is wrapped in a one-element batch before it is passed to the
    model, so the model receives a tensor with a leading dimension of 1.
    """
    single_row_batch = Tensor([row])
    return model(single_row_batch).detach().numpy()

In [None]:
# Build the training file and train the MLP: the inputs are low, high and
# open, and the target (close) goes last because CSVDataset treats the last
# column as the target.
data = pd.read_csv("/kaggle/input/nyse/prices-split-adjusted.csv")
df = data
# Selecting the four columns in order already leaves out date, symbol and
# volume, so no separate drop() is needed.
X = df[['low', 'high', 'open', 'close']]

csv_path = '/kaggle/working/MyWork.csv'
csv_path1 = '/kaggle/working/MyWork1.csv'

# Copy with a header row, kept for inspection.
X.to_csv(csv_path, index=False)
# CSVDataset reads with header=None, so the training file must not contain a
# header row. Writing it header-less directly avoids re-reading the first file
# with skiprows=1, which promoted the first data row to column labels (and
# could rename it if two of its values were equal).
X.to_csv(csv_path1, index=False, header=False)

train_dl, test_dl = prepare_data(csv_path1)
print(len(train_dl.dataset), len(test_dl.dataset))
# Every column except the target is an input.
model = MLP(X.shape[1] - 1)
train_losses = train_model(train_dl, model)