# Stock trading agent

## Data
We have a folder from https://www.kaggle.com/datasets/camnugent/sandp500 which includes the prices every market open day from 08-02-2013 to 08-02-2018 fro over 500 companies. These are companies in the S&P 500 and are the top 500 markets in the US, furthermore they represent the shares of the most commonly traded companies in the stock market.

The data has 5 columns for each company in its own file named by its stock name:
- Date: yy-mm-dd
- Open: price of stock at market open
- High: Highest price of stock in day
- Low: Lowest price of stock in day
- Close: Price of closing
- Volume: How many stocks traded that day for that company
- Name: The ticker name of the company in the stock market

In [29]:
from pathlib import Path

def count_files(directory_path):
    path = Path(directory_path)
    count = len([p for p in path.iterdir() if p.is_file()])
    return count

directory = 'individual_stocks_5yr'
file_count = count_files(directory)
print(f"Number of files in '{directory}': {file_count}")

Number of files in 'individual_stocks_5yr': 505


### Reading the data
Here we read the csv files for the first 100 companies and store the csv file in a panda dataframe and then store the 100 dataframes in a simple list.

In [30]:
# Schema: date | open | high | low | close | volume | name

# import os

# folder_path = 'individual_stocks_5yr'

# entries = os.listdir(folder_path)

# import pandas as pd

# stock_prices = []

# for i in range(100):
#     filename = folder_path + "/" + entries[i]
#     try:
#         df = pd.read_csv(filename)
#         stock_prices.append(df)
#     except:
#         print("Error: Error reading file (" + filename + ")")

# # A vector with the days for conveinience later
# calendar = stock_prices[0]['date']

# # print(len(stock_prices))


In [31]:
# Schema: date | open | high | low | close | volume | name

from pathlib import Path
import pandas as pd

def read_data(folder_path, amount = None):
    path = Path(folder_path)
    files = []

    for f in path.iterdir():
        if f.is_file() and f.suffix.lower() == '.csv':
            filename = f
            try:
                df = pd.read_csv(filename)
                files.append(df)
                if  amount:
                    amount -= 1
                    if amount == 0:
                        return files
            except:
                print("Error: Error reading file (" + filename + ")")

    return files

In [32]:
def inspect_data(comp_list):
    def check_for_na(comp_df):
        total_nas = comp_df.isna().sum().sum()
        uniform = True

        if total_nas != 0:
            print(f"Null values found. File: {comp_df.iat[0, 6]}. Count: {total_nas}")
            uniform = False
        return uniform

    def check_dfs_na(comp_list):
        print("Checking for null values:")
        if comp_list:
            for df in comp_list:
                unifrom = check_for_na(df)
        else:
            print("Error: No dataframes in comp_list!")
    def check_length(comp_list):
        print("Checking sizes:")
        default_shape = comp_list[0].shape
        uniform = True

        for i, df in enumerate(comp_list):
            if df.shape != default_shape:
                print(f"Unequal size detected. Deffault: {default_shape}. Detected size: {df.shape} in file {df.iat[0, 6]}")
                uniform = False
        if uniform:
            print("All dfs same shape")            

    def check_dates(comp_list):
        print("Checking for unequal dates:")
        calendar = comp_list[0]['date']
        uniform = True

        for i, df in enumerate(comp_list):
            if not df['date'].equals(calendar):
                print(f"Unequal dates detected for file {df.iat[0, 6]}")
                uniform = False
        if uniform:
            print('All dates are equal')
    
    if not comp_list or len(comp_list) <= 1:
            return
    
    check_dfs_na(comp_list)
    check_length(comp_list)
    check_dates(comp_list)

The above code shows that there are indeed Null values and unequal data for some companies in our dataset.

In [33]:
def clean_data(comp_list):
    if not comp_list or len(comp_list) == 0:
        print("Error: Input empty")
        return
    
    print("Cleaning dataframes from NaNs")
    counter = 0
    clean_list = []

    for df in comp_list:
        if df.isnull().values.any():
            print(f"NaNs detected in {df.iat[0, 6]}, will be dropped")
            counter += 1
        else:
            clean_list.append(df)

    print(f"NaN dataframes dropped {counter}, will equalize length now.")

    min_days = 730
    valid_companies = [df for df in clean_list if df.shape[0] >= min_days]

    date_sets = [set(df['date']) for df in valid_companies]

    common_dates = set.intersection(*date_sets)

    aligned_dfs = [
        df[df['date'].isin(common_dates)].sort_values('date').reset_index(drop=True)
        for df in valid_companies
    ]

    return aligned_dfs


In [34]:
comp_list = (read_data(folder_path='individual_stocks_5yr', amount=100))
inspect_data(comp_list)

Checking for null values:
Checking sizes:
Unequal size detected. Deffault: (1259, 7). Detected size: (975, 7) in file GOOG
Unequal size detected. Deffault: (1259, 7). Detected size: (781, 7) in file QRVO
Unequal size detected. Deffault: (1259, 7). Detected size: (1063, 7) in file ALLE
Checking for unequal dates:
Unequal dates detected for file GOOG
Unequal dates detected for file QRVO
Unequal dates detected for file ALLE


In [35]:
clean_list = clean_data(comp_list)
inspect_data(clean_list)
comp_count = len(clean_list)
print(f"We now have: {len(clean_list)}")

Cleaning dataframes from NaNs
NaN dataframes dropped 0, will equalize length now.
Checking for null values:
Checking sizes:
All dfs same shape
Checking for unequal dates:
All dates are equal
We now have: 100


Now all our data for all chosen companies are equal.

In [36]:
def momentum(comp_df, curr, n):
    price_n_ago = comp_df.iat[curr - n, 4]

    curr_price = comp_df.iat[curr, 4]
            
    return (curr_price - price_n_ago) / price_n_ago

def get_momentums(comp_list, curr):
    momentums = {}
    n = 5
    if curr < 5:
        raise ValueError("Error: Current date is less than n!")
    

    for comp in comp_list:
        momentums[comp.iat[0][6]] = momentum(comp, curr, n)

    return momentums



In [None]:
num_days = 700


n = 10

budget = 1000000
purchase_amount = 100

purchases = {}
sells = []

momentum_day = []
for i in range(num_days):
    momentums = {}

    for j in range(comp_count):
        if i >= n:
            comp_df = clean_list[j]

            price_n_ago = comp_df.iat[i - n, 1]

            curr_price = comp_df.iat[i, 1]

            curr_date = comp_df.iat[i, 0]
            
            momentum = (curr_price - price_n_ago) / price_n_ago
            stock_name = comp_df.iat[i, 6]
            momentums[stock_name] = momentum
            if momentum > 0 and budget > purchase_amount:
                # Key is stock name, list contains [how many stocks purchased, price at purchase, purchase date]
                new_purchase = [purchase_amount / curr_price, curr_price, curr_date]
                if stock_name in purchases:
                    purchases[stock_name].append(new_purchase)
                else:
                    purchases[stock_name] = [new_purchase, ]
                budget -= purchase_amount

            elif momentum < 0 and stock_name in purchases and purchases[stock_name]:
                old_purchases = [purchase for purchase in purchases[stock_name]]

                purchase_profit = 0
                for old_purchase in old_purchases:
                    purchase_profit += old_purchase[0] * curr_price
                new_sell = {stock_name: [purchase_profit / curr_price, purchase_profit, curr_date]}
                sells.append(new_sell)
                del purchases[stock_name]
                budget += purchase_profit
    momentum_day.append(momentums)

# print(budget)

initial_budget = 10000

last_day_index = num_days - 1

unrealized = 0
for stock_name, buys in purchases.items():

    comp_df = next(df for df in clean_list if df.iat[0, 6] == stock_name)
    curr_price = comp_df.iat[last_day_index, 4]
    
    for qty, _, _ in buys:
        unrealized += qty * curr_price

portfolio_value = budget + unrealized
profit = portfolio_value - initial_budget

print("Final cash:", budget)
print("Unrealized value:", unrealized)
print("Total portfolio value:", portfolio_value)
print("Total profit:", profit)



Final cash: 909303.2977473453
Unrealized value: 96351.62880875052
Total portfolio value: 1005654.9265560958
Total profit: 995654.9265560958
