# Data Cleaning

In [None]:
import pandas as pd

# Round-3 price dumps, one semicolon-delimited CSV per day.
file_names = ["prices_round_3_day_0.csv", "prices_round_3_day_1.csv", "prices_round_3_day_2.csv"]

# Load each file into its own frame; the list follows the order of file_names.
dfs = [pd.read_csv(file_name, delimiter=';') for file_name in file_names]

# Stack the per-day frames end to end under a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

# Show which columns the combined frame exposes.
print(data.columns)

In [None]:
import pandas as pd


# Build one (product, mid_price) frame per product, each re-indexed from 0.
# Note that this rebinds `dfs` to a dict keyed by product name.
products = ["CHOCOLATE", "STRAWBERRIES", "ROSES", "GIFT_BASKET"]
dfs = {
    product: data.loc[data["product"] == product, ["product", "mid_price"]].reset_index(drop=True)
    for product in products
}

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One mid-price Series per product, keyed by product name.
product_mid_prices = {product: df_product["mid_price"] for product, df_product in dfs.items()}

# Place the series side by side; pandas aligns them on their index labels.
df_mid_prices = pd.DataFrame(product_mid_prices)

# Component-based basket value: 4 x CHOCOLATE + 6 x STRAWBERRIES + 1 x ROSES.
basket_from_components = (
    4 * df_mid_prices['CHOCOLATE']
    + 6 * df_mid_prices['STRAWBERRIES']
    + df_mid_prices['ROSES']
)
df_mid_prices['GIFTS'] = basket_from_components

print(df_mid_prices)

In [None]:
import pandas as pd

# Expects `dfs` to map each product name to a frame with a "mid_price" column.
products = ["STRAWBERRIES", "ROSES", "GIFT_BASKET", "CHOCOLATE"]

# describe() summary of the mid price, one entry per product in the order above.
stats = {product: dfs[product]["mid_price"].describe() for product in products}

# Print each summary followed by a blank line.
for product, product_stats in stats.items():
    print(f"Statistics for {product}:")
    print(product_stats)
    print()

In [None]:
import pandas as pd

# Expects `dfs` to map each product name to a frame with a "mid_price" column.
products = ["STRAWBERRIES", "ROSES", "GIFT_BASKET", "CHOCOLATE"]

# Mean absolute change between consecutive rows of mid_price, per product.
# diff() leaves a NaN in the first row, which mean() skips by default.
stats = {product: dfs[product]["mid_price"].diff().abs().mean() for product in products}

for product, avg_abs_diff in stats.items():
    print(f"Average Absolute Difference for {product}: {avg_abs_diff}")

# Linear Regression

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Candidate numbers of lagged prices to use as features: range(1, 6) yields 1 through 5.
window_sizes = range(1, 6)

for commodity in ["CHOCOLATE", "STRAWBERRIES", "ROSES"]:
    print(f"Results for {commodity}:")
    prices = dfs[commodity]['mid_price']

    # (window_size, train_rmse, test_rmse) for every window size tried.
    results = []

    for window_size in window_sizes:
        # Features are the previous `window_size` prices; the target is the current price.
        lagged_prices = {f'last_price_{lag}': prices.shift(lag) for lag in range(1, window_size + 1)}
        df = pd.DataFrame(lagged_prices)
        df['mid_price'] = prices

        # The first `window_size` rows have no complete history after shifting.
        df = df.dropna()

        X = df.drop(columns='mid_price')
        y = df['mid_price']

        # NOTE(review): train_test_split shuffles rows by default, so test rows are
        # interleaved in time with training rows - confirm this is intended for a
        # lagged-price model.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = LinearRegression().fit(X_train, y_train)

        train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
        test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

        # Render the fitted model as "y = b + (c1 * last_price_1) + ...".
        terms = [
            f" + ({coef:.2f} * last_price_{position})"
            for position, coef in enumerate(model.coef_, start=1)
        ]
        equation = f"y = {model.intercept_:.2f}" + "".join(terms)
        print(f"Equation for window size {window_size}:")
        print(equation)

        results.append((window_size, train_rmse, test_rmse))

    # Report every candidate before announcing the winner.
    for window_size, train_rmse, test_rmse in results:
        print("Window Size:", window_size)
        print("Train RMSE:", train_rmse)
        print("Test RMSE:", test_rmse)
        print()

    # Lowest test RMSE wins; on a tie the smallest window size is kept.
    optimal_window_size, _, min_test_rmse = min(results, key=lambda entry: entry[2])

    print("Optimal Window Size:", optimal_window_size)
    print("Corresponding Test RMSE:", min_test_rmse)
    print()

In [None]:
# Mean STRAWBERRIES mid price over every row in dfs["STRAWBERRIES"].
strawberry_mid_prices = dfs["STRAWBERRIES"]["mid_price"]
average_strawberry_price = strawberry_mid_prices.mean()

print("Average Strawberry Price:", average_strawberry_price)

# Comparing Gift Basket and Totals

In [None]:
import matplotlib.pyplot as plt

# Overlay the basket mid price and the component-based GIFTS value on one figure.
plt.figure(figsize=(10, 6))
for column in ('GIFT_BASKET', 'GIFTS'):
    plt.plot(df_mid_prices.index, df_mid_prices[column], label=column)
plt.title('Comparison of GIFT_BASKET and GIFTS')
plt.xlabel('Index')
plt.ylabel('Price')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np

# Row-by-row gap between the basket mid price and the component-based GIFTS value.
difference = df_mid_prices['GIFT_BASKET'].sub(df_mid_prices['GIFTS'])

# Summary statistics of that gap.
difference_stats = difference.describe()

print(difference_stats)

In [None]:
import matplotlib.pyplot as plt

# 20-bin histogram of the GIFT_BASKET minus GIFTS gap.
histogram_style = {'bins': 20, 'color': 'skyblue', 'edgecolor': 'black'}
plt.hist(difference, **histogram_style)
plt.xlabel('Difference')
plt.ylabel('Frequency')
plt.title('Distribution of Difference between GIFT_BASKET and GIFTS')
plt.grid(True)
plt.show()

In [None]:
# Imported here so this cell runs on its own in top-to-bottom order.
from sklearn.linear_model import LinearRegression

# No cell above this one assigns `predicted_cost`, so it is computed here:
# fit GIFT_BASKET on its three components and take the in-sample predictions.
component_prices = df_mid_prices[['CHOCOLATE', 'STRAWBERRIES', 'ROSES']]
basket_prices = df_mid_prices['GIFT_BASKET']
predicted_cost = LinearRegression().fit(component_prices, basket_prices).predict(component_prices)

# Residual of the regression: actual basket price minus fitted price.
actual_predicted_diff = basket_prices - predicted_cost

# Gap between the basket price and the fixed-weight GIFTS column.
gifts_diff = df_mid_prices['GIFT_BASKET'] - df_mid_prices['GIFTS']

# Draw both gaps on one figure so they can be compared directly.
plt.figure(figsize=(10, 6))
plt.plot(actual_predicted_diff, label='Actual - Predicted')
plt.plot(gifts_diff, label='GIFT_BASKET - GIFTS')
plt.title('Difference: Actual vs. Predicted Gift Basket Cost & GIFT_BASKET vs. GIFTS')
plt.xlabel('Sample')
plt.ylabel('Difference')
plt.legend()
plt.grid(True)
plt.show()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

import numpy as np
import matplotlib.pyplot as plt

# Features: the three component mid prices. Target: the basket mid price.
X = df_mid_prices[['CHOCOLATE', 'STRAWBERRIES', 'ROSES']]
y = df_mid_prices['GIFT_BASKET']

# Least-squares fit on the full data set (no hold-out split).
model = LinearRegression()
model.fit(X, y)

print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

# In-sample predictions and their root-mean-square error.
predicted_cost = model.predict(X)
residuals = predicted_cost - y
rmse = np.sqrt((residuals ** 2).mean())
print('RMSE:', rmse)

In [None]:
import matplotlib.pyplot as plt

# Expects `y` (actual basket prices) and `predicted_cost` (fitted values) to already exist.
# Draw actual, fitted, and fixed-weight GIFTS values on one figure.
labelled_series = (
    (y, 'Actual Gift Basket Cost'),
    (predicted_cost, 'Predicted Gift Basket Cost'),
    (df_mid_prices['GIFTS'], 'GIFTS Column'),
)
plt.figure(figsize=(10, 6))
for series, label in labelled_series:
    plt.plot(series, label=label)
plt.title('Actual vs. Predicted vs. GIFTS Gift Basket Cost')
plt.xlabel('Sample')
plt.ylabel('Cost')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# One line per basket component, all on the same price axis.
plt.figure(figsize=(10, 6))
for component in ('CHOCOLATE', 'STRAWBERRIES', 'ROSES'):
    plt.plot(df_mid_prices[component], label=component)
plt.title('Commodities Making Up the Gift Basket')
plt.xlabel('Sample')
plt.ylabel('Price')
plt.legend()
plt.grid(True)
plt.show()

# Component Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Rescale every column to zero mean and unit variance before the decomposition.
# NOTE(review): if df_mid_prices still carries the GIFTS column, it is an exact
# linear combination of CHOCOLATE, STRAWBERRIES and ROSES - confirm it should
# be part of the PCA input.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_mid_prices)

# Full PCA (no component limit) on the scaled matrix.
pca = PCA().fit(scaled_data)

# Component directions, kept for later inspection.
components = pca.components_

# Share of total variance carried by each component.
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:")
print(explained_variance_ratio)

# Bar chart with components numbered from 1.
component_numbers = range(1, len(explained_variance_ratio) + 1)
plt.bar(component_numbers, explained_variance_ratio)
plt.title('Explained Variance Ratio by Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.show()

In [None]:
# Pairwise correlation between every pair of columns in df_mid_prices.
correlation_matrix = df_mid_prices.corr()

# Annotated heatmap, values shown to two decimals.
heatmap_options = {'annot': True, 'cmap': "coolwarm", 'fmt': ".2f"}
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Heatmap of Mid Prices")
plt.xlabel("Product")
plt.ylabel("Product")
plt.show()

# Manual Trading

# Expedition 1

In [None]:
import matplotlib.pyplot as plt

# Problem inputs; multipliers and hunters are paired by position.
expedition_costs = [0, 25000, 75000]
multipliers = [24, 70, 41, 21, 60, 47, 82, 87, 80, 35, 73, 89, 100, 90, 17, 77, 83, 85, 79, 55, 12, 27, 52, 15, 30]
hunters = [2, 4, 3, 2, 4, 3, 5, 5, 5, 3, 4, 5, 8, 7, 2, 5, 5, 5, 5, 4, 2, 3, 4, 2, 3]
base_treasure = 7500

# Profit curve per multiplier for percentages 0..100. The percentage is added to the
# hunter count in the divisor, and the cost subtracted is expedition_costs[0]
# (the first entry, 0).
total_profits_one = {
    multiplier: [
        (base_treasure * multiplier) / (hunter_count + percentage) - expedition_costs[0]
        for percentage in range(101)
    ]
    for multiplier, hunter_count in zip(multipliers, hunters)
}

# One line per multiplier across the whole percentage sweep.
plt.figure(figsize=(10, 6))
for multiplier, profit_curve in total_profits_one.items():
    plt.plot(range(101), profit_curve, label=f"Multiplier {multiplier}", marker='o', linestyle='-')

plt.title('Total Profit vs. Percentage of Hunters')
plt.xlabel('Percentage of Hunters')
plt.ylabel('Total Profit')
plt.xticks(range(0, 101, 5))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Add up each multiplier's profits over every percentage stored in total_profits_one.
sum_profits_one = {multiplier: sum(profits) for multiplier, profits in total_profits_one.items()}

# Largest summed profit first.
ranked_multipliers = sorted(
    sum_profits_one,
    key=lambda multiplier: sum_profits_one[multiplier],
    reverse=True,
)

print("Ranked Multipliers based on Sum of Total Profits:")
for rank, multiplier in enumerate(ranked_multipliers, start=1):
    print(f"Rank {rank}: Multiplier - {multiplier}, Sum of Total Profits - {sum_profits_one[multiplier]}")

In [None]:
import matplotlib.pyplot as plt

# Problem inputs; multipliers and hunters are paired by position.
expedition_costs = [0, 25000, 75000]
multipliers = [24, 70, 41, 21, 60, 47, 82, 87, 80, 35, 73, 89, 100, 90, 17, 77, 83, 85, 79, 55, 12, 27, 52, 15, 30]
hunters = [2, 4, 3, 2, 4, 3, 5, 5, 5, 3, 4, 5, 8, 7, 2, 5, 5, 5, 5, 4, 2, 3, 4, 2, 3]
base_treasure = 7500

# Profit curve per multiplier, restricted to percentages 40..100. The percentage is
# added to the hunter count in the divisor, and the cost subtracted is
# expedition_costs[0] (the first entry, 0).
total_profits_one = {
    multiplier: [
        (base_treasure * multiplier) / (hunter_count + percentage) - expedition_costs[0]
        for percentage in range(40, 101)
    ]
    for multiplier, hunter_count in zip(multipliers, hunters)
}

# One line per multiplier; the x values match the 40..100 sweep above.
plt.figure(figsize=(10, 6))
for multiplier, profit_curve in total_profits_one.items():
    plt.plot(range(40, 101), profit_curve, label=f"Multiplier {multiplier}", marker='o', linestyle='-')

plt.title('Total Profit vs. Percentage of Hunters')
plt.xlabel('Percentage of Hunters')
plt.ylabel('Total Profit')
plt.xticks(range(25, 101, 5))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Add up each multiplier's profits over every percentage stored in total_profits_one.
sum_profits_one = {multiplier: sum(profits) for multiplier, profits in total_profits_one.items()}

# Rank the multipliers by that sum, largest first.
ranked_multipliers_one = sorted(sum_profits_one, key=sum_profits_one.get, reverse=True)

# The labels describe what is actually ranked: a sum (not a maximum) over the
# 40..100 sweep that fills total_profits_one in the preceding cell.
print("Ranked Multipliers based on Sum of Total Profits for Expedition 1 (Percentage Range: 40 to 100):")
for rank, multiplier in enumerate(ranked_multipliers_one, start=1):
    sum_profit = sum_profits_one[multiplier]
    print(f"Rank {rank}: Multiplier - {multiplier}, Sum of Total Profits - {sum_profit}")

In [None]:
# Multiplier divided by its positionally paired hunter count, keyed by multiplier.
ratio_multiplier_hunters = {
    multiplier: multiplier / hunter_count
    for multiplier, hunter_count in zip(multipliers, hunters)
}

# Highest ratio first.
ranked_multipliers = sorted(
    ratio_multiplier_hunters,
    key=lambda multiplier: ratio_multiplier_hunters[multiplier],
    reverse=True,
)

print("Ranked Multipliers based on Multiplier to Hunter Ratio:")
for rank, multiplier in enumerate(ranked_multipliers, start=1):
    print(f"Rank {rank}: Multiplier - {multiplier}, Ratio - {ratio_multiplier_hunters[multiplier]}")

In [None]:
import matplotlib.pyplot as plt

# Problem inputs; multipliers and hunters are paired by position.
expedition_costs = [0, 25000, 75000]
multipliers = [24, 70, 41, 21, 60, 47, 82, 87, 80, 35, 73, 89, 100, 90, 17, 77, 83, 85, 79, 55, 12, 27, 52, 15, 30]
hunters = [2, 4, 3, 2, 4, 3, 5, 5, 5, 3, 4, 5, 8, 7, 2, 5, 5, 5, 5, 4, 2, 3, 4, 2, 3]
base_treasure = 7500

# Profit curve per multiplier for percentages 10..25, so list index k holds the
# profit at percentage 10 + k. The percentage is added to the hunter count in the
# divisor, and the cost subtracted is expedition_costs[1] (the second entry, 25000).
total_profits_two = {
    multiplier: [
        (base_treasure * multiplier) / (hunter_count + percentage) - expedition_costs[1]
        for percentage in range(10, 26)
    ]
    for multiplier, hunter_count in zip(multipliers, hunters)
}

# One line per multiplier; the x values match the 10..25 sweep above.
plt.figure(figsize=(10, 6))
for multiplier, profit_curve in total_profits_two.items():
    plt.plot(range(10, 26), profit_curve, label=f"Multiplier {multiplier}", marker='o', linestyle='-')

plt.title('Total Profit vs. Percentage of Hunters')
plt.xlabel('Percentage of Hunters')
plt.ylabel('Total Profit')
plt.xticks(range(0, 30, 1))
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# total_profits_two is filled by a sweep over range(10, 26), so position k in each
# profit list belongs to percentage 10 + k. Enumerating from this start value
# records real percentages instead of list positions.
PERCENTAGE_START = 10

# Highest percentage at which each multiplier still has a non-negative profit;
# -1 means the profit is negative at every percentage in the sweep.
highest_positive_percentage = {multiplier: -1 for multiplier in multipliers}

for multiplier, profits in total_profits_two.items():
    # Later matches overwrite earlier ones, so the last non-negative entry wins.
    for percentage, profit in enumerate(profits, start=PERCENTAGE_START):
        if profit >= 0:
            highest_positive_percentage[multiplier] = percentage

# Rank the multipliers by that percentage, highest first.
ranked_multipliers = sorted(highest_positive_percentage, key=highest_positive_percentage.get, reverse=True)

# Display the ranked multipliers
print("Ranked Multipliers based on Highest Positive Percentage Value:")
for rank, multiplier in enumerate(ranked_multipliers, start=1):
    percentage_value = highest_positive_percentage[multiplier]
    print(f"Rank {rank}: Multiplier - {multiplier}, Highest Positive Percentage Value - {percentage_value}")

In [None]:
# NOTE(review): this cell repeats the computation of the cell directly before it.
# total_profits_two is filled by a sweep over range(10, 26), so position k in each
# profit list belongs to percentage 10 + k. Enumerating from this start value
# records real percentages instead of list positions.
PERCENTAGE_START = 10

# Highest percentage at which each multiplier still has a non-negative profit;
# -1 means the profit is negative at every percentage in the sweep.
highest_positive_percentage = {multiplier: -1 for multiplier in multipliers}

for multiplier, profits in total_profits_two.items():
    # Later matches overwrite earlier ones, so the last non-negative entry wins.
    for percentage, profit in enumerate(profits, start=PERCENTAGE_START):
        if profit >= 0:
            highest_positive_percentage[multiplier] = percentage

# Rank the multipliers by that percentage, highest first.
ranked_multipliers = sorted(highest_positive_percentage, key=highest_positive_percentage.get, reverse=True)

# Display the ranked multipliers
print("Ranked Multipliers based on Highest Positive Percentage Value:")
for rank, multiplier in enumerate(ranked_multipliers, start=1):
    percentage_value = highest_positive_percentage[multiplier]
    print(f"Rank {rank}: Multiplier - {multiplier}, Highest Positive Percentage Value - {percentage_value}")

In [None]:
# total_profits_two is filled by a sweep over range(10, 26): each list has 16 entries
# and position k holds the profit at percentage 10 + k. The 16% profit therefore
# sits at position 6; position 16 does not exist.
PERCENTAGE_START = 10
TARGET_PERCENTAGE = 16
index_at_16 = TARGET_PERCENTAGE - PERCENTAGE_START

# Multipliers whose profit at 16% is strictly positive.
positive_profit_at_16 = [
    multiplier
    for multiplier, profits in total_profits_two.items()
    if profits[index_at_16] > 0
]

# Display the multipliers with positive profit at 16%
print("Multipliers with Positive Profit at 16%:")
for multiplier in positive_profit_at_16:
    print(f"Multiplier {multiplier} has positive profit at 16%.")

# Rank by the same total_profits_two values that were used for the filter above,
# highest profit first.
ranked_multipliers_at_16 = sorted(
    positive_profit_at_16,
    key=lambda x: total_profits_two[x][index_at_16],
    reverse=True,
)

# Display the ranked multipliers
print("\nRanked Multipliers based on Profit at 16%:")
for rank, multiplier in enumerate(ranked_multipliers_at_16, start=1):
    profit_at_16 = total_profits_two[multiplier][index_at_16]
    print(f"Rank {rank}: Multiplier - {multiplier}, Profit at 16% - {profit_at_16}")


# Total Profit

In [None]:
import csv

# Alternative input kept for reference:
# input_file = 'v2_success.log'
input_file = 'round3_std.log'
output_file = 'round3_std.csv'

# Column names written as the first row of the CSV.
headers = ['day', 'timestamp', 'product', 'bid_price_1', 'bid_volume_1', 'bid_price_2', 'bid_volume_2',
           'bid_price_3', 'bid_volume_3', 'ask_price_1', 'ask_volume_1', 'ask_price_2', 'ask_volume_2',
           'ask_price_3', 'ask_volume_3', 'mid_price', 'profit_and_loss']

# True while the reader is between "Activities log:" and the next section marker.
inside_activities_log = False
# Number of non-empty lines seen so far inside the activities section.
count = 0

with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)

    for line in infile:
        stripped = line.strip()

        if stripped == "Activities log:":
            # Section start marker; the marker line itself is not written.
            inside_activities_log = True
        elif stripped in ("Trade History:", "Sandbox logs:"):
            # Either of these markers ends the activities section.
            inside_activities_log = False
        elif stripped and inside_activities_log:
            # The first line of the section is skipped, because `headers` has already
            # been written; every later line is split on ';' and written as a row.
            if count > 0:
                writer.writerow(stripped.split(';'))
            count += 1

print("CSV file created successfully.")

In [None]:
import pandas as pd

# Load the CSV produced from the log.
df = pd.read_csv('round3_std.csv')

# Product names in order of first appearance.
unique_products = df['product'].unique()

# One independent, 0-indexed frame per product, keyed by product name.
product_dfs = {
    product: df[df['product'] == product].copy().reset_index(drop=True)
    for product in unique_products
}

# Convenience handles for the products examined below.
amethysts_df = product_dfs['AMETHYSTS']
starfruit_df = product_dfs['STARFRUIT']
orchids_df = product_dfs['ORCHIDS']
baskets_df = product_dfs['GIFT_BASKET']

In [None]:
# Sum the profit_and_loss column of each product frame.
# NOTE(review): if profit_and_loss is a running total per timestamp rather than a
# per-row increment, .sum() overstates the result - confirm against the log format.
total_profit_amethysts, total_profit_starfruit, total_profit_orchids, total_profit_baskets = (
    product_frame['profit_and_loss'].sum()
    for product_frame in (amethysts_df, starfruit_df, orchids_df, baskets_df)
)

# Combined figure; only the four products above are included.
total_profit = total_profit_amethysts + total_profit_starfruit + total_profit_orchids + total_profit_baskets

per_product_totals = (
    ("Amethysts", total_profit_amethysts),
    ("Starfruit", total_profit_starfruit),
    ("Orchids", total_profit_orchids),
    ("Baskets", total_profit_baskets),
)
for label, value in per_product_totals:
    print(f"Total profit for {label}:", value)
print("Total profit for all commodities:", total_profit)