# Data Analysis

# CSV Creation

In [None]:
import csv

# Convert the "Activities log" section of a log file into a CSV file.
# input_file = 'v2_success.log'
input_file = 'round1.log'
output_file = 'round1.csv'

# Column names written as the first row of the CSV file
headers = ['day', 'timestamp', 'product', 'bid_price_1', 'bid_volume_1', 'bid_price_2', 'bid_volume_2',
           'bid_price_3', 'bid_volume_3', 'ask_price_1', 'ask_volume_1', 'ask_price_2', 'ask_volume_2',
           'ask_price_3', 'ask_volume_3', 'mid_price', 'profit_and_loss']

# Section markers, compared against each whitespace-stripped log line
section_start = "Activities log:"
section_ends = ("Trade History:", "Sandbox logs:")

# True while the reader is inside the "Activities log" section
inside_activities_log = False
# True from a section start until that section's first non-empty line has been
# skipped (presumably the log's own ';'-separated header row -- confirm).
# Re-armed on every section start so a repeated section cannot leak its
# header line into the data rows.
header_pending = False
# Number of non-empty lines seen inside the section (skipped line included)
count = 0

with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)

    for line in infile:
        stripped = line.strip()

        if stripped == section_start:
            inside_activities_log = True
            header_pending = True
            continue

        if stripped in section_ends:
            inside_activities_log = False
            continue

        # Ignore blank lines and everything outside the section
        if not stripped or not inside_activities_log:
            continue

        count += 1
        if header_pending:
            header_pending = False
            continue

        # Data rows are ';'-delimited in the log; re-emit them as CSV fields
        writer.writerow(stripped.split(';'))

print("CSV file created successfully.")

In [None]:
import pandas as pd

# Load the CSV produced from the log
df = pd.read_csv('round1.csv')

# Distinct values of the 'product' column
unique_products = df['product'].unique()

# One independent, zero-indexed DataFrame per product, keyed by product name
product_dfs = {
    name: df[df['product'] == name].copy().reset_index(drop=True)
    for name in unique_products
}

# Convenience handles for the two products analysed below
amethysts_df = product_dfs['AMETHYSTS']
starfruit_df = product_dfs['STARFRUIT']

# Statistics for Amethysts

In [None]:
# Summary statistics of the Amethysts mid price
amethysts_stats = amethysts_df['mid_price'].describe()
print("Statistics for Amethysts:", amethysts_stats, sep="\n")

In [None]:
# Per-row spread between the largest and smallest of the three bid prices
bid_levels = amethysts_df[['bid_price_1', 'bid_price_2', 'bid_price_3']]
amethysts_df['bid_range'] = bid_levels.max(axis=1) - bid_levels.min(axis=1)

# Mean of that spread over all rows
average_range = amethysts_df['bid_range'].mean()

print("Average range between bid prices:", average_range)

In [None]:
# Count how often the Amethysts mid price sits above / below its own mean
amethysts_mid = amethysts_df['mid_price']
avg_mid_price = amethysts_mid.mean()

# Strict comparisons: rows equal to the mean land in neither count
greater_than_avg_count = int((amethysts_mid > avg_mid_price).sum())
lower_than_avg_count = int((amethysts_mid < avg_mid_price).sum())

print("Number of times mid price was greater than average:", greater_than_avg_count)
print("Number of times mid price was lower than average:", lower_than_avg_count)

# Statistics for Starfruit

In [None]:
# Summary statistics of the Starfruit mid price
starfruit_stats = starfruit_df['mid_price'].describe()
print("Statistics for Starfruit:", starfruit_stats, sep="\n")

In [None]:
# Count how often the Starfruit mid price sits above / below its own mean
starfruit_mid = starfruit_df['mid_price']
avg_mid_price = starfruit_mid.mean()

# Strict comparisons: rows equal to the mean land in neither count
greater_than_avg_count = int((starfruit_mid > avg_mid_price).sum())
lower_than_avg_count = int((starfruit_mid < avg_mid_price).sum())

print("Number of times mid price was greater than average:", greater_than_avg_count)
print("Number of times mid price was lower than average:", lower_than_avg_count)

In [None]:
import pandas as pd
from collections import Counter

# Measure how long the Starfruit mid price keeps moving in one direction.

# Row-to-row change in mid price (NaN on the first row, which has no predecessor)
starfruit_df['price_change'] = starfruit_df['mid_price'].diff()

# State of the stretch currently being tracked
current_trend = None
start_timestamp = None
end_timestamp = None
# Completed stretches: elapsed timestamp units and the direction that ended
trend_durations = []
trend_directions = []

for tick_time, delta in zip(starfruit_df['timestamp'], starfruit_df['price_change']):
    if delta > 0:
        direction = 'increase'
    elif delta < 0:
        direction = 'decrease'
    else:
        # Unchanged price (or NaN) neither continues nor breaks a stretch
        continue

    if direction == current_trend:
        continue

    # Direction flipped: close the previous stretch, if there was one
    if current_trend is not None:
        end_timestamp = tick_time
        trend_durations.append(end_timestamp - start_timestamp)
        trend_directions.append(current_trend)

    current_trend = direction
    start_timestamp = tick_time

# NOTE: the stretch still open when the data ends is not recorded here.
trend_durations_counter = Counter(trend_durations)
# Sum of the per-duration counts, i.e. the number of recorded stretches
total_timestamps = sum(trend_durations_counter.values())
print(len(trend_directions), len(trend_durations))

In [None]:
import pandas as pd

# Track excursions of the Starfruit mid price outside the mean +/- one
# standard deviation band.

mean_price = starfruit_df['mid_price'].mean()
std_price = starfruit_df['mid_price'].std()
upper_band = mean_price + std_price
lower_band = mean_price - std_price

# State of the stretch currently being tracked
current_trend = None
start_timestamp = None
end_timestamp = None
trend_durations = []
trend_directions = []
# Number of rows strictly above / below the band
count_high = 0
count_low = 0

for tick_time, price in zip(starfruit_df['timestamp'], starfruit_df['mid_price']):
    if price > upper_band:
        zone = 'above_mean'
        count_high += 1
    elif price < lower_band:
        zone = 'below_mean'
        count_low += 1
    else:
        # Inside the band: the current stretch is left open
        continue

    if zone == current_trend:
        continue

    # Switched sides: close the previous stretch, if there was one
    if current_trend is not None:
        end_timestamp = tick_time
        trend_durations.append(end_timestamp - start_timestamp)
        trend_directions.append(current_trend)

    current_trend = zone
    start_timestamp = tick_time

# Close the stretch that is still open at the last row
if current_trend is not None:
    end_timestamp = starfruit_df.iloc[-1]['timestamp']
    trend_durations.append(end_timestamp - start_timestamp)
    trend_directions.append(current_trend)

# Rows outside the band, as a count and as a share of all rows
total_time = count_high + count_low
# Durations divided by 100 (presumably the timestamp step size -- confirm)
timestamps = [span // 100 for span in trend_durations]
time = sum(timestamps)
total_timestamps = len(starfruit_df)
ratio = total_time / total_timestamps

print("Trend Durations:", trend_durations)
print("Trend Directions:", trend_directions)
print("Total Timestamps:", timestamps, total_timestamps, time, ratio)

# Linear Regression for Starfruit

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear model of the mid price on its previous `window_size` values for
# each candidate window, and report the window with the lowest test RMSE.

# Candidate window sizes: 1 through 9
window_sizes = range(1, 10)

# (window_size, train_rmse, test_rmse) per candidate
results = []

for window_size in window_sizes:
    # Lagged mid prices as features, current mid price as target
    df = pd.DataFrame({
        f'last_price_{lag}': starfruit_df['mid_price'].shift(lag)
        for lag in range(1, window_size + 1)
    })
    df['mid_price'] = starfruit_df['mid_price']

    # Shifting leaves NaN in the leading rows; discard them
    df.dropna(inplace=True)

    X = df.drop(columns='mid_price')
    y = df['mid_price']

    # NOTE(review): called without shuffle=False, so (per the documented
    # default) rows are shuffled before splitting -- confirm this is intended
    # for time-ordered prices.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

    # Show the fitted model as an explicit equation
    intercept = model.intercept_
    coefficients = model.coef_
    print("Intercept:", intercept)
    print("Coefficient:", coefficients)
    lag_terms = "".join(
        f" + ({weight} * last_price_{lag})"
        for lag, weight in enumerate(coefficients, start=1)
    )
    equation = f"y = {intercept}" + lag_terms
    print("Equation:", equation)

    results.append((window_size, train_rmse, test_rmse))

# Best candidate so far; the first strictly lower test RMSE wins ties
optimal_window_size = None
min_test_rmse = float('inf')

for window_size, train_rmse, test_rmse in results:
    print("Window Size:", window_size)
    print("Train RMSE:", train_rmse)
    print("Test RMSE:", test_rmse)
    print()

    if test_rmse < min_test_rmse:
        optimal_window_size, min_test_rmse = window_size, test_rmse

print("Optimal Window Size:", optimal_window_size)
print("Corresponding Test RMSE:", min_test_rmse)

In [None]:
# Fit the lagged-price linear model with a fixed window of 4 and report the
# mean absolute error on the held-out split.
window_size = 4
df = pd.DataFrame({
    f'last_price_{lag}': starfruit_df['mid_price'].shift(lag)
    for lag in range(1, window_size + 1)
})
df['mid_price'] = starfruit_df['mid_price']

# Shifting leaves NaN in the leading rows; discard them
df.dropna(inplace=True)

# Features are the lag columns, target is the current mid price
X = df.drop(columns='mid_price')
y = df['mid_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

test_preds = model.predict(X_test)

# Mean absolute gap between predicted and actual test prices
avg_abs_distance = np.mean(np.abs(test_preds - y_test))
print("Average Absolute Distance:", avg_abs_distance)

print(starfruit_df)
print(X)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Refit the model on the existing training split and tabulate, per timestamp,
# how far each prediction is from the actual mid price.
# Relies on X, y, X_train and y_train from the preceding cell.

model = LinearRegression()
model.fit(X_train, y_train)

# Predict every row of the feature matrix, not just the test split
all_preds = model.predict(X)

# Signed error: predicted minus actual
differences = all_preds - y.values

# X only holds the rows that survived the lag-window dropna(), so it is
# shorter than starfruit_df. Pairing the full timestamp column with
# `differences` raises a length-mismatch ValueError; pick the timestamps of
# exactly the rows in X instead.
differences_df = pd.DataFrame({
    'Timestamp': starfruit_df.loc[X.index, 'timestamp'],
    'Difference': differences,
})

print(differences_df)


# Graphs

In [None]:
import matplotlib.pyplot as plt

# One mid-price-over-time figure per product, drawn in the same layout
for frame, product_label, line_color in (
    (amethysts_df, 'Amethysts', 'blue'),
    (starfruit_df, 'Starfruit', 'green'),
):
    plt.figure(figsize=(10, 6))
    plt.plot(frame['timestamp'], frame['mid_price'], label=product_label, color=line_color)
    plt.xlabel('Timestamp')
    plt.ylabel('Mid Price')
    plt.title(f'Mid Price of {product_label} over Time')
    plt.legend()
    plt.grid(True)
    plt.show()


# Negatives

In [None]:
# Rows of starfruit_df whose profit_and_loss is below zero, and their timestamps
loss_mask = starfruit_df['profit_and_loss'] < 0
negative_pl_df = starfruit_df[loss_mask]
negative_pl_timestamps = negative_pl_df['timestamp']

# Dump every profit_and_loss value, one per line, before the filtered result
for pnl_value in starfruit_df['profit_and_loss']:
    print(pnl_value)
print("Timestamps with negative profit and loss:", negative_pl_timestamps, sep="\n")

# Total Profits

In [None]:
# Sum the profit_and_loss column per product, then across both products.
# NOTE(review): this adds up every row's profit_and_loss; if that column is
# already a running total, the sum is not the final profit -- confirm.
total_profit_amethysts, total_profit_starfruit = (
    frame['profit_and_loss'].sum() for frame in (amethysts_df, starfruit_df)
)

total_profit = total_profit_amethysts + total_profit_starfruit

print("Total profit for Amethysts:", total_profit_amethysts)
print("Total profit for Starfruit:", total_profit_starfruit)
print("Total profit for both commodities:", total_profit)

# Manual Trading

In [None]:
import matplotlib.pyplot as plt

def calculate_profit(low_bid, high_bid, distribution, resale_price=1000):
    """Return the profit of a two-bid strategy.

    The quantity ``distribution[low_bid]`` earns ``resale_price - low_bid``
    per unit; the additional quantity
    ``distribution[high_bid] - distribution[low_bid]`` earns
    ``resale_price - high_bid`` per unit.

    Args:
        low_bid: Lower bid price; must be a key of ``distribution``.
        high_bid: Higher bid price; must be a key of ``distribution``.
        distribution: Mapping from bid price to quantity at that bid.
        resale_price: Price each unit is valued at. Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The combined profit of both bids.

    Raises:
        KeyError: If either bid is missing from ``distribution``.
    """
    low_volume = distribution[low_bid]
    # Only the quantity not already captured by the low bid pays the high price
    extra_volume = distribution[high_bid] - low_volume
    return low_volume * (resale_price - low_bid) + extra_volume * (resale_price - high_bid)

def create_probability_distribution(lower_bound, upper_bound):
    """Map each integer bid in [lower_bound, upper_bound] to a linear count.

    The value rises by one per bid step, starting at 1 for ``lower_bound``.
    The values are raw counts and are not normalised to sum to 1. An empty
    dict is returned when ``upper_bound < lower_bound``.
    """
    return {
        bid: bid - lower_bound + 1
        for bid in range(lower_bound, upper_bound + 1)
    }


def find_optimal_bids(distribution, lower_bound=900, upper_bound=1000, *, verbose=True):
    """Exhaustively search for the (low_bid, high_bid) pair with the highest profit.

    Every pair with ``lower_bound <= low_bid < high_bid <= upper_bound`` is
    scored with ``calculate_profit``.

    Args:
        distribution: Mapping passed through to ``calculate_profit``; it must
            contain every integer bid in the searched range.
        lower_bound: Smallest bid considered. Defaults to 900, the value that
            was previously hard-coded.
        upper_bound: Largest bid considered. Defaults to 1000, the value that
            was previously hard-coded.
        verbose: When True (the default, matching the earlier behaviour),
            print every evaluated ``low_bid high_bid profit`` triple.

    Returns:
        ``(optimal_low_bid, optimal_high_bid, max_profit)``. If no pair yields
        a profit above 0, both bids are ``lower_bound`` and the profit is 0.
    """
    max_profit = 0
    optimal_low_bid = lower_bound
    optimal_high_bid = lower_bound

    for low_bid in range(lower_bound, upper_bound + 1):
        for high_bid in range(low_bid + 1, upper_bound + 1):
            profit = calculate_profit(low_bid, high_bid, distribution)
            if verbose:
                print(low_bid, high_bid, profit)
            # Strict comparison keeps the first pair found on ties
            if profit > max_profit:
                max_profit = profit
                optimal_low_bid = low_bid
                optimal_high_bid = high_bid

    return optimal_low_bid, optimal_high_bid, max_profit

def plot_results(optimal_low_bid, optimal_high_bid, max_profit):
    """Draw the two optimal bids as dashed vertical lines topped by a marker.

    Each line runs from profit 0 up to ``max_profit`` at its bid price. The
    low bid is drawn in blue, the high bid in green.
    """
    bid_styles = (
        (optimal_low_bid, 'blue', 'Optimal Low Bid'),
        (optimal_high_bid, 'green', 'Optimal High Bid'),
    )

    plt.figure(figsize=(10, 6))
    # Lines first, then markers, so the drawing and legend order is unchanged
    for bid, shade, caption in bid_styles:
        plt.plot([bid, bid], [0, max_profit], color=shade, linestyle='--', label=caption)
    for bid, shade, caption in bid_styles:
        plt.scatter(bid, max_profit, color=shade, marker='o', label=caption)
    plt.xlabel('Bid Price')
    plt.ylabel('Profit')
    plt.title('Optimal Bids and Profit')
    plt.legend()
    plt.grid(True)
    plt.show()

# Example usage: search the 900-1000 bid range with the linear count distribution
lower_bound, upper_bound = 900, 1000
distribution = create_probability_distribution(lower_bound, upper_bound)
optimal_low_bid, optimal_high_bid, max_profit = find_optimal_bids(distribution)

print("Distribution: ", distribution)
for caption, value in (
    ("Optimal Low Bid:", optimal_low_bid),
    ("Optimal High Bid:", optimal_high_bid),
    ("Max Profit:", max_profit),
):
    print(caption, value)

plot_results(optimal_low_bid, optimal_high_bid, max_profit)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def linear_pdf(x, lower_bound=900, upper_bound=1000):
    """Evaluate a linearly increasing density at ``x``, normalised over ``x``.

    The unnormalised density is 0 at ``lower_bound`` and rises linearly to 1
    at ``upper_bound``. It is then divided by its trapezoidal integral over
    the supplied ``x`` grid, so the result integrates to 1 over that grid
    (the plain sum of the values is generally not 1).

    Args:
        x: Reserve prices at which to evaluate the density; any array-like
            of numbers, in the order used for integration.
        lower_bound: Price at which the density is zero. Defaults to 900.
        upper_bound: Price at which the unnormalised density is one.
            Defaults to 1000.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``x``.
    """
    # Accept lists/tuples as well as arrays, and always work in floats
    x = np.asarray(x, dtype=float)

    slope = 1.0 / (upper_bound - lower_bound)
    intercept = -slope * lower_bound
    pdf = slope * x + intercept

    # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use
    # whichever this NumPy version provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    normalization_constant = integrate(pdf, x)

    return pdf / normalization_constant

# Integer reserve prices 900..1000 inclusive
x_values = np.arange(900, 1001)

# Density evaluated on that grid
pdf_values = linear_pdf(x_values)
print(x_values)
# The second value is the plain sum of the density values, shown for inspection
print(pdf_values, sum(pdf_values))

# Plot the density against the reserve price
plt.plot(x_values, pdf_values)
plt.title('Linear Probability Density Function')
plt.xlabel('Reserve Price')
plt.ylabel('Probability Density')
plt.grid(True)
plt.show()

In [None]:
# Repeat the optimal-bid search using the linear density values as the distribution.
# NOTE(review): calculate_profit subtracts distribution[low_bid] from
# distribution[high_bid], which treats the values as cumulative, whereas
# linear_pdf_dict holds density values -- confirm this is intended.
linear_pdf_dict = dict(zip(x_values, pdf_values))
print(linear_pdf_dict)

lower_bound, upper_bound = 900, 1000
# Built for reference only; the search below uses linear_pdf_dict
distribution = create_probability_distribution(lower_bound, upper_bound)

optimal_low_bid, optimal_high_bid, max_profit = find_optimal_bids(linear_pdf_dict)
for caption, value in (
    ("Optimal Low Bid:", optimal_low_bid),
    ("Optimal High Bid:", optimal_high_bid),
    ("Max Profit:", max_profit),
):
    print(caption, value)

plot_results(optimal_low_bid, optimal_high_bid, max_profit)