In [3]:
import pickle
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

Below is a dummy ML model built from linear regression:

In [2]:
# Seeded generator so the toy data (and the model fitted on it below) is the same on every run
rng = np.random.default_rng(42)
X = rng.random((3, 3))  # 3 samples x 3 features
y = rng.random((3, 2))  # 3 samples x 2 targets
print(X, y)

[[0.30092572 0.51830574 0.98478741]
 [0.93400601 0.31933276 0.08665472]
 [0.58195715 0.43230515 0.48727585]] [[0.21062504 0.58422818]
 [0.24512291 0.16850848]
 [0.20312982 0.78691849]]


In [3]:
# Fit a least-squares model on the toy data, then predict a single hand-written sample
model = LinearRegression().fit(X, y)
sample = np.array([[1, 2, 3]])  # one row, three features
model.predict(sample)[0]

array([ 0.7243358 , -8.09688886])

In [4]:
# Pickle the fitted model to disk. The context manager closes (and therefore flushes)
# the file, so the bytes are fully written before the next cell reads them back.
with open('trained_model', 'wb') as picklefile:
    pickle.dump(model, picklefile)

In [5]:
# Check that the object was pickled correctly and still predicts once unpickled
del model
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself
with open('trained_model', 'rb') as picklefile:
    new_model = pickle.load(picklefile)
new_model.predict(np.array([1,2,3]).reshape(1,-1))

array([[ 0.7243358 , -8.09688886]])

# Price Pioneers Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the historical pricing decisions (the CSV is read from the working directory)
train_pricing_decisions = pd.read_csv('train_prices_decisions_2024.csv')
# Split the data into training and validation sets (70-30 split)
train_data, val_data = train_test_split(train_pricing_decisions, test_size=0.3, random_state=42)
# Import the actual testing dataset
test_user_info = pd.read_csv('test_user_info_2024.csv')

# Build the candidate price grid from the min and max prices seen in train_pricing_decisions
min_price = train_pricing_decisions.price_item.min()
max_price = train_pricing_decisions.price_item.max()
print('min: ', np.round(min_price, 2), 
      'max: ', np.round(max_price, 2))

# The grid is split at 10% above the minimum and 10% below the maximum
price_span = max_price - min_price
lower_cut = min_price + price_span * 0.1
upper_cut = max_price - price_span * 0.1

# Generate more points near the center
n_center = 120  # Number of central points
center_points = np.linspace(lower_cut, upper_cut, n_center)

# Generate fewer points near the edges
n_edges = 20  # Number of edge points
edge_points = np.concatenate([
    np.linspace(min_price, lower_cut, n_edges // 2),
    np.linspace(upper_cut, max_price, n_edges // 2)
])

# Combine and sort the points. np.unique also drops the two cut points, which appear in
# both the center and the edge grids and would otherwise be evaluated (and labelled) twice.
prices_to_predict = np.unique(np.concatenate([center_points, edge_points]))

print(prices_to_predict)

print(prices_to_predict.min(), prices_to_predict.max())



In [None]:
# Cluster the users using K-mean to segment the customer base
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster the users using K-means to segment the customer base
def cluster(data, n_clusters):
    """
    Segment users with K-Means on their three standardized covariates.

    The scaler and the K-Means estimator are fitted together as one pipeline, and
    that pipeline is what is returned. Callers therefore pass RAW covariates to
    ``model.predict(...)`` and the standardization learned here is applied
    automatically. (Returning the bare K-Means estimator, as before, meant that
    predictions on raw covariates were compared against centroids that live in
    standardized space.)

    Parameters:
        data (pd.DataFrame): Must contain the columns 'Covariate1', 'Covariate2'
            and 'Covariate3'. A 'cluster' column is written onto this frame in place.
        n_clusters (int): Number of clusters to form (required, no default).

    Returns:
        tuple: ``(data, model)`` where ``data`` is the input frame with the added
        'cluster' column and ``model`` is the fitted StandardScaler + KMeans
        pipeline (the KMeans step itself is ``model[-1]``).
    """
    # Local import: the notebook's import cells do not bring in sklearn.pipeline
    from sklearn.pipeline import make_pipeline

    # Fit on a plain array so that later predict() calls with lists/arrays of raw
    # covariates do not trigger feature-name mismatch warnings
    covariates = data[['Covariate1', 'Covariate2', 'Covariate3']].to_numpy()

    # Standardize so every covariate contributes equally, then cluster
    segmenter = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=n_clusters, random_state=42),
    )
    cluster_labels = segmenter.fit_predict(covariates)

    # Add the cluster labels back to the original DataFrame
    data['cluster'] = cluster_labels

    return data, segmenter


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Elbow method on the training split: fit K-Means for each candidate k on the
# standardized covariates and compare the resulting inertia
covariate_cols = ['Covariate1', 'Covariate2', 'Covariate3']
scaler = StandardScaler()
data_scaled = scaler.fit_transform(train_data[covariate_cols])

cluster_range = range(1, 11)  # Test k from 1 to 10
inertias = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_scaled)
    inertias.append(kmeans.inertia_)

# Inertia against k; the "elbow" of this curve is the candidate cluster count
plt.figure(figsize=(8, 5))
plt.plot(cluster_range, inertias, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Customer segmentation with K-Means, K=4 ***** FITTED ON THE ENTIRE DATASET, TRAIN + VALIDATION *****
segmented_train, kmeans_model = cluster(data=train_pricing_decisions, n_clusters=4)
segmented_train.head()

In [None]:
# Creating logistic regression function to predict demand based on training_pricing_decisions and prices_to_predict
from sklearn.linear_model import LogisticRegression
def fit_logistic_regression_demand_with_covariates(df):
    """Fit a logistic regression of the purchase outcome on price and covariates.

    Parameters:
        df (pd.DataFrame): Needs the feature columns 'price_item', 'Covariate1',
            'Covariate2', 'Covariate3' and the target column 'item_bought'
            (cast to int before fitting).

    Returns:
        The fitted LogisticRegression estimator.
    """
    feature_columns = ['price_item', 'Covariate1', 'Covariate2', 'Covariate3']
    purchased = df['item_bought'].astype(int)
    return LogisticRegression(fit_intercept=True).fit(df[feature_columns], purchased)

# model = fit_logistic_regression_demand_with_covariates(train_pricing_decisions)

def get_prediction_logistic(fitted_model, price, covariates):
    """Score one (price, covariates) combination with a fitted classifier.

    Parameters:
        fitted_model: Object exposing ``predict_proba`` that accepts a DataFrame
            with the columns 'price_item', 'Covariate1', 'Covariate2', 'Covariate3'.
        price: Price to evaluate.
        covariates: Indexable holding the three covariate values at positions 0-2.

    Returns:
        The value in the second column of ``predict_proba`` for that single row.
    """
    feature_names = ('price_item', 'Covariate1', 'Covariate2', 'Covariate3')
    feature_values = (price, covariates[0], covariates[1], covariates[2])
    # One-row frame, columns in the same order the models are fitted with
    input_data = pd.DataFrame(
        {name: [value] for name, value in zip(feature_names, feature_values)}
    )
    purchase_probability = fitted_model.predict_proba(input_data)[:, 1]
    return purchase_probability[0]

In [None]:
# Fit one demand model per customer segment; models[i] is the model for cluster label i
models = []

# The loop variable is `cluster_id`, not `cluster`: naming it `cluster` would overwrite
# the cluster() function and break any later re-run of the segmentation cell.
n_segments = int(segmented_train['cluster'].max()) + 1
for cluster_id in range(n_segments):
    # Filter the DataFrame for the current cluster
    cluster_df = segmented_train[segmented_train['cluster'] == cluster_id]

    model = fit_logistic_regression_demand_with_covariates(cluster_df)
    models.append(model)

#### Validating Cluster Assignments with the validation set

In [None]:
from sklearn.metrics import f1_score
# Takes a dataframe with covariates, offered price and purchase outcome (df), the list of
# per-cluster logistic regression models (cluster_log_models) and a label for the printout (typ)
#  --> outputs the weighted F1 score
def get_f1(df, cluster_log_models, typ, kmeans_model=kmeans_model):
    """Weighted F1 score of the per-cluster purchase classifiers on ``df``.

    Every row is assigned to a cluster with ``kmeans_model.predict`` on its three
    covariates, then classified by that cluster's model. Rows are handled in one
    batch per cluster, and the predictions are stored as scalars in a flat array.

    Parameters:
        df (pd.DataFrame): Needs 'price_item', 'Covariate1', 'Covariate2',
            'Covariate3' and 'item_bought'.
        cluster_log_models (list): Fitted classifiers indexed by cluster label.
        typ (str): Label used in the printed message (e.g. "train", "validation").
        kmeans_model: Fitted object whose ``predict`` maps covariate rows to
            cluster labels. The default is bound when this cell is executed.

    Returns:
        float: The weighted F1 score (also printed).
    """
    covariate_cols = ['Covariate1', 'Covariate2', 'Covariate3']
    feature_cols = ['price_item'] + covariate_cols

    # One clustering call for the whole frame
    assigned_clusters = np.asarray(kmeans_model.predict(df[covariate_cols].to_numpy()))

    # Flat array of predicted labels, filled in cluster by cluster (row order preserved)
    buy_predictions = np.empty(len(df), dtype=int)
    for cluster_id in np.unique(assigned_clusters):
        in_cluster = assigned_clusters == cluster_id
        buy_predictions[in_cluster] = cluster_log_models[cluster_id].predict(
            df.loc[in_cluster, feature_cols]
        )

    # The models are fitted on item_bought cast to int, so score against the same encoding.
    # 'weighted' averages the per-class F1 by class support.
    f1 = f1_score(df["item_bought"].astype(int), buy_predictions, average='weighted')
    print(f"F1 Score {typ}: {f1}")

    return f1

In [None]:
# F1 of the per-cluster models: held-out validation split first, then the training split
f1_val = get_f1(df=val_data, cluster_log_models=models, typ="validation", kmeans_model=kmeans_model)
f1_train = get_f1(df=train_data, cluster_log_models=models, typ="train", kmeans_model=kmeans_model)


#### Get Demand Predictions, using our clustering

In [None]:
# the function takes in dataframe of covariates(df), list of prices(prices_to_predict), and list of segmented logistic regression models(cluster_log_models)
#  --> outputs demand predictions (one list of purchase probabilities per user)
def get_demand_predictions_clusters(df, prices_to_predict, cluster_log_models, kmeans_model=kmeans_model):
    """Purchase probability at every candidate price, for every user in ``df``.

    Users are assigned to clusters in a single ``kmeans_model.predict`` call, and
    each user's whole price grid is scored with one ``predict_proba`` call on the
    model of that user's cluster (instead of one call per user-price pair).

    Parameters:
        df (pd.DataFrame): Needs 'Covariate1', 'Covariate2', 'Covariate3'.
        prices_to_predict: Sequence of candidate prices.
        cluster_log_models (list): Fitted classifiers indexed by cluster label.
        kmeans_model: Fitted object whose ``predict`` maps covariate rows to
            cluster labels. The default is bound when this cell is executed.

    Returns:
        list[list[float]]: One inner list per row of ``df`` (same order), holding
        the second ``predict_proba`` column for each price in ``prices_to_predict``.
    """
    covariate_matrix = df[['Covariate1', 'Covariate2', 'Covariate3']].to_numpy()
    price_grid = np.asarray(prices_to_predict, dtype=float)

    # Nothing to score: keep the "one (possibly empty) list per user" shape
    if len(covariate_matrix) == 0:
        return []
    if price_grid.size == 0:
        return [[] for _ in range(len(covariate_matrix))]

    assigned_clusters = kmeans_model.predict(covariate_matrix)

    demand_predictions = []
    for i, (user_covariates, cluster_id) in enumerate(zip(covariate_matrix, assigned_clusters), start=1):
        # Same user repeated on every row, only the price varies
        user_grid = pd.DataFrame({
            'price_item': price_grid,
            'Covariate1': user_covariates[0],
            'Covariate2': user_covariates[1],
            'Covariate3': user_covariates[2],
        })
        buy_probabilities = cluster_log_models[cluster_id].predict_proba(user_grid)[:, 1]
        demand_predictions.append(buy_probabilities.tolist())

        # Progress marker every 5000 users
        if i % 5000 == 0:
            print(i)

    return demand_predictions

In [None]:
def get_demand_prediction_clusters(df, prices_to_predict, cluster_log_models, kmeans_model=kmeans_model):    
    """Purchase probability at every candidate price for ONE user.

    Parameters:
        df: A single user record exposing ``Covariate1``, ``Covariate2`` and
            ``Covariate3`` as attributes (e.g. one row of the pricing DataFrame).
        prices_to_predict: Sequence of candidate prices.
        cluster_log_models (list): Fitted classifiers indexed by cluster label.
        kmeans_model: Fitted object whose ``predict`` maps covariate rows to
            cluster labels. The default is bound when this cell is executed.

    Returns:
        list[float]: The second ``predict_proba`` column for each price, in the
        order of ``prices_to_predict``.
    """
    user_covariates = [df.Covariate1, df.Covariate2, df.Covariate3]
    assigned_cluster = kmeans_model.predict([user_covariates])[0]

    price_grid = np.asarray(prices_to_predict, dtype=float)
    if price_grid.size == 0:
        return []

    # Score the whole price grid in one predict_proba call: the user is repeated
    # on every row and only the price varies
    user_grid = pd.DataFrame({
        'price_item': price_grid,
        'Covariate1': user_covariates[0],
        'Covariate2': user_covariates[1],
        'Covariate3': user_covariates[2],
    })
    return cluster_log_models[assigned_cluster].predict_proba(user_grid)[:, 1].tolist()
    

In [None]:
# Demand curve for ONE training user (positional row 1000): purchase probability at every candidate price
train_demand_predictions = get_demand_prediction_clusters(
    train_pricing_decisions.iloc[1000, :],
    prices_to_predict,
    models,
    kmeans_model=kmeans_model,
)

In [None]:
import matplotlib.pyplot as plt
# Purchase probability of the selected user across the candidate price grid
plt.plot(prices_to_predict, train_demand_predictions);

# Axis labels first, title last
plt.ylabel("Probability of Purchase")
plt.xlabel("Price")
plt.title("Price vs Purchase Probability")

In [None]:
def dynamic_program(prices_to_predict, demand_prediction, T, K):
    """Pick the price whose predicted purchase probability is closest to K/T.

    The target probability K/T is capped at 0.9.

    Parameters:
        prices_to_predict: Indexable of candidate prices.
        demand_prediction: Purchase probability for each candidate price.
        T: Divisor of the target ratio (must be non-zero).
        K: Numerator of the target ratio.

    Returns:
        The entry of ``prices_to_predict`` at the position where the probability
        is nearest to the (capped) ratio; the first such position on ties.
    """
    target = K / T
    target = 0.9 if target >= 0.9 else target
    gap = np.abs(np.array(demand_prediction) - target)
    return prices_to_predict[np.argmin(gap)]

In [None]:
def get_single_step_revenue_maximizing_price_and_revenue_k(Vtplus1k, Vtplus1kminus1, price_options, demand_predictions):
    """One Bellman step: best price and value for a single (time, inventory) state.

    For every candidate price p with purchase probability d the expected value is
    ``(p + Vtplus1kminus1) * d + (1 - d) * Vtplus1k``.

    Parameters:
        Vtplus1k: Next-period value when the current inventory is kept.
        Vtplus1kminus1: Next-period value with one unit less.
        price_options: Indexable of candidate prices.
        demand_predictions: Purchase probability for each candidate price.

    Returns:
        tuple: (price with the highest expected value, that expected value); the
        first candidate wins ties.
    """
    buy_prob = np.array(demand_predictions)
    # Value of a sale: the price collected plus the continuation value with one unit less
    value_if_sold = np.array(price_options) + Vtplus1kminus1 * np.ones(len(price_options))
    # Value of no sale, weighted by the probability of not buying
    value_if_unsold = (np.ones(len(demand_predictions)) - demand_predictions) * Vtplus1k
    rev_list = value_if_sold * buy_prob + value_if_unsold
    opt_index = np.argmax(rev_list)
    return price_options[opt_index], rev_list[opt_index]

def get_prices_over_time_and_expected_revenue_k(prices, demand_predictions, T, K):
    """Backward-induction pricing table for T periods and up to K units of stock.

    For period t and inventory k >= 1, every candidate price p with purchase
    probability d is valued at ``(p + V[t+1][k-1]) * d + (1 - d) * V[t+1][k]``;
    the best candidate (first one on ties) is stored. All inventory levels of a
    period are evaluated in one vectorized step.

    Parameters:
        prices: Candidate prices.
        demand_predictions: Purchase probability for each candidate price.
        T (int): Number of periods.
        K (int): Maximum inventory.

    Returns:
        tuple: ``(opt_price_list, V)``.
        ``opt_price_list`` has shape (T, K + 1); entry [t][k] is the chosen price
        in period t with k units left. Column 0 stays 0 (nothing to sell).
        ``V`` has shape (T + 1, K + 1); entry [t][k] is the expected revenue from
        period t onward with k units left. Row T and column 0 stay 0.
    """
    price_grid = np.asarray(prices, dtype=float)
    buy_prob = np.asarray(demand_predictions, dtype=float)

    opt_price_list = np.zeros([T, K + 1])
    V = np.zeros([T + 1, K + 1])
    inventory_rows = np.arange(K)  # row i of the per-period matrix is inventory k = i + 1

    for t in range(T - 1, -1, -1):
        # Continuation values for k = 1..K, as columns so they broadcast over prices
        value_if_sold = V[t + 1, :-1][:, None]    # V[t+1][k-1]
        value_if_unsold = V[t + 1, 1:][:, None]   # V[t+1][k]

        # expected[i, j]: value of charging price j with inventory i + 1
        expected = (price_grid[None, :] + value_if_sold) * buy_prob[None, :] \
            + (1.0 - buy_prob[None, :]) * value_if_unsold

        best = expected.argmax(axis=1)  # first maximum per inventory level
        V[t, 1:] = expected[inventory_rows, best]
        opt_price_list[t, 1:] = price_grid[best]

    return opt_price_list, V

#### Create thresholds based off of segmented training data (1 for each cluster)

# Optimal training prices for all t<20, k<=12
# This step needs one demand curve per training user (n_users x n_prices). The plotting
# cell above leaves only a single user's curve in `train_demand_predictions`, so the full
# matrix is built here whenever it is not already available.
demand_matrix = np.asarray(train_demand_predictions, dtype=float)
if demand_matrix.shape != (len(train_pricing_decisions), len(prices_to_predict)):
    train_demand_predictions = get_demand_predictions_clusters(
        train_pricing_decisions, prices_to_predict, models, kmeans_model=kmeans_model
    )
    demand_matrix = np.asarray(train_demand_predictions, dtype=float)

opt_prices = [
    get_prices_over_time_and_expected_revenue_k(prices_to_predict, user, T=20, K=12)[0]
    for user in demand_matrix
]

# Getting rid of the 0 s: drop the k = 0 column, leaving shape (n_users, T, K)
training_opt_prices = np.array(opt_prices)[:,:,1:]

# Calculating the average to charge the people with less willingness to pay
threshold_avg = pd.DataFrame(np.average(training_opt_prices, axis=0)).T

# The cutoff for people with low willingness to pay: the 1st percentile across all users,
# laid out with one row per inventory level and one column per period
threshold_values = np.percentile(training_opt_prices, 1, axis=0).T
k = threshold_values.shape[0] # number of inventory
t = threshold_values.shape[1] # number of periods

# Mask the entries whose row index is at least 8 smaller than the column index (i <= j - 8)
mask = np.arange(k)[:, None] <= np.arange(t)[None, :] - 8

# Set the selected entries to 0
threshold_values[mask] = 0
threshold_1percentile = pd.DataFrame(threshold_values)

threshold_1percentile

In [None]:
# The per-cluster tables need one demand curve per training user (n_users x n_prices),
# row-aligned with segmented_train. Build the matrix if `train_demand_predictions` does
# not already hold it (the plotting cell leaves a single user's curve there).
demand_matrix = np.asarray(train_demand_predictions, dtype=float)
if demand_matrix.shape != (len(segmented_train), len(prices_to_predict)):
    train_demand_predictions = get_demand_predictions_clusters(
        segmented_train, prices_to_predict, models, kmeans_model=kmeans_model
    )
    demand_matrix = np.asarray(train_demand_predictions, dtype=float)

# Same data as a DataFrame with the candidate prices as headers
train_demand_predictions_df = pd.DataFrame(demand_matrix, columns=prices_to_predict)
print(train_demand_predictions_df.shape)

# segmented_train itself is left untouched: the demand curves are selected from
# demand_matrix by cluster label, so nothing depends on column positions and this
# cell can be re-run safely.
print(segmented_train.shape)  # original columns + 'cluster'


# Separate the data by cluster and calculate thresholds for each cluster
cluster_thresholds = {}  # Dictionary to store threshold dataframes for each cluster
cluster_replacements = {}  # Dictionary to store replacement dataframes for each cluster (if customer below threshold, give them this)

cluster_labels = segmented_train['cluster'].to_numpy()

for cluster_id in [int(label) for label in np.unique(cluster_labels)]:
    # Demand curves of the users in the current cluster
    cluster_demand = demand_matrix[cluster_labels == cluster_id]

    # Optimal training prices for all t<20, k<=12 for the current cluster; the k = 0
    # column is dropped, leaving shape (n_users_in_cluster, T, K).
    # Loop-local names are used on purpose so that the global `training_opt_prices`,
    # `threshold_avg` and `threshold_1percentile` from the previous step are not overwritten.
    cluster_opt_prices = np.array([
        get_prices_over_time_and_expected_revenue_k(prices_to_predict, user_demand, T=20, K=12)[0]
        for user_demand in cluster_demand
    ])[:, :, 1:]

    # Cutoff for low willingness to pay: 1st percentile across the users of the cluster.
    # Replacement price offered below the cutoff: 65th percentile.
    # Both are laid out with one row per inventory level and one column per period.
    cluster_threshold = np.percentile(cluster_opt_prices, 1, axis=0).T
    cluster_replacement = np.percentile(cluster_opt_prices, 65, axis=0).T

    # Zero the threshold where the row index is at least 8 smaller than the column index
    n_inventory, n_periods = cluster_threshold.shape
    no_threshold = np.arange(n_inventory)[:, None] <= np.arange(n_periods)[None, :] - 8
    cluster_threshold[no_threshold] = 0

    # Store the tables for the current cluster
    cluster_thresholds[cluster_id] = pd.DataFrame(cluster_threshold)
    cluster_replacements[cluster_id] = pd.DataFrame(cluster_replacement)

# Access the threshold and replacement dataframes for each cluster
for cluster_id in cluster_thresholds:
    threshold_df = cluster_thresholds[cluster_id]
    replacement_df = cluster_replacements[cluster_id]
    print(f"Threshold DataFrame for Cluster {cluster_id}:\n", threshold_df)
    print(f"Replacement DataFrame for Cluster {cluster_id}:\n", replacement_df)

In [None]:
# Persist the table currently bound to `threshold_1percentile` (row index included)
threshold_1percentile.to_csv(path_or_buf='threshold_1percentile.csv')

# threshold_avg.to_csv('threshold_avg.csv')

In [None]:
# Save the threshold and replacement table of every cluster.
# NOTE: "10percentile" in the file name is historical - the tables hold the 1st percentile.
# The name is kept unchanged; presumably these files are read elsewhere under this name.
for cluster_id in sorted(cluster_thresholds):
    cluster_thresholds[cluster_id].to_csv(f'threshold_10percentile{cluster_id}.csv')
    cluster_replacements[cluster_id].to_csv(f'threshold_replacement{cluster_id}.csv')

In [None]:
def threshold_func(opt_price, t, k, threshold, replacement):
    """Swap a price that falls below its threshold for the replacement price.

    Both tables are read at row ``k - 1`` and column ``20 - t``.

    Parameters:
        opt_price: Candidate price.
        t: Time argument; selects column ``20 - t``.
        k: Inventory argument; selects row ``k - 1``.
        threshold (pd.DataFrame): Table of cutoff prices.
        replacement (pd.DataFrame): Table of prices to use below the cutoff.

    Returns:
        The replacement entry when ``opt_price`` is strictly below the threshold
        entry, otherwise ``opt_price`` unchanged.
    """
    row, col = k - 1, 20 - t
    cutoff = threshold.iloc[row, col]
    return opt_price if not (opt_price < cutoff) else replacement.iloc[row, col]

In [None]:
#Serialized our logistic regression models that predicts demand probabilities across many prices (prices_to_predict) for 4 different clusters
import pickle

# One pickle per cluster model, named by cluster index (demand_logistic_reg_kmeans_<i>.pkl)
for cluster_id, cluster_model in enumerate(models):
    with open(f'demand_logistic_reg_kmeans_{cluster_id}.pkl', 'wb') as f:
        pickle.dump(cluster_model, f)

# pickle kmeans file
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans_model, f)

# with open('demand_logistic_reg.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)



In [None]:


'''  This function is called every time the agent needs to choose an action by the environment.

The input 'obs' is a 5 tuple, containing the following information:
-- new_buyer_covariates: a vector of length 3, containing the covariates of the new buyer.
-- last_sale: a tuple of length 2. The first element is the index of the agent that made the last sale, if it is NaN, then the customer did not make a purchase. 
    The second element is a numpy array of length n_agents, containing the prices that were offered by each agent in the last sale.
-- state: a vector of length n_agents, containing the current profit of each agent.
-- inventories: a vector of length n_agents, containing the current inventory level of each agent.
-- time_until_replenish: an integer indicating the time until the next replenishment, by which time your (and your opponent's, in part 2) remaining inventory will be reset to the inventory limit.

The expected output is a single number, indicating the price that you would post for the new buyer.
      
'''

def get_single_user_price(new_buyer_covariates, last_sale, state, inventories, time_until_replenish):
    """Price to post for one new buyer, using the per-cluster models and tables.

    Steps: assign the buyer to a cluster, predict the purchase probability at every
    candidate price with that cluster's model, solve the pricing table for the
    remaining periods and inventory, then apply that cluster's threshold and
    replacement tables.

    Uses the notebook-level objects ``kmeans_model``, ``models``,
    ``prices_to_predict``, ``cluster_thresholds`` and ``cluster_replacements``.

    Parameters:
        new_buyer_covariates: Indexable with the three covariates of the buyer.
        last_sale: Unused here.
        state: Unused here.
        inventories: Used as the inventory level K, so it must be a single integer
            here. NOTE(review): the cell docstring describes it as a per-agent
            vector - the caller has to pass this agent's own entry.
        time_until_replenish: Used as the number of periods left, clamped to 1..20.
            NOTE(review): assumes it counts the selling periods still to come
            (20 right after a replenishment) - confirm against the environment.

    Returns:
        The price to post.
    """
    covariates = [new_buyer_covariates[0], new_buyer_covariates[1], new_buyer_covariates[2]]
    cluster_id = int(kmeans_model.predict([covariates])[0])

    # Score the whole price grid for this buyer in one predict_proba call
    price_grid = np.asarray(prices_to_predict, dtype=float)
    buyer_grid = pd.DataFrame({
        'price_item': price_grid,
        'Covariate1': covariates[0],
        'Covariate2': covariates[1],
        'Covariate3': covariates[2],
    })
    demand_prediction = models[cluster_id].predict_proba(buyer_grid)[:, 1]

    # At least one period is needed for the table to have a first row; the threshold
    # tables are built for a 20-period horizon, hence the upper clamp
    periods_left = min(max(int(time_until_replenish), 1), 20)

    # Row 0 is the current period, the last column the current (full K) inventory
    opt_price = get_prices_over_time_and_expected_revenue_k(
        price_grid, demand_prediction, T=periods_left, K=inventories
    )[0][0][-1]

    # threshold_func reads column 20 - t, i.e. it expects the number of periods left
    return threshold_func(
        opt_price,
        periods_left,
        inventories,
        cluster_thresholds[cluster_id],
        cluster_replacements[cluster_id],
    )

In [None]:
import matplotlib.pyplot as plt
# Histogram (100 bins) of the optimal prices that are at most 150
prices_at_most_150 = training_opt_prices[training_opt_prices <= 150]
plt.hist(prices_at_most_150.flatten(), bins=100)

In [None]:
# Smoke test of the final function with inputs shaped like those of the pricepioneers.py agent function
get_single_user_price(
    new_buyer_covariates=[0.3, 0.4, 0.5],
    last_sale=0,
    state=0,
    inventories=12,
    time_until_replenish=20,
)

# Creating Static CSV of predicted price for test_set

# #Getting demand prdictions for test_user_info to feed into the bellman equations
# # demand_predictions: Each row represents a user and each column represents a price
# demand_predictions = []
# for row in test_user_info.itertuples(index=False, name='Pandas'):
#     demand_prediction = []
#     for price in prices_to_predict:
#         demand_prediction.append(get_prediction_logistic(model, price, [row.Covariate1, row.Covariate2, row.Covariate3]))
#     demand_predictions.append(demand_prediction)
# print('DONE')

In [None]:
# Purchase probabilities of every test user across the candidate price grid
test_demand_predictions = get_demand_predictions_clusters(
    df=test_user_info,
    prices_to_predict=prices_to_predict,
    cluster_log_models=models,
    kmeans_model=kmeans_model,
)

In [None]:

def get_single_step_revenue_maximizing_price_and_revenue(Vtplus1, price_options, demand_predictions):
    """Best single-step price and its expected revenue, both scaled by 0.95.

    Each candidate price p with purchase probability d is valued at
    ``d * p + (1 - d) * Vtplus1``. The first candidate with the strictly highest
    value wins.

    Parameters:
        Vtplus1: Value obtained when the item is not sold.
        price_options: Candidate prices.
        demand_predictions: Purchase probability for each candidate price.

    Returns:
        tuple: (0.95 * best price, 0.95 * best expected revenue).
    """
    best = (None, float('-inf'))
    for candidate_price, buy_prob in zip(price_options, demand_predictions):
        value = buy_prob * candidate_price + (1 - buy_prob) * Vtplus1
        # Strict comparison keeps the earliest candidate on ties
        if value > best[1]:
            best = (candidate_price, value)

    best_price, best_value = best
    return best_price*0.95, best_value*0.95


# Price every test user as a one-shot sale: the value of not selling is passed as 0
optimal_prices, optimal_rev = [], []
for user_demand in test_demand_predictions:
    best_price, best_rev = get_single_step_revenue_maximizing_price_and_revenue(0, prices_to_predict, user_demand)
    optimal_prices.append(best_price)
    optimal_rev.append(best_rev)


In [None]:
# Check: exactly one price and one expected revenue per test user.
# An assert stops the run here instead of displaying a False that is easy to miss.
assert len(optimal_rev) == len(optimal_prices) == len(test_user_info), (
    f"length mismatch: {len(optimal_rev)} revenues, {len(optimal_prices)} prices, "
    f"{len(test_user_info)} test users"
)

In [None]:
# save the optimal prices into csv for test_user_info
# (the frame is built directly instead of through a variable named `dict`, which would
# shadow the builtin for the rest of the session)
static_prices_submission = pd.DataFrame({
    'user_index': test_user_info.user_index,
    'price_item': optimal_prices,
    'expected_revenue': optimal_rev,
})
static_prices_submission.to_csv('static_prices_submission.csv', index=False)

In [None]:
'''  
import matplotlib.pyplot as plt
# Plot each row as a separate line
for i in range(data.shape[0]):
    plt.plot(data[i, :], marker='o', label=f'Row {i+1}')  # Plot each row as a line

# Add labels and title
plt.xlabel('T')
plt.ylabel('Optimal Price')
plt.title('Line Plot of Rows in a 5x10 NumPy Array')

# Add legend to identify each line
plt.legend()

# Display the grid for better readability
plt.grid(True)

# Show the plot
plt.show()
'''
