In [3]:
import pickle
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

Below is a dummy ML model built from linear regression:

In [2]:
X = np.random.rand(3,3)
y = np.random.rand(3,2)
print(X, y)

[[0.30092572 0.51830574 0.98478741]
 [0.93400601 0.31933276 0.08665472]
 [0.58195715 0.43230515 0.48727585]] [[0.21062504 0.58422818]
 [0.24512291 0.16850848]
 [0.20312982 0.78691849]]


In [3]:
model = LinearRegression()
model.fit(X,y)
model.predict(np.array([1,2,3]).reshape(1,-1))[0]

array([ 0.7243358 , -8.09688886])

In [4]:
picklefile = open('trained_model', 'wb')
#pickle the object and store it in a file
pickle.dump(model, picklefile)

In [5]:
#check that the object is correctly pickled and works when unpickled
del model
picklefile = open('trained_model', 'rb')
new_model = pickle.load(picklefile)
new_model.predict(np.array([1,2,3]).reshape(1,-1))

array([[ 0.7243358 , -8.09688886]])

# Price Pioneers Model

In [6]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Assuming 'train_pricing_decisions' is a CSV file in the 'data' folder
train_pricing_decisions = pd.read_csv('train_prices_decisions_2024.csv')
# Split the data into training and validation sets (70-30 split)
train_data, val_data = train_test_split(train_pricing_decisions, test_size=0.3, random_state=42)
# Import the actual testing dataset
test_user_info = pd.read_csv('test_user_info_2024.csv')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

X = train_pricing_decisions[['price_item', 'Covariate1', 'Covariate2', 'Covariate3']]
y = train_pricing_decisions['item_bought']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the train set
y_pred_rf_train = rf_model.predict(X_train)
y_pred_prob_rf_train = rf_model.predict_proba(X_train)[:, 1]

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
y_pred_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the Random Forest model's performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

print('test_set:', accuracy_rf, roc_auc_rf)

# Calculate the F1 scores
print("f1_score on train_set:", f1_score(y_train, y_pred_rf_train))
print("f1_score on test_set:", f1_score(y_test, y_pred_rf))

0.9778 0.9983919712623598
1.0 1.0
f1_score on train_set: 1.0
f1_score on test_set: 0.9780978689818469


In [60]:
import numpy as np
import pandas as pd

min_price_threshold = np.percentile(train_pricing_decisions['price_item'], 25)
prices_to_predict = np.arange(min_price_threshold, train_pricing_decisions['price_item'].max()+train_pricing_decisions['price_item'].mean(), 4)

# Check the result
print(len(prices_to_predict))
print(prices_to_predict.min())
print(prices_to_predict.max())

132
40.00028460826181
564.0002846082618


In [None]:
# This code generates revenue-maximizing prices for each customer in the test dataset by calculating the predicted demand for each price, 
# selecting the price that maximizes expected revenue, and assigning it to each customer.

# The code uses vectorized operations to process the entire dataset at once, avoiding explicit for loops by performing matrix manipulations 
# and element-wise operations to calculate and select the revenue-maximizing price for each customer.


import numpy as np
import pandas as pd

def predict_optimal_price(df, prices_to_predict, rf_model):
    """
    Predicts the optimal price for each customer in the test set based on the trained model.
    
    Parameters:
    - df: DataFrame containing the test data with covariates for each customer.
    - prices_to_predict: List or array of possible prices to predict.
    - rf_model: The trained Random Forest model used to predict the demand.

    Returns:
    - df: The test DataFrame with an additional column for the predicted price.
    """
    expanded_covariates = pd.DataFrame(np.tile(df[['Covariate1', 'Covariate2', 'Covariate3']].values, (len(prices_to_predict), 1)),
                                       columns=['Covariate1', 'Covariate2', 'Covariate3'])

    expanded_prices = np.repeat(prices_to_predict, len(df))
    
    expanded_data = pd.DataFrame({
        'price_item': expanded_prices,
        'Covariate1': expanded_covariates['Covariate1'],
        'Covariate2': expanded_covariates['Covariate2'],
        'Covariate3': expanded_covariates['Covariate3']
    })

    predictions = rf_model.predict_proba(expanded_data)[:, 1]  
    predictions_matrix = predictions.reshape(len(df), len(prices_to_predict))
    revenues_matrix = predictions_matrix * prices_to_predict
    max_revenue_prices = prices_to_predict[np.argmax(revenues_matrix, axis=1)]
    df['predicted_price'] = max_revenue_prices
    demand_prediction_df = pd.DataFrame(predictions_matrix, columns=prices_to_predict)
    
    return df, demand_prediction_df



Unnamed: 0,user_index,Covariate1,Covariate2,Covariate3,predicted_price
0,50000,0.124203,2.877793,12.969450,564.000285
1,50001,7.325681,1.082307,6.437379,556.000285
2,50002,3.935609,0.266604,5.913078,564.000285
3,50003,4.448206,1.531923,4.775433,560.000285
4,50004,5.133770,0.418046,10.326414,564.000285
...,...,...,...,...,...
49995,99995,4.585491,6.557460,6.702900,392.000285
49996,99996,0.642189,2.621895,5.493393,392.000285
49997,99997,7.008525,6.470774,12.204690,552.000285
49998,99998,5.753639,1.354022,8.556336,304.000285


In [95]:
train_df_with_prices, train_demand_predictions = predict_optimal_price(train_pricing_decisions, prices_to_predict, rf_model)
train_demand_predictions

Unnamed: 0,40.000285,44.000285,48.000285,52.000285,56.000285,60.000285,64.000285,68.000285,72.000285,76.000285,...,528.000285,532.000285,536.000285,540.000285,544.000285,548.000285,552.000285,556.000285,560.000285,564.000285
0,0.88,0.99,0.02,0.40,0.97,0.00,1.00,0.02,1.00,0.94,...,0.00,0.02,1.00,0.02,0.01,0.00,1.00,0.99,1.00,1.00
1,0.99,1.00,0.46,1.00,0.01,0.11,0.02,1.00,1.00,0.99,...,1.00,0.25,1.00,0.47,1.00,0.93,1.00,0.71,1.00,0.75
2,1.00,0.99,0.59,1.00,0.05,0.98,1.00,0.05,0.66,0.12,...,0.95,0.31,1.00,0.43,1.00,0.80,0.94,0.17,0.99,0.95
3,1.00,1.00,1.00,1.00,0.97,0.69,1.00,1.00,0.03,0.96,...,1.00,0.06,0.23,0.92,1.00,0.00,0.71,0.99,0.95,0.09
4,1.00,0.99,1.00,1.00,1.00,1.00,1.00,1.00,0.99,0.24,...,0.95,0.00,0.84,0.99,1.00,0.99,1.00,0.96,1.00,0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.00,0.00,0.00,0.00,0.02,0.00,0.00,0.00,0.01,0.08,...,0.34,0.03,0.24,0.01,0.00,0.01,0.00,0.00,0.00,0.00
49996,0.00,0.00,0.00,0.00,0.01,0.00,0.00,0.15,0.00,0.00,...,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.37,0.01,0.00
49997,0.00,0.00,0.00,0.07,0.00,0.00,0.00,0.00,0.02,0.22,...,0.00,0.00,0.00,0.24,0.00,0.00,0.00,0.00,0.00,0.03
49998,0.00,0.04,0.00,0.01,0.01,0.00,0.00,0.00,0.02,0.00,...,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [98]:
import numpy as np

def get_single_step_revenue_maximizing_price_and_revenue_k(Vtplus1k, Vtplus1kminus1, price_options, demand_predictions):
    price_options = np.array(price_options, dtype=np.float64)
    demand_predictions = np.array(demand_predictions, dtype=np.float64)
    rev_list = (price_options + Vtplus1kminus1) * demand_predictions + (1 - demand_predictions) * Vtplus1k
    opt_index = np.argmax(rev_list)
    return price_options[opt_index], rev_list[opt_index]

def get_prices_over_time_and_expected_revenue_k(prices, demand_predictions, T, K):
    prices = np.array(prices, dtype=np.float64)
    demand_predictions = np.array(demand_predictions, dtype=np.float64)
    opt_price_list = np.zeros((T, K + 1), dtype=np.float64)
    V = np.zeros((T + 1, K + 1), dtype=np.float64)

    for t in range(T - 1, -1, -1):
        V_t_k = V[t + 1, 1:]
        V_t_k_minus_1 = V[t + 1, :-1]
        rev_list = (prices + V_t_k_minus_1[:, None]) * demand_predictions + (1 - demand_predictions) * V_t_k[:, None]
        opt_index = np.argmax(rev_list, axis=1)
        opt_prices = prices[opt_index]
        max_values = np.max(rev_list, axis=1)
        V[t, 1:] = max_values
        opt_price_list[t, 1:] = opt_prices
    return opt_price_list, V





In [None]:
import numpy as np
import pandas as pd


opt_prices = []
for user in train_demand_predictions:
    opt_prices.append(get_prices_over_time_and_expected_revenue_k(prices_to_predict, user, T=20, K=12)[0])

training_opt_prices = np.array(opt_prices)[:,:,1:]

threshold_avg = pd.DataFrame(np.average(training_opt_prices, axis=0)).T

threshold_matrix_10percentile = []
for k in training_opt_prices.T:
    threshold_list = []
    for t in k:
        threshold_list.append(np.percentile(t, 10))
    threshold_matrix_10percentile.append(threshold_list)
threshold_10percentile = pd.DataFrame(threshold_matrix_10percentile)
threshold_10percentile[19] = 0

def threshold_func(opt_price, k, t):
    if opt_price < threshold_10percentile.iloc[k, 20-t]:
        return threshold_avg.iloc[k, 20-t]
    else:
        return opt_price


In [100]:
import pickle

with open('randomforrest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# Static_prices_submission.csv

In [97]:
test_df = pd.read_csv('test_user_info_2024.csv')
test_df_with_prices, test_demand_prediction = predict_optimal_price(test_df, prices_to_predict, rf_model)
test_df_with_prices

user_index = test_df['user_index']
predicted_price = test_df_with_prices['predicted_price']
price_index = np.array([np.where(prices_to_predict == price)[0][0] for price in predicted_price])
predicted_demand = test_demand_prediction.values[np.arange(len(test_demand_prediction)), price_index]
expected_revenue = predicted_demand * predicted_price

submission_df = pd.DataFrame({
    'user_index': user_index,
    'price_item': predicted_price,
    'expected_revenue': expected_revenue
})

submission_df.to_csv('static_prices_submission.csv', index=False)
