In [200]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm
import time

In [201]:
!pip3 install xgboost



In [202]:
df = pd.read_csv('data/train_prices_decisions.csv')

In [203]:
df

Unnamed: 0,user_index,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1,item_bought
0,0,0.783155,2.378872,8.342151,72.785084,37.007394,1
1,1,4.127142,4.024276,12.501464,51.414629,54.021070,-1
2,2,2.483091,8.986873,7.125810,31.885499,37.683472,0
3,3,1.534569,6.114015,6.885258,44.781299,61.870340,-1
4,4,5.599879,6.106288,7.010118,37.955789,63.114027,0
...,...,...,...,...,...,...,...
29995,29995,1.247246,0.695930,17.091256,57.999741,22.609823,1
29996,29996,8.917777,0.866071,5.220416,38.063565,39.093621,1
29997,29997,1.157068,2.337082,10.423779,73.827063,46.472392,1
29998,29998,2.293585,1.976625,8.266167,30.949918,53.437094,0


In [204]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)

In [205]:
train_df

Unnamed: 0,user_index,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1,item_bought
10303,10303,3.211214,1.594426,8.888642,39.489511,48.672286,1
9837,9837,4.805218,1.545974,6.642416,51.158962,45.747740,1
21833,21833,0.678090,0.180780,8.333445,53.016125,65.205344,1
11577,11577,0.739920,1.306557,7.599310,57.200833,45.593982,1
13646,13646,4.163777,5.626690,30.350818,40.597017,61.482608,0
...,...,...,...,...,...,...,...
27852,27852,4.104180,0.572869,6.999178,69.618467,52.900313,1
23605,23605,5.491053,4.093490,7.477795,7.722987,42.027973,0
1318,1318,0.120473,1.883329,4.668193,15.337509,50.362293,-1
25299,25299,2.908086,3.423101,10.497399,107.876954,92.213963,-1


In [206]:
test_df

Unnamed: 0,user_index,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1,item_bought
13125,13125,2.825197,4.324851,6.421547,78.079086,47.085734,-1
14635,14635,0.607966,4.604145,6.621343,88.450332,56.710216,-1
19429,19429,2.090662,5.603561,8.906568,78.012646,36.834364,1
4381,4381,7.543210,0.180546,7.012280,35.495591,41.859573,1
7659,7659,2.120467,0.222236,11.739922,63.233705,43.647200,1
...,...,...,...,...,...,...,...
27390,27390,0.040140,7.477506,8.439343,44.479095,40.646174,-1
19093,19093,2.177727,4.088750,4.886242,46.685612,39.655723,1
4235,4235,4.965604,0.021124,5.724033,44.307657,46.930956,1
23361,23361,0.107553,0.207441,3.961483,45.235673,63.958206,-1


In [207]:
train_df.to_csv('data/train_data.csv')
test_df.to_csv('data/test_data.csv')

In [208]:
X_train = train_df.drop(['item_bought','user_index'],axis = 1)
y_train = train_df['item_bought']
X_test = test_df.drop(['item_bought','user_index'], axis = 1)
y_test = test_df['item_bought']

y_train = np.where(y_train == -1, 2, y_train)
y_test = np.where(y_test == -1, 2, y_test)

In [209]:
X_train


Unnamed: 0,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1
10303,3.211214,1.594426,8.888642,39.489511,48.672286
9837,4.805218,1.545974,6.642416,51.158962,45.747740
21833,0.678090,0.180780,8.333445,53.016125,65.205344
11577,0.739920,1.306557,7.599310,57.200833,45.593982
13646,4.163777,5.626690,30.350818,40.597017,61.482608
...,...,...,...,...,...
27852,4.104180,0.572869,6.999178,69.618467,52.900313
23605,5.491053,4.093490,7.477795,7.722987,42.027973
1318,0.120473,1.883329,4.668193,15.337509,50.362293
25299,2.908086,3.423101,10.497399,107.876954,92.213963


In [431]:
model = xgb.XGBClassifier(objective='multi:softprob', num_class=3)


In [433]:
parameters = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [10, 20, 50],
    'colsample_bytree': [0.3, 0.7]
}


In [434]:
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=5, verbose=True)

grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [435]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50}
Best Score:  0.9455416666666668


In [436]:
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)


In [437]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)


0.9466666666666667


In [438]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      1137
           1       0.95      0.97      0.96      2982
           2       0.94      0.94      0.94      1881

    accuracy                           0.95      6000
   macro avg       0.94      0.94      0.94      6000
weighted avg       0.95      0.95      0.95      6000



In [470]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd


# Initialize and train the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.9521666666666667

In [471]:
demand = model.predict_proba(X_test)
print(demand)
array_df = pd.DataFrame(demand, columns=['Prob1', 'Prob2','Prob3'])
result_df = pd.concat([pd.DataFrame(y_test,columns=['label']), array_df],axis = 1)
result_df

[[0.   0.01 0.99]
 [0.   0.01 0.99]
 [0.   0.71 0.29]
 ...
 [0.   1.   0.  ]
 [0.   0.01 0.99]
 [0.   0.   1.  ]]


Unnamed: 0,label,Prob1,Prob2,Prob3
0,2,0.00,0.01,0.99
1,2,0.00,0.01,0.99
2,1,0.00,0.71,0.29
3,1,0.01,0.99,0.00
4,1,0.00,1.00,0.00
...,...,...,...,...
5995,2,0.00,0.00,1.00
5996,1,0.01,0.82,0.17
5997,1,0.00,1.00,0.00
5998,2,0.00,0.01,0.99


In [439]:
demand = best_model.predict_proba(X_test)
print(demand)
array_df = pd.DataFrame(demand, columns=['Prob1', 'Prob2','Prob3'])
result_df = pd.concat([pd.DataFrame(y_test,columns=['label']), array_df],axis = 1)
result_df

[[0.02816985 0.18161516 0.790215  ]
 [0.01426836 0.03673916 0.94899255]
 [0.02636559 0.59711766 0.37651676]
 ...
 [0.01029388 0.9633928  0.02631336]
 [0.0181659  0.05962325 0.9222109 ]
 [0.0213912  0.15950346 0.8191054 ]]


Unnamed: 0,label,Prob1,Prob2,Prob3
0,2,0.028170,0.181615,0.790215
1,2,0.014268,0.036739,0.948993
2,1,0.026366,0.597118,0.376517
3,1,0.027520,0.962350,0.010130
4,1,0.007258,0.978983,0.013759
...,...,...,...,...
5995,2,0.003994,0.008910,0.987096
5996,1,0.034490,0.649400,0.316109
5997,1,0.010294,0.963393,0.026313
5998,2,0.018166,0.059623,0.922211


In [440]:
X_test.loc[[13125]]

Unnamed: 0,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1
13125,2.825197,4.324851,6.421547,78.079086,47.085734


In [441]:
best_model.save_model("best_model.xgb")



In [442]:
item0_max = (train_df['price_item_0'].max())
item0_min = (train_df['price_item_0'].min())
item1_max = (train_df['price_item_1'].max())
item1_min = (train_df['price_item_1'].min())
print(item0_min, item0_max, item1_min, item1_max)
item0_range = np.linspace(item0_min, item0_max, 10)
print(item0_range)

3.301688718503524 373.26491426696936 0.01 380.69698598475
[  3.30168872  44.40871378  85.51573884 126.6227639  167.72978896
 208.83681402 249.94383908 291.05086415 332.15788921 373.26491427]


In [443]:
new_df = pd.read_csv('data/test_user_info.csv')
new_df_index = new_df[['user_index']]
new_df = new_df.drop(['user_index'],axis =1)
new_df


Unnamed: 0,Covariate1,Covariate2,Covariate3
0,4.058168,9.823123,6.086879
1,6.423253,0.859018,8.990928
2,1.793184,0.704047,5.865989
3,3.040144,3.961137,13.536446
4,0.161440,1.738395,5.801892
...,...,...,...
29995,5.745474,2.170433,7.324661
29996,0.138668,0.499738,14.336209
29997,0.375095,3.356108,12.063857
29998,3.425891,4.192887,4.606790


In [444]:
new_df_index

Unnamed: 0,user_index
0,30000
1,30001
2,30002
3,30003
4,30004
...,...
29995,59995
29996,59996
29997,59997
29998,59998


In [445]:
len(new_df)

30000

In [480]:
item0_max = 90 
item0_min = 3
item1_max = 100
item1_min = 3

In [481]:
def predict_optimal(item, item0_max, item0_min, item1_max, item1_min):
    index = item.index[0]
    start_time = time.time()
    model = xgb.XGBClassifier(objective='multi:softprob', num_class=3)
    model.load_model('agents/UpsyDaisy/best_model.xgb')
    revenue_diff = np.inf
    old_revenue = 0
    new_revenue = 0
    price_pair = []
    item0_range = np.linspace(item0_min, item0_max, 10)
    item1_range = np.linspace(item1_min, item1_max, 10)
    max_revenue = 0
    for i in item0_range:
        for j in item1_range:
            item.loc[index, 'price_item_0'] = i
            item.loc[index, 'price_item_1'] = j
            demand = model.predict_proba(item)
            demand0 = demand[0][0]
            demand1 = demand[0][1]
            revenue = i * demand0 + j * demand1
            if revenue > max_revenue:
                max_revenue = revenue
                price_pair = [i,j]
    
    # print('Optimal Price Pair', price_pair)
    # print('Optimal Revenue', max_revenue) 
    new_revenue = max_revenue
    revenue_diff = new_revenue
    
    while revenue_diff >= 0.1:
        max_revenue = 0
        interval = item0_range[1] - item0_range[0]
        item0_range = np.linspace(price_pair[0] - interval/2, price_pair[0] + interval/2, 5)
        item1_range = np.linspace(price_pair[1] - interval/2, price_pair[1] + interval/2, 5)
        price_pair = []

        for i in item0_range:
            for j in item1_range:
                item.loc[index, 'price_item_0'] = i
                item.loc[index, 'price_item_1'] = j
                demand = model.predict_proba(item)
                demand0 = demand[0][0]
                demand1 = demand[0][1]
                revenue = i*demand0 + j * demand1
                if revenue > max_revenue:
                    max_revenue = revenue
                    price_pair = [i,j]
        old_revenue = new_revenue
        new_revenue = max_revenue
        revenue_diff = new_revenue - old_revenue
    end_time = time.time()
    # print('Running Time is', end_time-start_time)
    return price_pair, new_revenue
                

In [482]:
predict_optimal(new_df.iloc[[10000]], item0_max, item0_min, item1_max, item1_min)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item.loc[index, 'price_item_0'] = i
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item.loc[index, 'price_item_1'] = j
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item.loc[index, 'price_item_0'] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

([38.494791666666664, 38.58072916666667], 37.995663414398834)

In [483]:
def prediction(new_df, new_df_index, item0_max, item0_min, item1_max, item1_min):
    start_time = time.time()
    for i in tqdm(range(len(new_df))):
        item = new_df.loc[[i]]
        price_pair, new_revenue = predict_optimal(item, item0_max, item0_min, item1_max, item1_min)
        new_df_index.loc[i, 'price_item_0'] = price_pair[0]
        new_df_index.loc[i, 'price_item_1'] = price_pair[1]
        new_df_index.loc[i, 'expected_revenue'] = new_revenue
    end_time = time.time()
    print('Average Running Time is', (end_time - start_time)/len(new_df))
    return new_df_index

        
        

In [484]:
final_output2 = prediction(new_df, new_df_index, item0_max, item0_min, item1_max, item1_min)

 41%|████      | 12319/30000 [14:56<18:42, 15.75it/s] 

In [465]:
final_output2

Unnamed: 0,user_index,price_item_0,price_item_1,expected_revenue
0,30000,35.247396,35.371094,33.847392
1,30001,57.979167,70.687500,68.048716
2,30002,96.343750,62.326389,58.553060
3,30003,41.666667,46.111111,43.358268
4,30004,9.079427,12.342882,10.720489
...,...,...,...,...
995,30995,41.666667,46.111111,42.836929
996,30996,94.833333,35.333333,32.359254
997,30997,48.916667,55.076389,52.605246
998,30998,47.255208,50.566840,49.065927


In [469]:
final_output2.describe()

Unnamed: 0,user_index,price_item_0,price_item_1,expected_revenue
count,1000.0,1000.0,1000.0,1000.0
mean,30499.5,51.452883,51.374974,47.510472
std,288.819436,21.884063,18.256473,18.024781
min,30000.0,7.56901,8.944444,4.594621
25%,30249.75,38.494792,39.751302,36.854567
50%,30499.5,45.291667,47.508247,44.674067
75%,30749.25,58.904297,60.740451,56.669871
max,30999.0,96.419271,106.419271,102.37476


In [378]:
final_output

Unnamed: 0,user_index,price_item_0,price_item_1,expected_revenue
0,30000,26.420866,28.606212,26.648402
1,30001,396.355897,403.787969,51.862877
2,30002,48.211787,65.427731,60.252681
3,30003,99.218081,44.338531,42.672863
4,30004,7.574664,61.995425,5.239517
...,...,...,...,...
29995,59995,47.228126,62.891371,45.673282
29996,59996,38.093231,34.470589,17.394620
29997,59997,42.942620,41.970225,37.474166
29998,59998,396.384092,37.120836,35.923175


In [382]:
final_output.to_csv('part2_static_prices_submission.csv')

In [376]:
final_output.sum()['expected_revenue']

1659320.72732995

In [None]:
item = new_df.iloc[[0]]
item['price_item_0'] = 1
item['price_item_1'] = 2
item

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item['price_item_0'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item['price_item_1'] = 2


Unnamed: 0,user_index,Covariate1,Covariate2,Covariate3,price_item_0,price_item_1
0,30000,4.058168,9.823123,6.086879,1,2
