# 1. Loading Data

In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Location of the auction CSV, relative to the notebook's working directory.
dataset_path = "../input/real-time-advertisers-auction/Dataset.csv"
# Read the whole file into a DataFrame and preview the first rows.
ascendeum_data = pd.read_csv(dataset_path)
ascendeum_data.head()

# 2. Analyse Data

In [None]:
# Keep an untouched copy of the raw frame: later cells drop columns and rows from
# `ascendeum_data`, while `X_orig` is read again when reporting average revenue.
X_orig = ascendeum_data.copy()
# Summary statistics of the loaded data.
ascendeum_data.describe()

In [None]:
# Number of missing values in each column.
ascendeum_data.isna().sum()

In [None]:
# Distinct values per column; low counts point to categorical-like columns.
ascendeum_data.nunique()

In [None]:
# Column dtypes and non-null counts.
ascendeum_data.info()

----------------------------------------------------------------------------------------------------------------
### Inferences from analysing input data:
     1. There are 17 columns - 1 date time column, 15 Integer columns and 1 Float column
     2. Nearly 8 columns have fewer than 10 unique values, so they can be considered categorical columns
     3. There are no missing values in any of the given columns
     4. On average, 0.069 revenue is generated for 33.67 impressions
     5. The target, CPM, needs to be calculated

# 3. Feature Engineering

### CPM calculation and ingestion into the dataset

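CPM – cost per mille (cost per thousand impressions). It is calculated as revenue / impressions * 1000; the code below additionally scales `total_revenue` by 100 and uses `measurable_impressions` as the impression count. 'bids' and 'price' are measured in terms of CPM.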

In [None]:
def CPM(revenue, impressions):
    """Return ``revenue / impressions``, or 0 when ``impressions`` is falsy.

    The guard avoids a division by zero for rows without any impressions.
    The caller is responsible for any scaling (e.g. the x1000 "per mille" factor).
    """
    if not impressions:
        return 0
    return revenue / impressions

# Vectorised CPM: (total_revenue * 100) / measurable_impressions * 1000.
# This replaces a row-by-row DataFrame.apply, which runs a Python call per row.
# NOTE(review): the x100 revenue scaling is kept exactly as before; the markdown
# formula does not mention it - confirm the intended unit.
measurable_impressions = ascendeum_data['measurable_impressions']
has_impressions = measurable_impressions != 0
scaled_revenue = ascendeum_data['total_revenue'] * 100

ascendeum_data['CPM'] = (
    # Zero denominators are masked to NaN so the division never hits 0 ...
    (scaled_revenue / measurable_impressions.where(has_impressions) * 1000)
    # ... and those rows are then set to a CPM of 0, matching the guard in CPM().
    .where(has_impressions, 0)
)
ascendeum_data['CPM'].describe()

### Finding Correlation (HeatMap)

In [None]:
# Correlate numeric columns only: the frame still holds the non-numeric 'date'
# column, and DataFrame.corr() raises on such columns in pandas >= 2.0
# (older versions silently skipped them).
corr = ascendeum_data.select_dtypes(include='number').corr()
plt.figure(figsize=(18,9))
# vmin=0 colours every negative correlation like 0; the annotations still show the sign.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="RdYlGn",square=True, annot=True)
plt.show()

----------------------------------------------------------------------------------------------------------------
### Inferences from correlation analysis:
     1. 'integration_type_id' and 'revenue_share_percent' can be dropped as they have constant values throughout
     2. 'measurable_impressions' and 'total_revenue' can be dropped as they are highly correlated with 'total_impressions'

In [None]:
# Remove the columns singled out by the correlation analysis,
# then confirm the remaining schema.
columns_to_drop = [
    'integration_type_id',
    'revenue_share_percent',
    'measurable_impressions',
    'total_revenue',
]
ascendeum_data = ascendeum_data.drop(columns=columns_to_drop)
ascendeum_data.info()

In [None]:
# Recompute the correlation matrix on the reduced frame. Only numeric columns are
# used: the non-numeric 'date' column is still present and DataFrame.corr()
# raises on such columns in pandas >= 2.0.
corr = ascendeum_data.select_dtypes(include='number').corr()
plt.figure(figsize=(14,8))
# vmin=0 colours every negative correlation like 0; the annotations still show the sign.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="RdYlGn",square=True, annot=True)
plt.show()

----------------------------------------------------------------------------------------------------------------
### Handling Outliers

In [None]:
# Distribution of CPM before outlier removal (histogram + KDE on a density scale).
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
# bins=50 mirrors distplot's upper limit on the number of bins and keeps the
# histogram cheap even when extreme values stretch the range.
sns.histplot(ascendeum_data["CPM"], kde=True, stat="density", bins=50)

Remove the extremes/outliers from CPM by keeping only the rows between the 5th and 95th percentiles (the central 90% of the data).

In [None]:
# Keep only rows whose CPM lies between the 5th and 95th percentiles (inclusive).
cpm_low, cpm_high = ascendeum_data['CPM'].quantile([.05, .95])
# .copy() makes the filtered result an independent frame, so later column
# assignments do not trigger SettingWithCopyWarning.
ascendeum_data = ascendeum_data[ascendeum_data['CPM'].between(cpm_low, cpm_high)].copy()
# Pass the series as x= explicitly: relying on the positional argument is
# deprecated in seaborn 0.11 and is interpreted as `data` in newer releases.
sns.boxplot(x=ascendeum_data["CPM"], color="green")

In [None]:
# Distribution of CPM in the current frame (histogram + KDE on a density scale).
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
# bins=50 mirrors distplot's upper limit on the number of bins.
sns.histplot(ascendeum_data["CPM"], kde=True, stat="density", bins=50)

In [None]:
# Current (rows, columns) of the working frame.
ascendeum_data.shape

In [None]:
# Parse 'date' and derive the day of the week (pandas convention: Monday=0 ... Sunday=6).
# assign() returns a new frame instead of writing into what may be a filtered
# slice of the original data, which avoids SettingWithCopyWarning. The keyword
# arguments are evaluated in order, so 'weekday' sees the parsed 'date'.
ascendeum_data = ascendeum_data.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    weekday=lambda frame: frame['date'].dt.dayofweek,
)

Created a new column "weekday" (day of the week) from date to include the effect of date.

In [None]:
# Target: the computed CPM. Features: everything except the target and the raw date.
y = ascendeum_data['CPM']
X = ascendeum_data.drop(columns=['CPM', 'date'])

# 4. Modelling

### Split Dataset (Train, Validation)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation split; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=0)
train_X, val_X, train_y, val_y = split_parts

### 4.1 Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Single decision tree capped at 1000 leaves; fit() returns the fitted estimator.
DTR_model = DecisionTreeRegressor(max_leaf_nodes=1000, random_state=0).fit(train_X, train_y)

# Score on the held-out validation split.
val_predictions = DTR_model.predict(val_X)
dtr_mae = mean_absolute_error(val_y, val_predictions)
dtr_mse = mean_squared_error(val_y, val_predictions)
print("MAE:", dtr_mae)
print("MSE:", dtr_mse)

#### Iterating through various "Max leaf nodes" values for lowest MSE and MAE.

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree with the given leaf cap and score it on the validation data.

    Despite the name, the return value is a ``(mae, mse)`` tuple.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(train_X, train_y)
    val_preds = tree.predict(val_X)
    return (
        mean_absolute_error(val_y, val_preds),
        mean_squared_error(val_y, val_preds),
    )

# Sweep the leaf cap and report the validation errors for each setting.
# Errors are printed with two decimals: %d would truncate them to whole
# numbers and hide the differences between settings.
for max_leaf_nodes in [5, 50, 500, 1000, 2000, 5000]:
    my_mae, my_mse = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t MAE: %.2f \t\t MSE: %.2f" %(max_leaf_nodes, my_mae, my_mse))

### 4.2 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest with library-default hyperparameters and a fixed seed.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
for_preds = forest_model.predict(val_X)

# Validation errors.
forest_mae = mean_absolute_error(val_y, for_preds)
forest_mse = mean_squared_error(val_y, for_preds)
print("MAE:", forest_mae)
print("MSE:", forest_mse)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a random forest with the given per-tree leaf cap and score it on the validation data.

    This redefines the earlier decision-tree helper of the same name.
    Despite the name, the return value is a ``(mae, mse)`` tuple.
    """
    forest = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1).fit(train_X, train_y)
    val_preds = forest.predict(val_X)
    return (
        mean_absolute_error(val_y, val_preds),
        mean_squared_error(val_y, val_preds),
    )

# Sweep the leaf cap and report the validation errors for each setting.
# Errors are printed with two decimals: %d would truncate them to whole
# numbers and hide the differences between settings.
for max_leaf_nodes in [5, 50, 500, 1000, 2000, 5000]:
    my_mae, my_mse = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t MAE: %.2f \t\t MSE: %.2f" %(max_leaf_nodes, my_mae, my_mse))

### 4.3 LGBM Regressor

In [None]:
import xgboost as xgb
from xgboost import plot_importance
import lightgbm as lgb
from catboost import CatBoostRegressor as cbr

# LightGBM regressor: 200 boosting rounds, up to 41 leaves per tree.
model_lgb = lgb.LGBMRegressor(n_estimators=200, num_leaves=41)
model_lgb.fit(train_X, train_y)
lgb_preds = model_lgb.predict(val_X)

# Validation errors.
lgb_mae = mean_absolute_error(val_y, lgb_preds)
lgb_mse = mean_squared_error(val_y, lgb_preds)
print("MAE:", lgb_mae)
print("MSE:", lgb_mse)

### 4.4 XGB Regressor

In [None]:
# XGBoost regressor with a squared-error objective; other settings are library defaults.
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')
model_xgb.fit(train_X, train_y)
xgb_preds = model_xgb.predict(val_X)

# Validation errors.
xgb_mae = mean_absolute_error(val_y, xgb_preds)
xgb_mse = mean_squared_error(val_y, xgb_preds)
print("MAE:", xgb_mae)
print("MSE:", xgb_mse)

### 4.5 Cat Boost Regressor

In [None]:
# CatBoost regressor with a fixed seed and silenced training log.
# NOTE(review): early_stopping_rounds is set but fit() receives no eval_set -
# confirm whether early stopping actually takes effect here.
catboost_params = dict(random_seed=242, verbose=0, early_stopping_rounds=10)
model_cbr = cbr(**catboost_params)
model_cbr.fit(train_X, train_y)
cbr_preds = model_cbr.predict(val_X)

# Validation errors.
cbr_mae = mean_absolute_error(val_y, cbr_preds)
cbr_mse = mean_squared_error(val_y, cbr_preds)
print("MAE:", cbr_mae)
print("MSE:", cbr_mse)

----------------------------------------------------------------------------------------------------------------
### Summary:
MSE of 5 Regressor models evaluated:
    1. Decision Tree Regressor : 2537.40
    2. Random Forest Regressor : 2355.67
    3. LGBM Regressor          : 2362.56
    4. XGB Regressor           : 2373.63
    5. Cat Boost Regressor     : 2347.23
    
#### Cat Boost Regressor is considered for further investigation as it has the lowest MSE.

### Building ML Pipeline

Creating pipeline. 
Splitting categorical and numerical columns.
Performing One Hot encoder on categorical columns.
Model evaluation.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Columns passed through as numbers vs. columns that get one-hot encoded.
numerical_cols = [
    'geo_id',
    'order_id',
    'ad_unit_id',
    'total_impressions',
    'viewable_impressions',
]
categorical_cols = [
    'site_id',
    'ad_type_id',
    'device_category_id',
    'advertiser_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'weekday',
]

# Numerical data: fill missing values with a constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical data: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

model = cbr(random_seed=242, verbose=0, early_stopping_rounds=10)

# Preprocessing and model bundled into one estimator.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])
my_pipeline.fit(train_X, train_y)
preds = my_pipeline.predict(val_X)

# Validation errors of the full pipeline.
pipeline_mae = mean_absolute_error(val_y, preds)
pipeline_mse = mean_squared_error(val_y, preds)
print('MAE:', pipeline_mae)
print('MSE:', pipeline_mse)

### Cross-validation to validate the model for over-fitting

In [None]:
from sklearn.model_selection import cross_val_score

# Imputer + CatBoost pipeline scored with 5-fold cross-validation on the full X, y.
# Note that this rebinds `my_pipeline` and does not include the one-hot preprocessor.
my_pipeline = Pipeline([
    ('preprocessor', SimpleImputer()),
    ('model', cbr(random_seed=242, verbose=0, early_stopping_rounds=10)),
])

# sklearn reports *negative* MAE for this scorer, so negate to get positive errors.
scores = -cross_val_score(my_pipeline, X, y, scoring='neg_mean_absolute_error', cv=5)

print("MAE scores:", scores)

print("Average MAE score (across experiments):", scores.mean())

In [None]:
from sklearn.model_selection import cross_val_score

# Imputer + CatBoost pipeline scored with 5-fold cross-validation on the full X, y.
# Note that this rebinds `my_pipeline` and does not include the one-hot preprocessor.
my_pipeline = Pipeline([
    ('preprocessor', SimpleImputer()),
    ('model', cbr(random_seed=242, verbose=0, early_stopping_rounds=10)),
])

# sklearn reports *negative* MSE for this scorer, so negate to get positive errors.
scores = -cross_val_score(my_pipeline, X, y, scoring='neg_mean_squared_error', cv=5)

print("MSE scores:", scores)

print("Average MSE score (across experiments):", scores.mean())

# 5. Predictions and Evaluation

#### Defining Hybrid Ensemble Learning Model to increase prediction efficiency

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

# (label, estimator) pairs for the voting ensemble. The labels are only
# identifiers inside VotingRegressor; they do not describe the model type.
estimators = []

# The leaf cap is written out explicitly instead of being read from the global
# `max_leaf_nodes`, which only exists as a leftover loop variable of the tuning
# cells (last value: 5000). The cell therefore no longer depends on those cells
# having been executed, and the fitted tree is unchanged.
model1 = DecisionTreeRegressor(max_leaf_nodes=5000, random_state=0)
estimators.append(('logistic1', model1))
model2 = RandomForestRegressor(random_state=1)
estimators.append(('logistic2', model2))
model3 = lgb.LGBMRegressor(num_leaves=41, n_estimators=200)
estimators.append(('logistic3', model3))
model4 = xgb.XGBRegressor(objective='reg:squarederror')
estimators.append(('logistic4', model4))
model5 = cbr(random_seed=242, verbose=0, early_stopping_rounds=10)
estimators.append(('logistic5', model5))

# Fit every base model on the training split and predict through the ensemble.
ensemble = VotingRegressor(estimators)
ensemble.fit(train_X, train_y)
y_pred = ensemble.predict(val_X)

# Validation errors of the ensemble.
print('MAE:', mean_absolute_error(val_y, y_pred))
print('MSE:', mean_squared_error(val_y, y_pred))

In [None]:
# Actual CPM next to each model's validation predictions. The frame takes its
# index from `val_y`; the prediction arrays are aligned by position.
boost_df = pd.DataFrame({
    'Actual_CPM': val_y,
    'Pred_LGB_CPM': lgb_preds,
    'Pred_XGB_CPM': xgb_preds,
    'Pred_CBR_CPM': cbr_preds,
    'Pred_DTR_CPM': val_predictions,
    'Pred_RFR_CPM': for_preds,
    'Pred_Voting_CPM': y_pred,
})
boost_df.sample(n=10)

In [None]:
# Summary statistics of the actual CPM and each model's predicted CPM.
boost_df.describe()

In [None]:
# CPM was derived from 'measurable_impressions' (revenue * 100 / impressions * 1000),
# so revenue has to be reconstructed from that same column; using
# 'total_impressions' would not invert the formula. The column was dropped from
# the feature frame, so it is looked up by row label in the untouched copy `X_orig`.
val_impressions = X_orig.loc[val_X.index, 'measurable_impressions'].values

revenue_df = pd.DataFrame({'Actual_Impressions': val_impressions,  'Actual_CPM': val_y,
                           'Pred_Voting_CPM': boost_df['Pred_Voting_CPM'].values})

# Invert the CPM formula: revenue = CPM * impressions / (1000 * 100).
revenue_df['Pred_Revenue'] = revenue_df['Pred_Voting_CPM'] * revenue_df['Actual_Impressions'] / (1000 * 100)
# A negative CPM prediction would imply negative revenue; floor those at zero.
revenue_df['Pred_Revenue'] = revenue_df['Pred_Revenue'].clip(lower=0)
revenue_df.sample(n=10)

In [None]:
# Summary statistics of impressions, actual/predicted CPM and predicted revenue.
revenue_df.describe()

# 6. Questions

## 1. What is the potential revenue range our publisher can make in July?

In [None]:
# Both figures are per-row means, not monthly totals: the first over all raw
# rows, the second over the validation rows only.
june_avg_revenue = np.round(X_orig["total_revenue"].mean(), 2)
july_pred_revenue = np.round(revenue_df["Pred_Revenue"].mean(), 2)
print('Average revenue of june month:', june_avg_revenue)
print('Predicted approximate revenue for july month:', july_pred_revenue)

----------------------------------------------------------------------------------------------------------------
### Solution 1: Approximately our publisher in July can make revenue in the range of 0.05 to 0.07.
----------------------------------------------------------------------------------------------------------------

## 2. What reserve prices can he/she set?

In [None]:
# The reported figures are the maximum actual CPM and the maximum
# ensemble-predicted CPM on the validation rows.
actual_cpm_max = np.round(boost_df["Actual_CPM"].max(), 2)
predicted_cpm_max = np.round(boost_df["Pred_Voting_CPM"].max(), 2)
print('Reserve price of june month:', actual_cpm_max)
print('Predicted approximate Reserve price for july month:', predicted_cpm_max)

----------------------------------------------------------------------------------------------------------------
### Solution 2: Predicted reserve prices one can set in the range of 522.15 to 526.92.
----------------------------------------------------------------------------------------------------------------