In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score 
from sklearn.metrics import mean_absolute_percentage_error

from datetime import datetime as dt
from functools import reduce
from yellowbrick.regressor import residuals_plot
from transformations import transform_one_column, select_best_feature, feature_imp_random_forest

import time
import ipywidgets as widgets
from ipywidgets import FileUpload
import datetime

from IPython.display import display
import io
import re
from scipy.optimize import minimize, LinearConstraint

import holidays

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score 
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

from datetime import datetime as dt
from functools import reduce
from yellowbrick.regressor import residuals_plot
from transformations import transform_one_column, select_best_feature, feature_imp_random_forest

import time
import datetime
import ipywidgets as widgets
from ipywidgets import FileUpload

from IPython.display import display
import io
import re
from scipy.optimize import minimize, LinearConstraint

import holidays

import panel as pn
pn.extension()

Development begins here.

In [8]:
class MediaMixModel:
    """
        Media mix model for a single client / country / target combination.

        Wraps a Dataset built from the country's data matrix and applies
        adstock (carryover) and diminishing-returns transformations to the
        training split.
    """
    
    def __init__(self, client_name, country, target, data_dict=None):
        """
            client_name: (str) name of client for this MediaMixModel object
            country: (str) country; also used as the key into data_dict
            target: (str) name of dependent (we want to predict this) variable (i.e. Revenue or Orders)
            data_dict: (dict) country name : data matrix, output of clean_and_merge
                       may be None, in which case the model is created without data
        """
        
        self.client_name = client_name
        self.target = target
        self.country = country
        print(f"This model aims to predict {self.target} in {self.country} for {self.client_name}.")
        
        # data_dict defaults to None, so only index into it when it was supplied;
        # fit_ad_parameters reports the missing data instead of crashing here
        if data_dict is None:
            self.data = None
        else:
            self.data = Dataset(df=data_dict[country], target=self.target)
        
        # filled in by fit_ad_parameters
        self.data_matrix = None
    
    def fit_ad_parameters(self):
        """
            fits the adstock and diminishing returns transformations on the training data
            and stores the transformed data in self.data_matrix
            
            returns:
                None
            
            calls apply_transformations
            
        """
        
        if self.data is None:
            print("The model has no data to train on!")
            return
        
        self.data_matrix = self.apply_transformations()
        
        print("Transformations Complete!")
        return
    
    
    def apply_transformations(self):
        """
            applies the best adstock transformation and then the best diminishing returns
            transformation to the training split of self.data
            
            the training data must contain a 'holiday' column; it is carried over
            untransformed together with the impression columns and the target
            
            output:
                data_matrix: (DataFrame) post-transformations, with nulls replaced by 0
        """
        
        x, y = self.data.get_train()
        raw = pd.concat([x, y], axis=1)
        response_var = self.target

        # apply adstock/carryover
        adstocked = self.best_adstock(raw, response_var)
        imp_vars = self.get_impression_vars(raw)
        df = pd.concat([adstocked, raw[[response_var]]], axis=1)
        
        # apply diminishing returns
        # NOTE(review): best_diminishing_returns is not defined in this class; it is
        # expected to exist at module level (another cell or import) -- confirm
        tdf = best_diminishing_returns(df, response_var)
        data_matrix = pd.concat([tdf, raw[imp_vars + ['holiday', response_var]]], axis=1)

        # this counts the columns that contain at least one null, not individual null cells
        print("number of nulls post transformations =", data_matrix.isna().any().sum())
        return data_matrix.fillna(0)
    
    def best_adstock(self, df, response_var='Revenue'):
        """
            input:
                df: sales+ad dataframe
                response_var: (str) column of df the transformed media columns are correlated against
            output
                df_best: dataframe (same index as df) holding, for each media cost column,
                         the adstock transformation with the highest correlation to response_var
        """
        
        df_best = pd.DataFrame(index=df.index)
        
        media_vars = self.get_media_vars(df)
        response = df[response_var].astype(float)

        for col in media_vars:
            tdf, _ = self.apply_adstock(df, col)

            # constant transformed columns have an undefined (NaN) correlation; drop them
            # so the choice is made among valid candidates only
            correlations = tdf.corrwith(response).dropna()
            if correlations.empty:
                print(f" {col} || no valid correlation with {response_var}, skipped")
                continue

            feature_name = correlations.idxmax()
            r_value = correlations.loc[feature_name]

            print(f" {feature_name} || r-value of {r_value}")

            df_best[feature_name] = tdf[feature_name]

        return df_best

    def apply_adstock(self, df, column_name):
        """
            inputs:
                df: sales and ad data 
                column_name: (str) valid column name in df

            output:
                df_transformations: (DataFrame) indexed like df, one column per
                                    (alpha, L, theta) combination of the search grid
                (alpha, L, theta): the LAST grid point that was evaluated, not the best one;
                                   kept only so existing callers can still unpack two values
        """

        v = df[column_name].values
        transformed = {}

        for alpha in np.arange(0.1, 1, 0.1):
            for L in np.arange(0, 30, 2):
                for theta in [0]:
                    col = f"{column_name}_alpha={alpha}L={L}theta={theta}"
                    # NOTE(review): carryover is not defined in this class; it is expected
                    # to exist at module level (another cell or import) -- confirm.
                    # np.asarray keeps the values positional, so df's index is applied as-is
                    transformed[col] = np.asarray(carryover(x=v, alpha=alpha, L=L, theta=theta))

        # build the frame in one go; adding columns one by one fragments the DataFrame
        df_transformations = pd.DataFrame(transformed, index=df.index)

        return df_transformations, (alpha, L, theta)
    
    @staticmethod
    def get_media_vars(df):
        """
            returns the names of the columns of df that contain both "Media" and "Cost"
        """
        return [col for col in df.columns if "Media" in col and "Cost" in col]

    @staticmethod
    def get_impression_vars(df):
        """
            returns the names of the columns of df that contain "Impression"
        """
        return [col for col in df.columns if "Impression" in col]
    
    

    
    
    
        
    
        
        
    

In [None]:
class Dataset:
    """
        Holds a feature matrix and target series, ordered by index, together with
        a positional train / validation / test split.
    """
    
    def __init__(self, df, target):
        """
            df: (DataFrame) features plus the target column; sorted by index (ascending) before splitting
            target: (str) name of the target column in df
        """
        df = df.sort_index(ascending=True)
        self.y = df[target]
        self.x = df.drop(columns=[target])
        self.split(validation_size=0.1, test_size=0.1)
        
    
    def split(self, validation_size=0.1, test_size=0.1):
        """
            generates the train/validation/test split based on row order
            for example, the first 80% of the data goes to train, the next 10% of the data goes to validation, the last 10% goes to testing
            
            if validation_size + test_size exceeds 0.5, both fall back to 0.1
            
            the split is stored in self.idx as positional row indices under the keys
            'train', 'valid' and 'test'
        """
        if validation_size + test_size > 0.5:
            validation_size = 0.1
            test_size = 0.1
        
        n = len(self.x)
        
        # boundaries between train | valid | test, as row positions
        b1 = int(n * (1 - validation_size - test_size))
        b2 = int(n * (1 - test_size))
        
        self.idx = {
            'train': np.arange(b1),
            'valid': np.arange(b1, b2),
            'test': np.arange(b2, n),
        }
    
    def _subset(self, name):
        """
            returns the (x, y) rows belonging to the split called name
        """
        rows = self.idx[name]
        return self.x.iloc[rows, :], self.y.iloc[rows]
    
    def get_train(self):
        """
            returns (x, y) for the training rows
        """
        return self._subset('train')
    
    def get_valid(self):
        """
            returns (x, y) for the validation rows
        """
        return self._subset('valid')
    
    def get_test(self):
        """
            returns (x, y) for the test rows
        """
        return self._subset('test')
    