In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below,
# so upcoming API changes will only become visible once they turn into errors.
warnings.filterwarnings('ignore')

# <span style="color:Maroon">Case Study: Usage of Data Science in Entertainment Industry.

#### <span style="color:Green">Select the optimal advertisement to display during the break of a TV show, based on what was recently shown on TV

## <span style="color:Maroon">Naive Classifier (Based on Sum of Independent Variables)

<span style="color:Green">A simple implementation of a naive classifier that follows the first conclusions we draw when looking at the data. Here we pick the independent variables that are strongly associated with the dependent variable and use them for prediction. Since all variables are binary and this is a binary prediction problem, predicting 1 whenever the sum of the selected independent variables is greater than 0 could be a reasonable first prediction.

<span style="color:Green">__NEVER USE THIS AS FINAL MODEL. THIS IS JUST FOR ILLUSTRATION PURPOSE__

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import os as os
from scipy.stats import chi2_contingency
# Fix the global NumPy seed so that any random step is reproducible
np.random.seed(0)
# pd.option_context(...) only takes effect inside a `with` block; called on its own it
# just builds a context manager and changes nothing. set_option applies the display
# settings (no row/column truncation) to the whole notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

<pandas._config.config.option_context at 0x2163a9b4788>

In [3]:
# Setting the working directory to the data folder.
# The starting directory is captured only once: re-running this cell would otherwise
# resolve "../Data" relative to the already-changed directory and fail.
if "direc" not in globals():
    direc = os.getcwd()
os.chdir(os.path.join(direc, "..", "Data"))

In [4]:
# Load the dataset into a pandas dataframe and replace missing values with 0
data = pd.read_csv("Dataset.csv").fillna(0)

In [5]:
# Name of the dependent variable (the label predicted in this notebook)
target = 'h_food_str'
# The remaining label columns. These labels will not be available in the future,
# so they are removed from the data before modeling.
dv_list = ['h_alcohol_str', 'h_hot_drink_str', 'h_phone_str']
data = data.drop(columns=dv_list)

In [6]:
# One hot encoding of 'show_name' variable.
# The dtype is pinned to uint8 so the dummy columns are 0/1 integers on every pandas
# version (pandas >= 2.0 returns booleans by default).
df1 = pd.get_dummies(data['show_name'], prefix='show_name', dtype=np.uint8)
data = data.join(df1)
data.head()

Unnamed: 0,show_name,time_offset,r_abies,r_abyssinian,r_accessories,r_accipiter,r_acorn,r_adapter,r_adorable,r_adventure,...,r_zoo,h_food_str,show_name_fresh_meat,show_name_friday_night_dinner,show_name_hollyoaks,show_name_made_in_chelsea,show_name_made_in_chelsea_la,show_name_my_mad_fat_diary,show_name_peep_show,show_name_the_inbetweeners
0,fresh_meat,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
1,fresh_meat,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
2,fresh_meat,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
3,fresh_meat,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
4,fresh_meat,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0


### <span style="color:Maroon">Train-Test Split
<span style="color:Green">Divide the data in 70:30 ratio for training the model and validating the model

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=[target]),
    data[target],
    test_size=0.30,
    random_state=12345,
)

In [9]:
# Flatten the labels to 1-D NumPy arrays. np.asarray(...) is used instead of calling
# Series.ravel() directly, which is deprecated in recent pandas releases.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [10]:
# Compare the mean of the dependent variable (in percent) between the two samples
train_target_pct = np.round(np.mean(y_train)*100,2)
test_target_pct = np.round(np.mean(y_test)*100,2)
print (f"Train Dependent variable: {train_target_pct}")
print (f"Test Dependent variable: {test_target_pct}")

Train Dependent variable: 8.83
Test Dependent variable: 8.85


<span style="color:Blue">__Comments:__ The means of dependent variable are very similar in test and train dataset

### <span style="color:Maroon">Part 2: Naive Model

##### <span style="color:Blue">Here we shall try to look at variables and pick important variables to predict the final outcome

In [11]:
# import base model libraries from sklearn
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.metrics import classification_report

In [12]:
# Work on a copy of the dataframe, with 'show_name' moved into the index
data_nv2 = data.copy().set_index('show_name')

In [13]:
# Round every value up to the nearest integer (np.ceil), column by column.
# Values in (0, 1] become 1 and 0 stays 0; values above 1 are NOT capped at 1.
# NOTE(review): presumably the r_* columns lie in [0, 1], so this binarizes them — confirm.
data_nv2 = data_nv2.apply(np.ceil)

##### <span style="color:Green">To pick important variables, we shall use concept of Information value and chi2 test of independence:
<span style="color:Green">$\;\;\;\;\;\;$Step 1: Calculate Information value for all independent variables w.r.t target variable
    
<span style="color:Green">$\;\;\;\;\;\;$Step 2: Do Chi2 test of independence for all independent variables w.r.t target variable
    
<span style="color:Green">$\;\;\;\;\;\;$Step 3: Pick the variables which have p-val <= 0.05 (we are able to reject the null hypothesis of independence)
                                                                            
<span style="color:Green">$\;\;\;\;\;\;$Step 4: From the shortlisted variables, pick the ones that fall within a normalized cumulative IV of 0.95 (the cut-off used in the code below)
    
<span style="color:Green">$\;\;\;\;\;\;$Step 5: Use these variables to predict the outcome

In [14]:
# Information Value function
def IV_calc(df, independent_var, target_var):
    """
    Compute the Information Value (IV) of one variable with respect to the target.

    Inputs:
                     df: dataset
        independent_var: Independent variable name (rows are grouped by its values)
             target_var: Dependent variable name (summed within each group)

    Output: the IV summed over all groups, floored at 0
    """
    by_level = df[[independent_var, target_var]].groupby(independent_var)

    # Per level: sum of the target ("Goods") and number of rows ("Total")
    goods = by_level.sum()
    goods.columns = ["Goods"]
    totals = by_level.count()
    totals.columns = ["Total"]

    iv_table = goods.join(totals, how='left')
    iv_table["Bads"] = iv_table["Total"] - iv_table["Goods"]

    iv_table["Perct_Goods"] = iv_table["Goods"] / iv_table["Goods"].sum()
    # Levels without any "Bads" get a fixed share of 0.01
    bads_share = iv_table["Bads"] / iv_table["Bads"].sum()
    iv_table["Perct_Bads"] = np.where(iv_table["Bads"] > 0, bads_share, 0.01)

    # Ratios that are not strictly positive are replaced by 1, so their WOE is log(1) = 0
    odds = iv_table["Perct_Goods"] / iv_table["Perct_Bads"]
    iv_table["WOE"] = np.log(np.where(odds > 0, odds, 1))
    iv_table["IV"] = (iv_table["Perct_Goods"] - iv_table["Perct_Bads"]) * iv_table["WOE"]

    return max(iv_table["IV"].sum(), 0)

In [15]:
# Chi square test of Independence
def Chi2_test_of_Independence(df, independent_var, target_var):
    """
    Return the p-value of a chi-square test of independence between two columns of df.

    Null hypothesis (H0): the two variables are independent.
        p-value <= alpha: significant result, reject H0 (infer the variables are dependent)
        p-value >  alpha: not significant, fail to reject H0
    For our case, we shall assume an alpha = 0.05
    """
    contingency = pd.crosstab(df[independent_var], df[target_var])
    # chi2_contingency yields (statistic, p-value, dof, expected); only the p-value is needed
    _, p_value, _, _ = chi2_contingency(contingency)
    return p_value

In [16]:
# Every column of data_nv2 except the target is treated as an independent variable
columns = data_nv2.columns.tolist()
columns.remove(target)

In [17]:
# Pre-allocate one placeholder per variable for the p-values and the Information Values
p_val = [None for _ in columns]
IV_val = [None for _ in columns]

In [18]:
# Run the chi-square test of independence and compute the Information Value
# for every independent variable, filling the pre-allocated lists in place
for idx, var_name in enumerate(columns):
    p_val[idx] = Chi2_test_of_Independence(data_nv2, var_name, target)
    IV_val[idx] = IV_calc(data_nv2, var_name, target)

In [19]:
# Collect the per-variable results in a pandas dataframe for the filtering steps below
DF_IV_CHI2 = pd.DataFrame({"variables": columns, "iv": IV_val, "p_val": p_val})
DF_IV_CHI2.head()

Unnamed: 0,variables,iv,p_val
0,time_offset,0.6333471,1.0
1,r_abies,0.0006396791,0.574729
2,r_abyssinian,1.753496e-08,0.420564
3,r_accessories,0.002340507,0.074956
4,r_accipiter,4.383595e-09,0.146953


In [20]:
# Keep only the variables with p-val <= 0.05 (null hypothesis of independence rejected)
DF_IV_CHI2_CP = DF_IV_CHI2.copy().loc[lambda frame: frame["p_val"] <= 0.05]

In [21]:
# Rank by IV (largest first), accumulate the IV normalized by its total,
# and keep the variables whose cumulative share is at most 0.95
DF_IV_CHI2_CP = DF_IV_CHI2_CP.sort_values("iv", ascending=False)
DF_IV_CHI2_CP["Cum_IV"] = DF_IV_CHI2_CP["iv"].cumsum().div(DF_IV_CHI2_CP["iv"].sum())
top_vars = DF_IV_CHI2_CP.loc[DF_IV_CHI2_CP["Cum_IV"] <= 0.95, "variables"].tolist()

In [22]:
# Print the shortlisted variable names (ordered by descending IV)
print(top_vars)

['show_name_fresh_meat', 'r_cafeteria', 'r_meal', 'r_dinner', 'r_supper', 'r_food', 'r_restaurant', 'r_food_court', 'r_cafe', 'r_plate', 'r_dish', 'show_name_friday_night_dinner', 'show_name_made_in_chelsea_la', 'r_glass', 'show_name_the_inbetweeners', 'r_hot_pot', 'r_beverage', 'show_name_hollyoaks', 'r_drink', 'r_eating', 'r_car', 'r_lamp', 'r_lampshade', 'r_lighting', 'r_automobile', 'r_breakfast', 'r_swimwear', 'r_downtown', 'r_beard', 'r_wine', 'r_urban', 'r_goblet', 'r_city', 'r_pub', 'r_bar_counter', 'r_alcohol', 'r_bikini', 'r_poster', 'r_lunch', 'r_dirt_road', 'r_gravel', 'r_living_room', 'r_flyer', 'r_building', 'r_table_lamp', 'r_platter', 'r_table', 'r_couch', 'r_brochure', 'r_cup', 'r_apartment', 'r_shelf', 'r_light', 'r_flower', 'r_tablecloth', 'r_conifer', 'r_linen', 'show_name_made_in_chelsea', 'r_field', 'show_name_my_mad_fat_diary', 'r_grass', 'r_dining_table', 'r_vehicle', 'r_underwear', 'r_grassland', 'r_interior_design', 'r_lingerie', 'r_yew', 'r_bra', 'r_blossom',

In [23]:
# Hand-picked, food-related subset of the variables used by the sum-based classifier.
# NOTE(review): 7 of the 8 show_name_* dummy columns are listed here (all except
# show_name_peep_show), so every row from those shows has at least one feature equal to 1.
food_related_vars = [
    'show_name_fresh_meat', 'r_cafeteria', 'r_meal', 'r_dinner',
    'r_supper', 'r_food', 'r_restaurant', 'r_food_court',
    'r_cafe', 'r_plate', 'r_dish', 'show_name_friday_night_dinner',
    'show_name_made_in_chelsea_la', 'r_glass', 'show_name_the_inbetweeners', 'r_hot_pot',
    'r_beverage', 'show_name_hollyoaks', 'r_drink', 'r_eating',
    'r_breakfast', 'r_beard', 'r_wine', 'r_goblet',
    'r_pub', 'r_bar_counter', 'r_alcohol', 'r_lunch',
    'r_platter', 'r_cup', 'r_apartment', 'show_name_made_in_chelsea',
    'show_name_my_mad_fat_diary', 'r_dining_table', 'r_cookie', 'r_kitchen',
    'r_pantry', 'r_glasses', 'r_oven', 'r_wine_glass',
]

<span style="color:Blue">__Comment:__ From the top variables, select only those that make sense from a business understanding (i.e. only food-related variables). Since the independent variables used in this model are themselves the outcome of a predictive model, this is a compound model in which the error of the first model gets compounded in the second. Hence, we shall use business logic to identify the variables which could plausibly be related to food

In [24]:
class Sum_Classifier(BaseEstimator, ClassifierMixin):
    """
    Rule-based binary classifier: a row is labelled 1 when the sum of its
    features is greater than 0, otherwise 0. Nothing is learned from the data.
    """

    def __init__(self):
        # The classifier has no hyper-parameters, so there is nothing to store.
        pass

    def fit(self, X=None, y=None):
        """No-op; accepts X and y but ignores them and returns the estimator itself."""
        return self

    def predict(self, X):
        """
        Return an array holding 1 for every row of X whose values sum to more
        than 0 and 0 for every other row.
        """
        y = np.where(X.sum(axis=1)>0, 1, 0)
        return y

In [25]:
# Fit the sum-based classifier on the food-related training features
# (its fit() does not learn anything and simply returns the estimator)
sc = Sum_Classifier()
sc.fit(X=X_train[food_related_vars], y=y_train)

Sum_Classifier()

In [26]:
# Predicting the class for the training dataset
y_train_class_pred = sc.predict(X_train[food_related_vars])

In [27]:
# Performance for training sample:
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
print("Performance Metrics for Training Sample")
print(classification_report(y_train, y_train_class_pred))

# Actual and predicted labels side by side, cross-tabulated into a confusion matrix
tmp = pd.DataFrame({"Actual": y_train, "Predicted": y_train_class_pred})
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("\nConfusion Matrix\n")
print(pd.crosstab(tmp["Actual"], tmp["Predicted"]))

+++++++++++++++++++++++++++++++++++++++++++++++++++++

Performance Metrics for Training Sample
              precision    recall  f1-score   support

           0       1.00      0.07      0.12     10573
           1       0.09      1.00      0.17      1024

    accuracy                           0.15     11597
   macro avg       0.55      0.53      0.15     11597
weighted avg       0.92      0.15      0.13     11597

+++++++++++++++++++++++++++++++++++++++++++++++++++++

Confusion Matrix

Predicted    0     1
Actual              
0          694  9879
1            0  1024


<span style="color:Blue">__Comments:__ For Class == 1 the model has a recall of 1.00, but its precision is only 0.09 and the overall accuracy is only 0.15. Hence, this is not a very good model

In [28]:
# Predicting the class for the test dataset
y_test_class_pred = sc.predict(X_test[food_related_vars])

In [29]:
# Performance for test sample:
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
print("Performance Metrics for Test Sample")
print(classification_report(y_test, y_test_class_pred))

# Actual and predicted labels side by side, cross-tabulated into a confusion matrix
tmp = pd.DataFrame({"Actual": y_test, "Predicted": y_test_class_pred})
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("\nConfusion Matrix\n")
print(pd.crosstab(tmp["Actual"], tmp["Predicted"]))

+++++++++++++++++++++++++++++++++++++++++++++++++++++

Performance Metrics for Test Sample
              precision    recall  f1-score   support

           0       1.00      0.07      0.13      4531
           1       0.09      1.00      0.17       440

    accuracy                           0.15      4971
   macro avg       0.55      0.53      0.15      4971
weighted avg       0.92      0.15      0.13      4971

+++++++++++++++++++++++++++++++++++++++++++++++++++++

Confusion Matrix

Predicted    0     1
Actual              
0          315  4216
1            0   440


<span style="color:Blue">__Comments:__ For Class == 1 the model has a recall of 1.00, but its precision is only 0.09 and the overall accuracy is only 0.15, which is very low. Hence, this is not a very good model