In [1]:
import warnings
warnings.filterwarnings('ignore')

# <span style="color:Maroon">Case Study: Usage of Data Science in Entertainment Industry.

#### <span style="color:Green">Select optimal advetisement to display in break time for a TV Show, based on what was recently shown in the TV

## <span style="color:Maroon">Explorotary Data Analysis

<span style="color:Green">There are multiple ways to optimize advertisement. Broadly, it can be based on:

<span style="color:Green">$\;\;\;\;\;\;$1. Based on show context: what is show about, and what was recently shown on TV
    
<span style="color:Green">$\;\;\;\;\;\;$2. Based on target audience demographics

## <span style="color:Maroon">Problem Statement:
###### <span style="color:Green">The goal of this test is to build a product that identifies when the Output Labels are present, e.g. a moment in time where hot drinks were present on the screen, based on the Recognition confidence scores.

###### <span style="color:Green">The file is a CSV sorted by show_name and time_offset. The most important columns are:

<span style="color:Green">$\;\;\;\;\;\;$o show_name is the name of the television program, for example Fresh Meat or Hollyoaks. 
    
<span style="color:Green">$\;\;\;\;\;\;$o time_offset is the timestamp, in seconds since the beginning of the episode.
    
<span style="color:Green">$\;\;\;\;\;\;$o r_abies to r_zoo are the Input Features produced by AWS Rekognition, for the screenshot at that time_offset. 
    
<span style="color:Green">$\;\;\;\;\;\;$o h_alcohol_str to h_phone_str are the Output Labels that you’re trying to predict

<span style="color:Green">__Pick a single Output Label (like Food or Hot Drinks) and build a model in a Jupyter Python notebook that tackles it.__
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

## <span style="color:Maroon">Objective: Build a classification model to predict where food was present (dependent variable = 'h_food_str')

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import os as os
np.random.seed(0)
pd.option_context('display.max_rows', None, 'display.max_columns', None)

<pandas._config.config.option_context at 0x220698c3d88>

In [3]:
# Setting the working directory to the folder
direc = os.getcwd()
os.chdir("..//Data//")

In [4]:
# Read the data in pandas dataframe
data = pd.read_csv("Dataset.csv")
data = data.fillna(0)

### <span style="color:Maroon">EDA: Exploratory Data Analysis

In [5]:
data.describe()

Unnamed: 0,time_offset,r_abies,r_abyssinian,r_accessories,r_accipiter,r_acorn,r_adapter,r_adorable,r_adventure,r_aerial_view,...,r_yak,r_yard,r_yew,r_zebra,r_zebra_crossing,r_zoo,h_alcohol_str,h_food_str,h_hot_drink_str,h_phone_str
count,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,...,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0,16568.0
mean,1154.162723,0.000338,6.3e-05,0.016379,3.6e-05,3.2e-05,0.000479,0.002098,5e-05,0.001141,...,0.0,0.001586,0.006647,7.8e-05,8.3e-05,0.000105,0.199239,0.088363,0.040983,0.057219
std,769.734098,0.013128,0.005775,0.092024,0.004661,0.004112,0.017332,0.03443,0.006475,0.028809,...,0.0,0.028863,0.06115,0.007127,0.00772,0.007856,0.399441,0.283831,0.198256,0.232267
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,518.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1036.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1724.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2884.0,0.516965,0.527265,0.837948,0.599949,0.529311,0.847798,0.86148,0.833427,0.971708,...,0.0,0.638517,0.939912,0.663115,0.81752,0.657645,1.0,1.0,1.0,1.0


In [6]:
# Our dependent variable is 'h_food_str'. Analyze the effect of show type on dependent variable
target = 'h_food_str'
data[target].describe()

count    16568.000000
mean         0.088363
std          0.283831
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: h_food_str, dtype: float64

<span style="color:Blue">__Comments:__The response rate = 8.84%. Even a naive classifier which always predict zero, will have an accuracy of 91.16%, but with recall=0. Hence, our objective for the task is to try to build a classifier with better recall, while ensuring accuracy.

In [7]:
# Other dependent variables (These should be dropped from our dataset)
dv_list = ['h_alcohol_str', 'h_hot_drink_str', 'h_phone_str']

<span style="color:Blue">__Comments:__In future, these labels won't be available to us (dv_list). Hence, for our modeling purpose, these variables are being dropped from the data.

In [8]:
# drop the dv_list from the data
data = data.drop(dv_list, axis=1)

In [9]:
# Cross tab of "show_name" with dependent variable
pd.crosstab(data['show_name'], data[target])

h_food_str,0,1
show_name,Unnamed: 1_level_1,Unnamed: 2_level_1
fresh_meat,1609,850
friday_night_dinner,1067,371
hollyoaks,1378,22
made_in_chelsea,2724,160
made_in_chelsea_la,2761,52
my_mad_fat_diary,2879,0
peep_show,1384,0
the_inbetweeners,1302,9


__Comments:__ <span style="color:Blue">Chi2 test of independence to see if this is a relevant variable

 ##### <span style="color:Maroon">Chi2 Test of Independence
    """
    Null hypothesis: Variables are independent
    If p-value <= alpha: significant result, reject null hypothesis (H0) {or infer that variables are dependent}.
    If p-value > alpha: not significant result, fail to reject null hypothesis(H0)
    For our case, we shall assume an alpha = 0.05
    """

In [10]:
from scipy.stats import chi2_contingency
crosstab = pd.crosstab(data['show_name'],data[target])
stat, p, dof, ex = chi2_contingency(crosstab)
print (p)

0.0


<span style="color:Blue">__Comments:__ The p-value is 0.0. Thus we can reject the Null-Hypothesis that the variables are independent. This implies, that the type of show is a big influence on whether food items are common occurance or not. This also follows from the Empirical evidence.

In [11]:
# One hot encoding of 'show_name' variable
df1 = pd.get_dummies(data['show_name'], prefix = 'show_name')
data = data.join(df1)
data.head()

Unnamed: 0,show_name,time_offset,r_abies,r_abyssinian,r_accessories,r_accipiter,r_acorn,r_adapter,r_adorable,r_adventure,...,r_zoo,h_food_str,show_name_fresh_meat,show_name_friday_night_dinner,show_name_hollyoaks,show_name_made_in_chelsea,show_name_made_in_chelsea_la,show_name_my_mad_fat_diary,show_name_peep_show,show_name_the_inbetweeners
0,fresh_meat,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
1,fresh_meat,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
2,fresh_meat,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
3,fresh_meat,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
4,fresh_meat,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
