# OTV Vimeo Video Data

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path

import pandas as pd # pandas
import numpy as np
import os
import re
from datetime import datetime, timedelta, date
from pathlib import Path
import plotly.express as px

In [2]:
def get_window_datetimes(file_name):
    file_name = file_name.replace("-", "_")
    re_string = r'\((.*?)\)' # regex string for finding window start and end dates

    match = re.findall(re_string, file_name)
    start = match[0].split("_")
    start_month = int(start[0])
    start_day = int(start[1])
    start_year = int(start[2])
    start_dt = datetime(month=start_month, day=start_day, year=start_year)
    
    end = match[1].split("_")
    end_month = int(end[0])
    end_day = int(end[1])
    end_year = int(end[2])
    end_dt = datetime(month=end_month, day=end_day, year=end_year)
    return start_dt, end_dt

# Loading Data

In [3]:
vimeo_video_dir_loc = Path('../data/OTV_DATA_05142020/OTV - Shared Data/Vimeo/Vimeo Video')

data_dir = vimeo_video_dir_loc.glob('*.xlsx')

video_dfs = []
for filepath in tqdm(data_dir):
    df = pd.read_excel(str(filepath), index_loc=0)
    start_dt, end_dt = get_window_datetimes(filepath.stem)
    df['start_dt'] = start_dt
    df['end_dt'] = end_dt
    df['month'] = start_dt.month
    df['year'] = start_dt.year
    df = df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
    df = df[df['name'].notna()]
    df = df[df['uri'].notna()]
    video_dfs.append(df)
    #print("{:%b %Y}".format(start_dt))
    
video_df = pd.concat(video_dfs)
video_df = video_df.reset_index()
video_df = video_df.drop(columns=['index'])


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [4]:
video_df.head()

Unnamed: 0,plays,downloads,loads,finishes,likes,comments,uri,name,duration,created_time,...,mean_percent,mean_seconds,sum_seconds,total_seconds,unique_viewers,Project ID,start_dt,end_dt,month,year
0,558,0.0,1067.0,24.0,0.0,0.0,/videos/143933380,Open TV Presents: Southern for Pussy,283.0,2015-10-28T22:38:55+00:00,...,38.0,79.0,44138.0,158183.0,372.0,SFP,2019-10-01,2019-10-31,10,2019
1,447,0.0,2372.0,86.0,2.0,0.0,/videos/203924325,Brown Girls -- Episode 1,410.0,2017-02-14T00:43:46+00:00,...,66.0,224.0,100490.0,183675.0,284.0,BG,2019-10-01,2019-10-31,10,2019
2,308,0.0,1669.0,37.0,0.0,0.0,/videos/189724238,Brown Girls -- Trailer,141.0,2016-11-01T01:03:06+00:00,...,50.0,47.0,14677.0,43687.0,201.0,BG,2019-10-01,2019-10-31,10,2019
3,259,0.0,595.0,54.0,1.0,0.0,/videos/204010747,Brown Girls -- Episode 2,874.0,2017-02-14T14:31:09+00:00,...,69.0,510.0,132291.0,226607.0,165.0,BG,2019-10-01,2019-10-31,10,2019
4,201,0.0,583.0,33.0,0.0,0.0,/videos/335757602,Just Call Me Ripley- Episode 1,365.0,2019-05-13T00:42:39+00:00,...,70.0,229.0,46054.0,73552.0,127.0,JCMR,2019-10-01,2019-10-31,10,2019


In [5]:
# strip white space from Project IDs
video_df['Project ID'] = video_df["Project ID"].apply(lambda x: x.strip())

# remove "Untitled" rows
video_df = video_df[video_df['name'] != 'Untitled']

In [52]:
# project codes
#project_codes_df = pd.read_csv('../resolved_project_codes_4-23-2020.csv', index_col=0)
project_codes_df = pd.read_csv('../project_codes_05152020.csv', index_col=0)
indeces = [ind.strip() for ind in project_codes_df.index] # some project IDs have whitespace characters that mess up the Phrase Matcher
project_codes_df.index = indeces

In [53]:
project_codes_df.head()

Unnamed: 0,Post Example,Match Keys,Formal Name,Is OTV project?,Notes
ANO,Kiam Marcelo Junio -- The Artists of Nupita Obama,Kiam Marcelo Junio,Nupita Obama Artists,PRESENTS,There will be two other videos for Erik Wallac...
AP,,The Algebra Project,The Algebra Project,FALSE,
AS,Afternoon Snatch -- Episode 1,Afternoon Snatch,Afternoon Snatch,ORIGINALS,
AV,Open TV Presents - ambivert by ester alegria,Ambivert,Ambivert,PRESENTS,
BF,\n,Brave Futures,Brave Futures,PRESENTS,Note this will include around 12 separate shor...


# Data Completion

In [8]:
years_df = video_df.groupby('year')

for year, df in years_df:
    print("%s: %s" % (year, len(df['month'].unique())))

2017: 12
2018: 12
2019: 12
2020: 1


# Split Project IDs with multiple IDs into two rows

In [9]:
split_rows = []
rows_to_drop = []

for ind, row in video_df.iterrows():
    if "|" in row['Project ID']:
        proj_id1, proj_id2 = row['Project ID'].split("|")
        row1 = row.copy()
        row2 = row.copy()
        row1['Project ID'] = proj_id1
        row2['Project ID'] = proj_id2
        split_rows.append(row1)
        split_rows.append(row2)
        rows_to_drop.append(ind)

In [10]:
video_df = video_df.drop(index=rows_to_drop).reset_index()
for row in split_rows:
    video_df.loc[len(video_df)] = row

In [11]:
video_df.tail()

Unnamed: 0,index,plays,downloads,loads,finishes,likes,comments,uri,name,duration,...,mean_percent,mean_seconds,sum_seconds,total_seconds,unique_viewers,Project ID,start_dt,end_dt,month,year
8635,,1,0.0,17.0,0.0,0.0,0.0,/videos/331153616,Right Swipe & FOBia on WCIU,252.0,...,30.0,77.0,77.0,252.0,1.0,FOB,2020-03-01,2020-03-31,3,2020
8636,,1,0.0,18.0,1.0,0.0,0.0,/videos/337911085,For Your Emmy Consideration: DAMAGED GOODS & T...,60.0,...,98.0,59.0,59.0,60.0,1.0,DG,2020-03-01,2020-03-31,3,2020
8637,,1,0.0,18.0,1.0,0.0,0.0,/videos/337911085,For Your Emmy Consideration: DAMAGED GOODS & T...,60.0,...,98.0,59.0,59.0,60.0,1.0,TT,2020-03-01,2020-03-31,3,2020
8638,,1,0.0,31.0,1.0,0.0,0.0,/videos/337911085,For Your Emmy Consideration: DAMAGED GOODS & T...,60.0,...,98.0,59.0,59.0,60.0,0.0,DG,2019-05-01,2019-05-31,5,2019
8639,,1,0.0,31.0,1.0,0.0,0.0,/videos/337911085,For Your Emmy Consideration: DAMAGED GOODS & T...,60.0,...,98.0,59.0,59.0,60.0,0.0,TT,2019-05-01,2019-05-31,5,2019


# Uncoded?

In [12]:
row_ids_not_handled = [ind for ind, row in video_df.iterrows() if row['Project ID'] not in project_codes_df.index]

In [13]:
not_handled_df = video_df.loc[row_ids_not_handled]
not_handled_df.head(30)[['name','month','year','Project ID']]

Unnamed: 0,name,month,year,Project ID
2184,OTV Unapologetic Trailer - Cycle 4 2019,4,2019,UN


In [14]:
# save list to share with Rey and Gabby
not_handled_df.to_csv('../data/vimeo_video_unhandled_project_ids.csv')

# Most Popular Series by year (views and loads)?
* exclude *Brown Girls* and *Brujos* which are easily most popular

In [15]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [16]:
project_codes_df.head()

Unnamed: 0,Post Example,Match Keys,Formal Name,Is OTV project?,Notes
ANO,Kiam Marcelo Junio -- The Artists of Nupita Obama,Kiam Marcelo Junio,Nupita Obama Artists,PRESENTS,There will be two other videos for Erik Wallac...
AP,,The Algebra Project,The Algebra Project,FALSE,
AS,Afternoon Snatch -- Episode 1,Afternoon Snatch,Afternoon Snatch,ORIGINALS,
AV,Open TV Presents - ambivert by ester alegria,Ambivert,Ambivert,PRESENTS,
BF,\n,Brave Futures,Brave Futures,PRESENTS,Note this will include around 12 separate shor...


In [17]:
project_codes_df.head()

Unnamed: 0,Post Example,Match Keys,Formal Name,Is OTV project?,Notes
ANO,Kiam Marcelo Junio -- The Artists of Nupita Obama,Kiam Marcelo Junio,Nupita Obama Artists,PRESENTS,There will be two other videos for Erik Wallac...
AP,,The Algebra Project,The Algebra Project,FALSE,
AS,Afternoon Snatch -- Episode 1,Afternoon Snatch,Afternoon Snatch,ORIGINALS,
AV,Open TV Presents - ambivert by ester alegria,Ambivert,Ambivert,PRESENTS,
BF,\n,Brave Futures,Brave Futures,PRESENTS,Note this will include around 12 separate shor...


In [18]:
project_code_dict = {ind:row['Formal Name'] for ind, row in project_codes_df.iterrows()}

In [41]:
def plot_engagement(metric, video_df, agg_func=np.sum, \
                    years_to_consider=[2017,2018,2019], filter_projects=['BG', 'B'],\
                    max_range=True, project_type=None):
    
    fig = make_subplots(rows=len(years_to_consider), cols=1)

    max_y = 0

    for i, year in enumerate(years_to_consider):
        video_year_df = video_df[video_df['year'] == year]
        metric_df = video_year_df.groupby('Project ID').agg({metric: agg_func})
        metric_df['Project ID'] = metric_df.index
        if filter_projects:
            metric_df = metric_df[~metric_df['Project ID'].isin(filter_projects)]
        
        metric_df = pd.merge(metric_df, project_codes_df[['Formal Name','Is OTV project?']], \
                                      left_index=True, right_index=True)
        
        metric_df = metric_df.sort_values(by=metric, ascending=False)
        
        if project_type:
            metric_df = metric_df[metric_df['Is OTV project?'] == project_type]
        
        project_names = [project_code_dict.get(proj_id, 'None') for proj_id in metric_df['Project ID']]
        
        trace= go.Bar(x=project_names, y=metric_df[metric], name=year, width=1)
        fig.add_trace(trace, row=i+1, col=1)

        if metric_df[metric].max() > max_y:
            max_y = metric_df[metric].max()

    for i, year in enumerate(years_to_consider):
        fig.update_yaxes(title_text=metric, row=i+1, col=1)
        
        if max_range:
            fig.update_yaxes(range=[0, max_y], row=i+1, col=1)
            
        fig.update_xaxes(tickangle=45, tickfont=dict(family='Rockwell', color='black', size=8),\
                         row=i+1, col=1)
    
    
    fig.update_layout(height=1200, width=800)
    return fig

In [43]:
fig = plot_engagement('plays', video_df, project_type='ORIGINALS')
fig.update_layout(title='Number of plays by year for Original OTV programming')
fig.show()

In [44]:
fig = plot_engagement('downloads', video_df, project_type='ORIGINALS')
fig.update_layout(title='Number of downloads by year for Original OTV programming')
fig.show()

In [45]:
fig = plot_engagement('finishes', video_df, project_type='ORIGINALS')
fig.update_layout(title='Number of Finished viewings by year for Original OTV programming')
fig.show()

In [47]:
fig = plot_engagement('mean_percent', video_df, project_type='ORIGINALS', agg_func=np.mean)
fig.update_layout(title='Avg. Percent of Videos Finished by year for Original OTV programming')
fig.show()

In [58]:
fig = plot_engagement('plays', video_df, project_type='PRESENTS')
fig.update_layout(title='Number of plays by year for OTV Presents')
fig.show()

In [55]:
fig = plot_engagement('downloads', video_df, project_type='PRESENTS')
fig.update_layout(title='Number of downloads by year for OTV Presents')
fig.show()

In [56]:
fig = plot_engagement('finishes', video_df, project_type='PRESENTS')
fig.update_layout(title='Number of Finished viewings by year for OTV Presents')
fig.show()

In [57]:
fig = plot_engagement('mean_percent', video_df, project_type='PRESENTS', agg_func=np.mean)
fig.update_layout(title='Avg. Percent of Videos Finished by year for OTV Presents')
fig.show()

In [62]:
fig = plot_engagement('plays', video_df, project_type='REPRESENTS')
fig.update_layout(title='Number of plays by year for OTV Re-Presents')
fig.show()

In [61]:
fig = plot_engagement('downloads', video_df, project_type='REPRESENTS')
fig.update_layout(title='Number of downloads by year for OTV Re-Presents')
fig.show()

In [60]:
fig = plot_engagement('finishes', video_df, project_type='REPRESENTS')
fig.update_layout(title='Number of Finished viewings by year for OTV Re-Presents')
fig.show()

In [59]:
fig = plot_engagement('mean_percent', video_df, project_type='REPRESENTS', agg_func=np.mean)
fig.update_layout(title='Avg. Percent of Videos Finished by year for OTV Re-Presents')
fig.show()