In [293]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import os
from datetime import datetime, timedelta, date
import numpy as np

### Load Project Codes csv
Download the Project Codes spreadsheet on google as a `.csv` file and save it to the parent directory of this notebook

In [294]:
proj_codes_df = pd.read_csv('../project_codes.csv')
proj_codes_df = proj_codes_df.drop(columns=['name'])
proj_codes_df.head()

Unnamed: 0,Project Code,Formal Name,Still Active\n
0,ANO,Kiam Marcelo Junio,
1,AP,The Algebra Project,
2,AS,Afternoon Snatch,
3,AV,Ambivert,
4,BF,Brave Futures,


In [295]:
proj_codes_df.index=proj_codes_df['Project Code'] # sets the index as the project code

# Load the data we'll use for testing

In [296]:
def clean_df(df):
    df['project_id'] = df['Project ID'].fillna(df['Project Code'])
    df['project_id'] = df['project_id'].fillna(df['Unnamed: 0'])
    return df

In [297]:
def get_window_datetimes(file_name):
    file_name = file_name.replace("-", "_")
    re_string = r'\((.*?)\)' # regex string for finding window start and end dates

    match = re.findall(re_string, file_name)
    start = match[0].split("_")
    start_month = int(start[0])
    start_day = int(start[1])
    start_year = int(start[2])
    start_dt = datetime(month=start_month, day=start_day, year=start_year)
    
    end = match[1].split("_")
    end_month = int(end[0])
    end_day = int(end[1])
    end_year = int(end[2])
    end_dt = datetime(month=end_month, day=end_day, year=end_year)
    return start_dt, end_dt

In [298]:
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [299]:
# define locations of the files we want to use
data_dir = '../data/Spreadsheets_2019/'
time_window_dirs = os.listdir(data_dir) # << a list of the file names in the directory

In [306]:
vimeo_device_dfs = []
vimeo_region_dfs = []
vimeo_video_dfs = []
vimeo_date_dfs = []
vimeo_source_dfs = []

for window_dir in time_window_dirs:
    file_names = os.listdir(data_dir+window_dir)
    for file_name in file_names:
        start_dt, end_dt = get_window_datetimes(file_name)
        if 'Vimeo_Device' in file_name:
            vimeo_device_df = pd.read_csv(data_dir+window_dir+"/"+file_name)
            vimeo_device_df['start'] = start_dt
            vimeo_device_df['end'] = end_dt
            vimeo_device_dfs.append(vimeo_device_df)
        if 'Vimeo_Region' in file_name:
            vimeo_region_df = pd.read_csv(data_dir+window_dir+"/"+file_name)
            vimeo_region_df['start'] = start_dt
            vimeo_region_df['end'] = end_dt
            vimeo_region_dfs.append(vimeo_region_df)
        if 'Vimeo_Video' in file_name:
            video_df = pd.read_csv(data_dir+window_dir+"/"+file_name)
            video_df.columns = ['Project ID'] + video_df.columns[1:].to_list() # make the project id column name consistent
            video_df.plays = video_df.plays.replace('\r\n', np.nan)
            video_df = video_df[~video_df.isnull().all(axis=1)] # remove all completely null rows
            video_df['start'] = start_dt
            video_df['end'] = end_dt
            vimeo_video_dfs.append(video_df)
        if 'Vimeo_Date' in file_name:
            vimeo_date_df=pd.read_csv(data_dir+window_dir+"/"+file_name)
            vimeo_date_df['start'] = start_dt
            vimeo_date_df['end'] = end_dt
            vimeo_date_dfs.append(vimeo_date_df)
        if 'Vimeo_Source' in file_name:
            vimeo_source_df = pd.read_csv(data_dir+window_dir+"/"+file_name)
            vimeo_source_df['start'] = start_dt
            vimeo_source_df['end'] = end_dt
            vimeo_source_dfs.append(pd.read_csv(data_dir+window_dir+"/"+file_name))
            
vimeo_device_df = pd.concat(vimeo_device_dfs, axis=0, sort=False)
vimeo_region_df = pd.concat(vimeo_region_dfs, axis=0, sort=False)
vimeo_video_df = pd.concat(vimeo_video_dfs, axis=0, sort=False)
vimeo_date_df = pd.concat(vimeo_date_dfs, axis=0, sort=False)
vimeo_source_df = pd.concat(vimeo_source_dfs, axis=0, sort=False)

# conver `Project ID` "2QIK" to "TQIK" for now

In [307]:
vimeo_video_df['Project ID'] = vimeo_video_df['Project ID'].replace(to_replace={'2QIK':'TQIK'})

# Simple Code Estimator
So far this seems to be working best even though it is the most simple

In [311]:
def estimate_proj_code(video_name):
    potential_codes = []
    for ind, row in proj_codes_df.iterrows():
        code, proj = row['Project Code'], row['Formal Name']
        if proj.lower() in video_name.lower():
            potential_codes.append(code)
    return potential_codes

In [312]:
estimated_codes = vimeo_video_df['name'].apply(estimate_proj_code)

In [313]:
x = len(estimated_codes) - np.sum(len([1 for i in estimated_codes if len(i) > 0]))
print("Estimated %s codes out of %s" % (x, len(vimeo_video_df)))

Estimated 455 codes out of 3019


In [130]:
# TODO: Need to have a consistent way of double checking these against the project codes in the dataframe

# [Spacy PhraseMatcher](https://spacy.io/api/phrasematcher)
[Install Spacy](https://spacy.io/usage)

In [405]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher
phrase_matcher = PhraseMatcher(nlp.vocab)

In [406]:
def name_to_doc(name):
    return nlp(name.lower().translate(str.maketrans('', '', string.punctuation)))

In [407]:
for code, row in tqdm(proj_codes_df.iterrows(), total=len(proj_codes_df)):
    names = [name_to_doc(n) for n in row['Formal Name'].split('|')]
    print(names)
    phrase_matcher.add(code, None, *names)

HBox(children=(FloatProgress(value=0.0, max=86.0), HTML(value='')))

[kiam marcelo junio]
[the algebra project]
[afternoon snatch]
[ambivert]
[brave futures]
[brujos]
[bronx cunt tour]
[brown girls]
[black melodies]
[brand new boy]
[borderd]
[bsayf by roy kinsey]
[been there]
[code]
[the conspiracy theorist]
[damaged goods]
[darling shear]
[for better]
[filipino fusions]
[fame]
[full out]
[fobia]
[freaky phyllis]
[the furies]
[fck stan]
[futurewomen]
[fck yes]
[granny ballers
]
[the haven]
[the hoodoisie]
[hookups]
[i  love me]
[damaged goods]
[good enough]
[open tv]
[geetas guide to moving on]
[the hoodosie]
[hair story]
[hook ups]
[it goes unsaid]
[inertia]
[in real life]
[just call me ripley]
[kickin it]
[kings and queens]
[kissing walls
]
[lipstick city]
[let go and let god]
[low strung]
[michaela angela davis]
[movement matters]
[melody set me free]
[night night]
[nupita obama creates vogua]
[outtakes]
[on the verge]
[prep4love ,  p4l ,  dr every woman ,  one little pill
]
[project basho]
[pay day]
[public relations]
[philadelphia voices of pride]


In [408]:
# convert all the video titles to spacy objects
video_titles = []

for name in tqdm(vimeo_video_df['name'], total=len(vimeo_video_df)):
    video_titles.append(name_to_doc(name))

HBox(children=(FloatProgress(value=0.0, max=3019.0), HTML(value='')))




In [409]:
# get the matches for all video titles
matches = [phrase_matcher(vt) for vt in video_titles]

In [410]:
best_codes = []
multi_codes = []

for ind, match in tqdm(enumerate(matches), total=len(matches)):
    if len(match) == 1:
        match_id = match[0][0]
        proj_code = nlp.vocab.strings[match_id]
        best_codes.append(proj_code)
    elif len(match) > 1:
        match_ids = [m[0] for m in match]
        codes = [nlp.vocab.strings[match_id] for match_id in match_ids]
        if codes[0] == 'GEN':
            best_codes.append(codes[1])
        else:
            best_codes.append(None)
        multi_codes.append((ind, [nlp.vocab.strings[match_id] for match_id in match_ids]))
    else:
        best_codes.append(None)
    
codes = pd.Series(codes)
    

HBox(children=(FloatProgress(value=0.0, max=3019.0), HTML(value='')))




In [412]:
compare_rows = []
for ind, estimated_codes in tqdm(enumerate(best_codes), total=len(best_codes)):
    actual_code = vimeo_video_df['Project ID'].iloc[ind]
    title = vimeo_video_df['name'].iloc[ind]
    row = {'estimated_code':estimated_codes,
           'actual_code': actual_code,
           'matched': estimated_codes == actual_code,
           'title': title}
    compare_rows.append(row)
compare_df = pd.DataFrame(compare_rows)

HBox(children=(FloatProgress(value=0.0, max=3019.0), HTML(value='')))




In [416]:
compare_df['matched'].sum() / len(compare_df)

0.792315336204041

In [417]:
wrong_estimate_df = compare_df[compare_df['matched'] == False]

In [419]:
len(wrong_estimate_df)

627

In [418]:
wrong_estimate_df.head(14)

Unnamed: 0,estimated_code,actual_code,matched,title
12,,GEN,False,OTV Super Trailer - Cycle 4 2019
45,,GEN,False,OTV Post-Roll
57,GE,GEN,False,GOOD ENOUGH SEASON ONE _OFFICIAL TRAILER
62,GEN,KW,False,Open TV Re-Presents: Kissing Walls -- Pilot
63,,KW,False,Kissing Walls S02 EP1
67,,DG,False,Damaged Goods - Episode 1
79,,NOCV,False,#NupitaObama Creates Vogua Premiere: Wicker Park
81,,DG,False,Damaged Goods - Episode 2
82,,KW,False,Kissing Walls S02 EP6
85,,KW,False,Kissing Walls S02 EP4
