In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import streamlit as st
from datetime import datetime

In [2]:
df_agg = pd.read_csv('Aggregated_Metrics_By_Video.csv').iloc[1:,:]
df_agg.columns = ['Video','Video title','Video publish time','Comments added','Shares','Dislikes','Likes',
                    'Subscribers lost','Subscribers gained','RPM(USD)','CPM(USD)','Average % viewed','Average view duration',
                    'Views','Watch time (hours)','Subscribers','Your estimated revenue (USD)','Impressions','Impressions ctr(%)']
df_agg['Video publish time'] = pd.to_datetime(df_agg['Video publish time'], format='mixed')  # Change format to "%b %d, %Y" (abbreviated month)
df_agg['Average view duration'] = df_agg['Average view duration'].apply(lambda x: datetime.strptime(x,'%H:%M:%S'))
df_agg['Avg_duration_sec'] = df_agg['Average view duration'].apply(lambda x: x.second + x.minute*60 + x.hour*3600)
df_agg['Engagement_ratio'] =  (df_agg['Comments added'] + df_agg['Shares'] +df_agg['Dislikes'] + df_agg['Likes']) /df_agg.Views
df_agg['Views / sub gained'] = df_agg['Views'] / df_agg['Subscribers gained']
df_agg.sort_values('Video publish time', ascending = False, inplace = True)    
df_agg_sub = pd.read_csv('Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')
df_comments = pd.read_csv('Aggregated_Metrics_By_Video.csv')
df_time = pd.read_csv('Video_Performance_Over_Time.csv')
df_time['Date'] = pd.to_datetime(df_time['Date'], format='mixed')

In [3]:
df_agg_diff = df_agg.copy()
metric_date_12mo = pd.to_datetime(df_agg_diff['Video publish time'].max() - pd.DateOffset(months=12))

median_agg = df_agg_diff[df_agg_diff['Video publish time'] >= metric_date_12mo]

In [4]:
df_agg_diff

Unnamed: 0,Video,Video title,Video publish time,Comments added,Shares,Dislikes,Likes,Subscribers lost,Subscribers gained,RPM(USD),...,Average view duration,Views,Watch time (hours),Subscribers,Your estimated revenue (USD),Impressions,Impressions ctr(%),Avg_duration_sec,Engagement_ratio,Views / sub gained
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,1900-01-01 00:02:38,4383,192.5779,4,16.549,65130,2.95,158,0.080995,243.500000
187,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,12,2,3,78,1,1,1.882,...,1900-01-01 00:00:38,2401,25.9375,0,1.720,25094,2.64,38,0.039567,2401.000000
64,rEWPqw6rMGI,The Only Data Science Explanation You Need,2022-01-10,62,141,5,722,28,136,5.971,...,1900-01-01 00:04:40,10277,801.5549,108,60.498,215491,2.22,280,0.090493,75.566176
59,o-wsyxWbPOw,We Need to Talk About The LinkedIn Machine Lea...,2022-01-03,65,36,12,592,10,78,5.321,...,1900-01-01 00:02:46,11808,545.6332,68,62.568,166915,3.32,166,0.059705,151.384615
32,xpIFS6jZbe8,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,109,767,53,4413,46,2553,6.836,...,1900-01-01 00:04:29,79283,5945.5420,2507,528.286,1420968,3.31,269,0.067379,31.054837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,RRSRKf9eQxc,Should You Get A Masters in Data Science?,2018-11-14,56,41,10,276,2,81,7.398,...,1900-01-01 00:02:32,18488,782.5937,79,136.708,173610,8.40,152,0.020716,228.246914
190,IFceyuL6GZY,How I Became A Data Scientist From a Business ...,2018-11-12,11,33,4,168,0,81,4.419,...,1900-01-01 00:03:57,5515,363.4858,81,24.358,58816,5.72,237,0.039166,68.086420
204,Y_SMU701qlA,Predicting Season Long NBA Wins Using Multiple...,2018-07-10,7,45,2,159,1,34,2.883,...,1900-01-01 00:02:25,6863,276.7257,33,19.772,53865,4.03,145,0.031036,201.852941
138,qfRhKHV8-t4,Predicting Crypto-Currency Price Using RNN lST...,2017-11-18,28,114,18,247,1,111,1.326,...,1900-01-01 00:01:45,16558,487.2194,110,21.944,168508,5.65,105,0.024580,149.171171


In [5]:
metric_date_12mo

Timestamp('2021-01-17 00:00:00')

In [6]:
median_agg

Unnamed: 0,Video,Video title,Video publish time,Comments added,Shares,Dislikes,Likes,Subscribers lost,Subscribers gained,RPM(USD),...,Average view duration,Views,Watch time (hours),Subscribers,Your estimated revenue (USD),Impressions,Impressions ctr(%),Avg_duration_sec,Engagement_ratio,Views / sub gained
111,0jTtHYie3CU,Should You Be Excited About Web 3? (As a Data ...,2022-01-17,37,43,8,267,14,18,4.055,...,1900-01-01 00:02:38,4383,192.5779,4,16.549,65130,2.95,158,0.080995,243.5
187,2RWwN5ZT4tA,Should @Luke Barousse Take This Data Analyst ...,2022-01-14,12,2,3,78,1,1,1.882,...,1900-01-01 00:00:38,2401,25.9375,0,1.72,25094,2.64,38,0.039567,2401.0
64,rEWPqw6rMGI,The Only Data Science Explanation You Need,2022-01-10,62,141,5,722,28,136,5.971,...,1900-01-01 00:04:40,10277,801.5549,108,60.498,215491,2.22,280,0.090493,75.566176
59,o-wsyxWbPOw,We Need to Talk About The LinkedIn Machine Lea...,2022-01-03,65,36,12,592,10,78,5.321,...,1900-01-01 00:02:46,11808,545.6332,68,62.568,166915,3.32,166,0.059705,151.384615
32,xpIFS6jZbe8,How I Would Learn Data Science in 2022 (If I H...,2021-12-27,109,767,53,4413,46,2553,6.836,...,1900-01-01 00:04:29,79283,5945.542,2507,528.286,1420968,3.31,269,0.067379,31.054837
132,-zbLpoJVBMI,What the Heck is WSL 2? (My New Favorite Tool),2021-12-17,29,39,6,237,20,27,3.611,...,1900-01-01 00:02:25,5004,202.1937,7,18.005,132645,1.81,145,0.06215,185.333333
191,scSc6YSanQ0,How Statistics Saved the US SERIOUS $$$$ Durin...,2021-12-10,10,20,5,212,0,2,1.947,...,1900-01-01 00:00:43,3130,37.4755,2,2.484,59027,1.61,43,0.078914,1565.0
62,FqNpDNmpcEo,How Zillow Lost $500 MILLION With Machine Lear...,2021-12-03,62,309,4,869,16,142,7.098,...,1900-01-01 00:05:06,16933,1440.0058,126,119.891,312137,2.63,306,0.073466,119.246479
61,vwvdtXMcNzI,Why Is Data Engineering So HOT Right Now?,2021-11-26,63,174,9,786,16,278,4.452,...,1900-01-01 00:03:04,17268,883.2341,262,76.731,299897,2.94,184,0.059764,62.115108
24,2qVWurPFwfc,Is Data Science Dying?,2021-11-19,148,408,56,2378,34,981,5.272,...,1900-01-01 00:03:12,70043,3736.819,947,368.53,1071989,3.78,192,0.042688,71.399592


In [7]:
median_agg.sum()

TypeError: 'DatetimeArray' with dtype datetime64[ns] does not support reduction 'sum'

In [26]:
median_agg['Likes'].median()

382.5

In [28]:
df_agg_diff = df_agg.copy()
metric_date_12mo = df_agg_diff['Video publish time'].max() - pd.DateOffset(months=12)
median_agg = df_agg_diff[df_agg_diff['Video publish time'] >= metric_date_12mo]

numeric_cols = np.array((df_agg_diff.dtypes == 'float64') | (df_agg_diff.dtypes == 'int64'))
numeric_cols


array([False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [29]:
median_agg.columns

Index(['Video', 'Video title', 'Video publish time', 'Comments added',
       'Shares', 'Dislikes', 'Likes', 'Subscribers lost', 'Subscribers gained',
       'RPM(USD)', 'CPM(USD)', 'Average % viewed', 'Average view duration',
       'Views', 'Watch time (hours)', 'Subscribers',
       'Your estimated revenue (USD)', 'Impressions', 'Impressions ctr(%)',
       'Avg_duration_sec', 'Engagement_ratio', 'Views / sub gained'],
      dtype='object')

In [30]:
median_agg.drop(columns=['Video', 'Video title'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  median_agg.drop(columns=['Video', 'Video title'], axis=1, inplace=True)


In [31]:
median_agg.median()

Video publish time              2021-07-06 00:00:00
Comments added                                 43.5
Shares                                         42.5
Dislikes                                        5.0
Likes                                         382.5
Subscribers lost                               13.0
Subscribers gained                             56.5
RPM(USD)                                       4.37
CPM(USD)                                     10.573
Average % viewed                             41.175
Average view duration           1900-01-01 00:02:46
Views                                        7417.0
Watch time (hours)                         279.9851
Subscribers                                    38.5
Your estimated revenue (USD)                24.7995
Impressions                                155102.5
Impressions ctr(%)                             2.43
Avg_duration_sec                              166.0
Engagement_ratio                           0.060285
Views / sub 

In [8]:
df_agg = pd.read_csv('Aggregated_Metrics_By_Video.csv').iloc[1:,:]
df_agg.columns = ['Video','Video title','Video publish time','Comments added','Shares','Dislikes','Likes',
                    'Subscribers lost','Subscribers gained','RPM(USD)','CPM(USD)','Average % viewed','Average view duration',
                    'Views','Watch time (hours)','Subscribers','Your estimated revenue (USD)','Impressions','Impressions ctr(%)']
df_agg['Video publish time'] = pd.to_datetime(df_agg['Video publish time'])
df_agg['Average view duration'] = df_agg['Average view duration'].apply(lambda x: datetime.strptime(x,'%H:%M:%S'))
df_agg['Avg_duration_sec'] = df_agg['Average view duration'].apply(lambda x: x.second + x.minute*60 + x.hour*3600)
df_agg['Engagement_ratio'] =  (df_agg['Comments added'] + df_agg['Shares'] + df_agg['Dislikes'] + df_agg['Likes']) / df_agg.Views
df_agg['Views / sub gained'] = df_agg['Views'] / df_agg['Subscribers gained']
df_agg.sort_values('Video publish time', ascending = False, inplace = True)    

df_agg_sub = pd.read_csv('Aggregated_Metrics_By_Country_And_Subscriber_Status.csv')
df_comments = pd.read_csv('Aggregated_Metrics_By_Video.csv')
df_time = pd.read_csv('Video_Performance_Over_Time.csv')
df_time['Date'] = pd.to_datetime(df_time['Date'])

ValueError: time data "Nov 12, 2020" doesn't match format "%B %d, %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.