In [1]:
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk



In [2]:
class DataCleaner:
    def __init__(self, df):
        self.df = df
        self.column_scope = ['comment_count', 'dislike_count', 'like_count', 'view_count', 'licensed_content',
                             'duration_sec', 'video_category_label', 'video_title', 'video_description', 'published_at',
                             'definition', 'duration']
    def clean_desc(self, desc):
        desc = desc.split('#BBC')[0]
        desc = re.split(r' https://bbc\.in\w*', desc)[-1]
        return desc
        
    def clean_licensed(self, lin):
        # T/F instead of 1/nan
        try:
            int(lin)
            return True
        except ValueError:
            if type(lin) is float:
                return False

    def parse_duration(self, duration_str):
        duration_str = duration_str[2:]
        total_seconds = 0
        if 'H' in duration_str:
            hours, duration_str = duration_str.split('H')
            total_seconds += int(hours) * 3600
        if 'M' in duration_str:
            minutes, duration_str = duration_str.split('M')
            total_seconds += int(minutes) * 60
        if 'S' in duration_str:
            seconds = duration_str.rstrip('S')
            total_seconds += int(seconds)
        return total_seconds
        
    def clean(self):
        # Get only valid columns
        self.df = self.df[self.column_scope]
        # Clean video desc
        self.df['video_description_cleaned'] = self.df['video_description'].apply(cleaner.clean_desc)
        # Change licensced content format to bool
        self.df['licensed_content'] = self.df['licensed_content'].apply(cleaner.clean_licensed)
        # Change published at type to datetime
        self.df['published_at'] = self.df['published_at'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
        # Parsing duration
        self.df['duration'] = self.df['duration'].apply(self.parse_duration)
        # DF description
        
        return self.df


In [3]:
class BBC:
    def __init__(self, df):
        self.df = df
        nltk.download('stopwords')
        nltk.download('punkt')
        self.stop_words = set(stopwords.words('english'))
        self.stop_words = self.stop_words.union({'bbc', 'two', 'one', 'three',
                              'part', 'series', 'episode', 'preview', 'show'})

    def data_types(self):
        for column in list(self.df.columns):
            print(f'column name: {column},\n column values types: {self.df[column].apply(type).unique()},\n'
                  f' representative value: {self.df[column].unique()[1]}\n###################\n')

    def date_check(self):
        newest_date, oldest_date = self.df['published_at'].max().year, self.df['published_at'].min().year
        print(f'newest article (YYYY): {newest_date},'
              f'oldest article (YYYY): {oldest_date}')
        self.df['published_parsed'] = self.df['published_at'].dt.year

    def top_categories(self):
        top_occurrences = self.df['video_category_label'].value_counts().head(5)
        colors = ['#004c6d', '#00587a', '#00668e', '#0074a2',  '#0083b6',
                  '#0091c9', '#00a0dd', '#00aee1', '#00bcf4', '#00cbff']
        plt.figure(figsize=(8, 8))
        patches, texts, autotexts = plt.pie(top_occurrences, labels=top_occurrences.index, autopct='%1.1f%%', colors=colors)
        for autotext in autotexts:
            autotext.set_color('white')
        plt.title('Top 5 Occurrences')
        plt.show()

    def clean_title(self):
        def _clean_title(title):
            title = title.translate(str.maketrans('', '', string.punctuation))
            title = re.sub(r'\d+', '', title)
            tokens = word_tokenize(title)
            tokens = [word.lower() for word in tokens if word.lower() not in self.stop_words]
            return tokens
        self.df['video_title_clean'] = self.df['video_title'].apply(_clean_title)

    def top_title_keywords(self):
        _df = self.df
        grouped = _df.groupby('published_parsed')['video_title_clean'].sum()
        keyword_counts = grouped.apply(Counter)
        top_keywords_by_year = keyword_counts.apply(lambda x: [word for word, _ in x.most_common(5)])
        print(top_keywords_by_year)
        top_keywords_by_year = keyword_counts.apply(lambda x: [word for word, _ in x.most_common(2)])
        keyword_counts = top_keywords_by_year.explode().groupby(level=0).value_counts().unstack(fill_value=0)
        keyword_counts.plot(kind='bar', stacked=True, figsize=(10, 6))
        plt.title('Top 2 Keywords in Titles Each Year')
        plt.xlabel('Year')
        plt.ylabel('Number of Occurrences')
        plt.legend(title='Keywords')
        plt.xticks(rotation=45)
        plt.show()
        del _df


    def engagement_rate(self):
        self.df['engagement_rate'] = ((self.df['like_count'] + self.df['comment_count'] + self.df['dislike_count']) / self.df['view_count']) * 100
        self.df['engagement_rate'] = self.df['engagement_rate'].round(1)

    def title_len(self):
        self.df['title_len'] = self.df['video_title_clean'].apply(lambda x: sum(len(word) for word in x))

    def dichotomized_engagement(self):
        median_rate = self.df['engagement_rate'].median()
        self.df['dichotomized_engagement'] = (self.df['engagement_rate'] >= median_rate).astype(int)

    def encode2numeric(self, *args):
        for column in args:
            self.df[column], _ = pd.factorize(self.df[column])

    def visualise_correlations(self):
        correlation_matrix = self.df[
            ['definition', 'duration', 'dichotomized_engagement', 'published_parsed', 'engagement_rate', 'title_len',
             'video_category_label']].corr()
        
        plt.figure(figsize=(10, 8))
        plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
        plt.colorbar()
        plt.title('Correlation Matrix of BBC YouTube Videos Metadata')
        plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=45)
        plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
        plt.show()

        plt.figure(figsize=(12, 8))
        for i, col in enumerate(['duration', 'engagement_rate', 'title_len']):
            plt.subplot(2, 2, i + 1)
            plt.scatter(self.df[col], self.df['dichotomized_engagement'], alpha=0.5)
            plt.xlabel(col)
            plt.ylabel('Dichotomized Score')
            plt.title(f'Scatter plot: Dichotomized Score vs {col}')
        plt.tight_layout()
        plt.show()

    def run(self):
        self.data_types()
        self.date_check()
        self.top_categories()
        
        self.clean_title()
        self.top_title_keywords()
        self.engagement_rate()
        self.title_len()
        self.dichotomized_engagement()
        self.encode2numeric('video_category_label', 'definition')
        self.visualise_correlations()
        self.df.drop(columns=['published_at', 'licensed_content', 'duration'], inplace=True)
        # IMO 'licensed_content' and 'duration' may bring valuable correlations while analyzing the dataset
        # ... but im dropping them since thats the task (:
        

In [6]:
df = pd.read_csv('bbc.csv')

print('###########\n', df.head(), '\n')  # Display the first few rows of the DataFrame
print('###########\n', df.info(), '\n')   # Display information about the DataFrame
print('###########\n', df.describe(), '\n')
cleaner = DataCleaner(df)
cleaned_df = cleaner.clean()
bbc = BBC(cleaned_df)
bbc.run()

###########
    position                channel_id channel_title     video_id  \
0         1  UCCj956IF62FbT7Gouszaj9w           BBC  8qH0pGdjB_U   
1         2  UCCj956IF62FbT7Gouszaj9w           BBC  lqeS-rOoBSw   
2         3  UCCj956IF62FbT7Gouszaj9w           BBC  JMfkBavl1ks   
3         4  UCCj956IF62FbT7Gouszaj9w           BBC  T_6RRmkLOSs   
4         5  UCCj956IF62FbT7Gouszaj9w           BBC  3-mayD_9Yg8   

           published_at                                        video_title  \
0  2020-08-13T15:00:02Z  Colin Robinson's Origins of the Species - What...   
1  2020-08-13T14:30:04Z  Maisie Smith and Zack Morris on EastEnders' la...   
2  2020-08-13T05:50:21Z  A-level results to arrive in year with no exam...   
3  2020-08-12T13:00:13Z  8 signs you're in survival mode and how to sta...   
4  2020-08-12T11:00:02Z  The secret Heathrow lounge that costs £2700 ju...   

                                   video_description  video_category_id  \
0  Subscribe and 🔔 to OFFICIAL BBC

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['video_description_cleaned'] = self.df['video_description'].apply(cleaner.clean_desc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['licensed_content'] = self.df['licensed_content'].apply(cleaner.clean_licensed)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['published_at']

<class 'LookupError'>: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/pyodide/nltk_data'
    - '/nltk_data'
    - '/share/nltk_data'
    - '/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [9]:
# The correlation graphs show that the most egagement is coming from short videos, but the really long ones also grab attentions. worst
# While title len and category doesnt seem to play a role in a bigger time scope, and we can safely say that people dont care about definition. 
# Most engagement falls within close range from video to video with few excpetions which egagement is "breaking the celling"

In [None]:
# Better correlation could be derived from analyzing description