# Sentiment analysis notebook

To run, scroll to the section at the bottom, edit the parameters and then run the entire notebook.

### Importing packages and defining prerequisites

In [None]:
# Importing packages
import numpy as np
import pandas as pd
import string
from typing import List, Dict, Set
import matplotlib.pyplot as plt
import seaborn; seaborn.set
from pandas.plotting import register_matplotlib_converters; register_matplotlib_converters()
import urllib
import os
import csv
import time
import math
import pytz
from sklearn.linear_model import LinearRegression
import matplotlib.ticker as mtick
from progress.bar import IncrementalBar
import time, sys
import datetime
from IPython.display import clear_output

# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import os

### Reading scraped data into a DataFrame, and tagging brands

In [None]:
# Function to read the scraped YT data in to a nicely formatted DataFrame
def return_data(directory):
    """Read every ``.csv`` file found under ``directory`` into one DataFrame.

    The CSVs are read positionally: column 0 is used as the video id, column 4
    as the author, column 5 as the date, and columns 7 onwards are joined with
    single spaces to rebuild the comment text (presumably the comment was split
    across columns by unquoted commas -- confirm against the scraper output).

    Parameters
    ----------
    directory : str
        Folder to search. Sub-folders are searched as well (``os.walk``).

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (timezone-naive datetime), ``vid_id``, ``author`` and
        ``comment``; duplicates removed, sorted by ``date``, index reset.
        Empty (with the same columns) when no usable rows are found.
    """
    columns = ['date', 'vid_id', 'author', 'comment']
    frames = []

    for root, dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith('.csv'):
                continue

            # Join with `root`, not `directory`: os.walk also yields files that
            # live in sub-folders, and those only exist relative to `root`.
            # newline='' lets the csv module handle embedded newlines itself.
            with open(os.path.join(root, filename), 'r', newline='') as f:
                raw = pd.DataFrame(list(csv.reader(f))).fillna('')

            n_cols = raw.shape[1]
            if n_cols < 8:
                # Empty file, or too few columns to hold a date and a comment.
                continue

            # The below needs changing - the CSVs proved difficult to deal with so this is a bodged fix
            raw['Comment'] = raw[list(range(7, n_cols))].apply(' '.join, axis=1)

            # Keep only rows whose date field starts with '2' (e.g. a 2xxx year);
            # this also discards any header row.
            dated = raw[raw[5].str.startswith('2')]

            # NOTE(review): the original code additionally drops the first dated
            # row of every file. That is preserved here, but if that row is a
            # genuine comment (rather than, say, a video-metadata row) it is
            # being lost -- confirm against the scraper's CSV layout.
            kept = dated.iloc[1:]
            if kept.empty:
                continue

            frames.append(pd.DataFrame({
                'date': pd.to_datetime(kept[5]).dt.tz_localize(None),
                'comment': kept['Comment'],
                'vid_id': kept[0],
                'author': kept[4],
            }))

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    if frames:
        df = pd.concat(frames)[columns]
    else:
        df = pd.DataFrame(columns=columns)

    df = df.drop_duplicates().sort_values(by='date').reset_index(drop=True)
    return df

# Function to collect data from all channels in to one DataFrame
# folder_loc should be a folder containing a folder for each channel, inside of which should be the scraped comments
def tagged_comm_all_channels(folder_loc, start_date, end_date, brands=None):
    """Collect the comments of every channel folder and tag brand mentions.

    Every directory found while walking ``folder_loc`` (except ones named
    ``'YouTube comments'``) is passed to ``return_data`` and treated as one
    channel; the channel name is the folder name with underscores replaced by
    spaces. The combined comments are restricted to the requested date range,
    tagged, written to ``comments_{start_date}_to_{end_date}.csv`` in the
    current working directory, and returned.

    Parameters
    ----------
    folder_loc : str
        Folder containing one folder per channel.
    start_date, end_date : tuple
        Arguments for ``datetime.datetime`` (e.g. ``(2020, 2, 1)``). Both bounds
        are compared inclusively; note that a ``(year, month, day)`` end date
        means midnight at the start of that day.
    brands : iterable of str, optional
        Brand names to look for. A brand is tagged when it occurs anywhere in
        the comment as a case-insensitive substring.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``channel``, ``vid_id``, ``author``, ``comment``,
        ``comment_length`` and ``tags`` (matching brands joined by ``', '``,
        empty string when none match), sorted by ``date``.
    """
    brands = list(brands) if brands is not None else []

    print('Collecting comments in to a dataframe and tagging brands', '\n')
    t_0 = time.time()

    channel_frames = []
    for root, dirs, files in os.walk(folder_loc):
        for name in dirs:
            if name == 'YouTube comments':
                continue
            channel_path = os.path.join(root, name)
            print(f'Starting: {channel_path}')
            channel_df = return_data(channel_path)
            channel_df['channel'] = name.replace('_', ' ')
            if not channel_df.empty:
                channel_frames.append(channel_df)
            print('Folder complete \n')

    # Concatenate once; DataFrame.append no longer exists in pandas >= 2.0.
    if channel_frames:
        output_df = pd.concat(channel_frames, sort=False)
    else:
        output_df = pd.DataFrame(columns=['date', 'vid_id', 'author', 'comment', 'channel'])
    # Guarantees a datetime dtype even when no comments were found at all.
    output_df['date'] = pd.to_datetime(output_df['date'])

    output_df['comment_length'] = [len(c) for c in output_df['comment']]
    output_df = output_df[['date', 'channel', 'vid_id', 'author', 'comment', 'comment_length']]

    # pd.datetime was removed in pandas 2.0; it was an alias of datetime.datetime.
    range_start = datetime.datetime(*start_date)
    range_end = datetime.datetime(*end_date)
    in_range = (output_df['date'] >= range_start) & (output_df['date'] <= range_end)
    # .copy() so the column added below is written to an independent frame, not a slice.
    output_df = output_df[in_range].copy()

    # Lower-case each brand once rather than once per comment.
    lowered_brands = [(brand, brand.lower()) for brand in brands]
    tags = []
    for comment in output_df['comment']:
        comment_lower = comment.lower()
        tags.append(', '.join(brand for brand, needle in lowered_brands if needle in comment_lower))
    output_df['tags'] = tags

    output_df.to_csv(f'comments_{start_date}_to_{end_date}.csv', index=False)

    print(f'Done after {round((time.time() - t_0), 2)} seconds.')

    return output_df.sort_values(by='date')


### Sentiment analysis with the Google NLP API

In [None]:
# Returns the shared Google NLP client, creating it the first time it is needed.
def _get_language_client():
    """Return one ``LanguageServiceClient`` that is reused across calls.

    The client is stored as an attribute on this function, so re-running the
    cell that defines it discards the cached client.
    """
    client = getattr(_get_language_client, '_client', None)
    if client is None:
        client = language.LanguageServiceClient()
        _get_language_client._client = client
    return client


# Function to classify the sentiment of a string. Outputs the score and magnitude, unless print_results is True.
def classify(string, print_results=False):
    """Run Google NLP sentiment analysis on a piece of plain text.

    Parameters
    ----------
    string : str
        Text to analyse; sent to the API as a ``PLAIN_TEXT`` document.
    print_results : bool, optional
        When True the text and its sentiment are printed and nothing is
        returned.

    Returns
    -------
    tuple or None
        ``(score, magnitude)`` of the document sentiment when ``print_results``
        is False, otherwise ``None``.
    """
    document = types.Document(content=string, type=enums.Document.Type.PLAIN_TEXT)

    # Reuse a single client: building a new one for every comment is wasted work
    # when thousands of comments are classified in a loop.
    client = _get_language_client()

    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document=document).document_sentiment

    # Outputs the results
    if print_results:
        print(f'Text: {string}')
        print(f'Sentiment: {sentiment.score}, {sentiment.magnitude} \n')
        return None
    return (sentiment.score, sentiment.magnitude)
        
        
# Prints a duration given in seconds as hours, minutes and seconds
def display_time_passed(secs):
    """Print ``secs`` split into whole hours, whole minutes and leftover seconds."""
    seconds_per_hour = 60 ** 2
    hours, within_hour = divmod(secs, seconds_per_hour)
    minutes = within_hour // 60
    seconds = secs % 60
    print(f"{hours} hours {minutes} minutes {seconds} seconds")
    
    
# Turns a date tuple such as (2020, 2, 1) into a string such as '2020-02-01'
def to_date_string(date_tuple):
    """Join the parts of ``date_tuple`` with '-', zero-padding each part to at least two characters."""
    return '-'.join(str(part).zfill(2) for part in date_tuple)


# Progress bar function
def update_progress(time_start, comments_done, comments_total, string=""):
    """Clear the cell output and print a text progress bar.

    Parameters
    ----------
    time_start : float
        ``time.time()`` value taken when the work started; used for the
        elapsed-time line.
    comments_done : int
        Number of comments processed so far.
    comments_total : int
        Total number of comments to process. A total of 0 is shown as complete.
    string : str, optional
        Header line printed above the bar.
    """
    bar_length = 30

    if comments_total == 0:
        # Nothing to do counts as finished; also avoids dividing by zero.
        progress = 1.0
    else:
        # Clamp to [0, 1] so the bar can neither underflow nor overflow.
        progress = min(max(comments_done / comments_total, 0.0), 1.0)

    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(string)
    print(text)
    print(f"Comments analysed: {comments_done} of {comments_total}")
    print(f"Time elapsed: {round(time.time() - time_start,1)} seconds")
    
    
# Function to update a dataframe with sentiment score and magnitude.
def sentiment_analysis_df(df, save_loc, debug=False, start_date=None, end_date=None):
    """Score the sentiment of every brand-tagged comment and save the results.

    Rows of ``df`` whose ``tags`` value is not the empty string are sent one by
    one to ``classify``. Partial results are written to
    ``temp_save_file.csv`` in ``save_loc`` every 30 comments, and the finished
    table is written to ``comments_sentiment_<start>_to_<end>.csv`` in the same
    folder. Comments whose classification raises keep a score and magnitude of
    0 and are listed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``comment`` and ``tags`` columns.
    save_loc : str
        Folder for the temporary and final CSV files; created if missing.
    debug : bool, optional
        When True, print one plain line per comment instead of redrawing the
        progress bar.
    start_date, end_date : tuple, optional
        Date tuples used only to build the output file name. When omitted, the
        notebook-level variables of the same names are used.

    Returns
    -------
    pandas.DataFrame
        The tagged rows with ``sentiment_score`` and ``sentiment_magnitude``
        columns added.

    Raises
    ------
    NameError
        If a date is neither passed in nor defined at notebook level. Raised
        before any comment is classified.
    """
    print('\n\n',  'Analysing sentiment of tagged channels', '\n')
    time_start = time.time()
    initial_string = f"Start time: {datetime.datetime.fromtimestamp(time_start).strftime('%Y-%m-%d %H:%M:%S')}"
    print(initial_string)
    error_dict = {}

    # Resolve the dates up front so a missing value fails before any API call
    # is made, rather than after the whole analysis has run.
    notebook_globals = globals()
    if start_date is None:
        start_date = notebook_globals.get('start_date')
    if end_date is None:
        end_date = notebook_globals.get('end_date')
    if start_date is None or end_date is None:
        raise NameError("start_date and end_date must be passed in or defined before calling sentiment_analysis_df")

    if save_loc:
        os.makedirs(save_loc, exist_ok=True)
    temp_path = os.path.join(save_loc, "temp_save_file.csv")
    final_path = os.path.join(save_loc, f"comments_sentiment_{to_date_string(start_date)}_to_{to_date_string(end_date)}.csv")

    # Work on an independent copy: writing into a slice of `df` is unreliable
    # (SettingWithCopyWarning, and a silent no-op under copy-on-write).
    df_new = df[df['tags']!=''].copy()
    df_new['sentiment_score'] = 0.0
    df_new['sentiment_magnitude'] = 0.0
    score_col = df_new.columns.get_loc('sentiment_score')
    magnitude_col = df_new.columns.get_loc('sentiment_magnitude')

    tagged_comments_length = len(df_new)
    save_every = 30

    for i in range(tagged_comments_length):
        try:
            score, magnitude = classify(df_new['comment'].iloc[i])
            # Positional writes: the index may contain duplicate labels.
            df_new.iat[i, score_col] = score
            df_new.iat[i, magnitude_col] = magnitude
        except Exception as err:
            # Best effort: remember the failure and carry on with the next comment.
            error_dict[i] = err

        # i is zero-based, so i + 1 comments have been handled at this point.
        comments_done = i + 1
        if not debug:
            update_progress(time_start, comments_done, tagged_comments_length, initial_string)
        else:
            print(f'Finished {comments_done} of {tagged_comments_length}')

        if i % save_every == 0:
            df_new.to_csv(temp_path)

    # Skipped when there is nothing to report, so an empty run cannot divide by zero.
    if not debug and tagged_comments_length > 0:
        update_progress(time_start, tagged_comments_length, tagged_comments_length, initial_string)

    df_new.to_csv(temp_path)
    df_new.to_csv(final_path, index=False)

    print("\nCompleted", "with errors:" if error_dict else "")
    for key, val in error_dict.items():
        print(f"- Error on comment {key}: {val}")

    return df_new


### Run
Edit the parameters below before running.

In [None]:
# Path to the Google Cloud credentials file. This must be changed before running.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "CREDENTIALS.json"

# (year, month, day) bounds between which comments will be analysed
start_date = (2020, 2, 1)
end_date = (2020, 4, 30)

# Folder holding one sub-folder per channel, each containing that channel's comment .csv files
parent_folder_loc = 'COMMENTS LOCATION HERE'

# Brands to tag in the comments
brands_feb_to_apr = [
    'Amp Human',
    'Bell',
    'Bentonville',
    'Douchebags',
    'On Running',
    'Polar',
    'Pole',
    'Quarq',
    'Shimano',
    'Zipp',
    'SRAM',
]

# Folder where intermediate and final results are saved, useful if the script crashes mid-way through analysis
comments_save_location = 'SAVE LOCATION HERE'

In [None]:
# Collect every channel's comments, tag the brands and write the tagged-comments csv
comments_df_new = tagged_comm_all_channels(
    folder_loc=parent_folder_loc,
    start_date=start_date,
    end_date=end_date,
    brands=brands_feb_to_apr,
)

In [None]:
# Score the sentiment of the brand-tagged comments (debug=True prints one line per comment)
sentiment_analysis_df(
    df=comments_df_new,
    save_loc=comments_save_location,
    debug=True,
)