In [34]:
import json
import re  # Regular expressions

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# Read data

In [35]:
# Scraper output files to merge, in the order they are concatenated.
SCRAPER_FILES = [
    '../scraper-data-whayner/data.json',
    '../scraper-data-silvia/data.json',
    '../scraper-data-estefany/data.json',
    '../scraper-data-pedrofelipe/data.json',
    '../scraper-data-andres/data.json',
    '../scraper-data-pedroandres/data.json',
]


def _load_videos(path):
    """Read one scraper output file and return its 'videos' entry as a DataFrame.

    The file is opened in a `with` block so the handle is closed as soon as
    the JSON has been parsed (the handles were previously left open).
    """
    with open(path, encoding="utf8") as scraper_file:
        return pd.DataFrame(json.load(scraper_file)['videos'])


# Concatenate everything into a single dataframe; ignore_index=True renumbers
# the rows 0..n-1, which is what a separate reset_index(drop=True) would do.
complete_dataframe = pd.concat(
    [_load_videos(path) for path in SCRAPER_FILES],
    ignore_index=True,
)


# print(complete_dataframe.shape)

In [36]:
# complete_dataframe.head()

In [37]:
# complete_dataframe.tail()

In [38]:
# Snapshot of the merged data as loaded, written before any of the cleaning cells below run.
complete_dataframe.to_json(
    './initial.json',
    orient='table',
    index=False,
    force_ascii=False,
)

# Remove links 

In [39]:
# Strip every token that starts with "http" from the title and description.
#
# The whole column is reassigned at once instead of writing through chained
# indexing (frame[col][row] = value): chained assignment triggers
# SettingWithCopyWarning and is not guaranteed to modify the frame at all
# (it never does under pandas copy-on-write).
# The pattern has no ^/$ anchors, so re.MULTILINE made no difference and is
# not passed.
for text_column in ('title', 'description'):
    complete_dataframe[text_column] = complete_dataframe[text_column].str.replace(
        r'http\S+', '', regex=True
    )

# Remove unwanted characters (everything that is not alphanumeric, with a few punctuation exceptions)

In [40]:
# Patterns are raw strings ('\.' and '\-' in a normal string literal are
# invalid escape sequences) and are compiled once instead of on every row.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9ñÑáéíóúÁÉÍÓÚ.:&¿?/+$\- ]+')
_REPEATED_DOTS = re.compile(r'\.+')
_REPEATED_HYPHENS = re.compile(r'-+')
_REPEATED_SPACES = re.compile(r' +')


def _clean_text(text):
    """Return `text` reduced to the allowed characters, with repeats collapsed.

    Steps, in order:
      1. Drop every character outside the whitelist (letters, digits,
         Spanish accented vowels and ñ, space, and . : & ¿ ? / + $ -).
      2. Collapse runs of dots and runs of hyphens to a single character.
      3. Collapse runs of spaces and trim both ends.

    Underscores are not in the whitelist, so step 1 already removes all of
    them; no separate underscore-collapsing step is needed.
    """
    text = _DISALLOWED_CHARS.sub('', text)
    text = _REPEATED_DOTS.sub('.', text)
    text = _REPEATED_HYPHENS.sub('-', text)
    return _REPEATED_SPACES.sub(' ', text).strip()


# Reassign whole columns rather than writing through chained indexing
# (frame[col][row] = value), which triggers SettingWithCopyWarning and is not
# guaranteed to modify the frame.
for text_column in ('title', 'description'):
    complete_dataframe[text_column] = complete_dataframe[text_column].map(_clean_text)

# Remove entries whose language is not English, Spanish or Italian

In [41]:
accepted_languages = ['es', 'en', 'it']

# langdetect is non-deterministic unless seeded; fixing the seed before the
# first detection makes a rerun keep the same rows.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code langdetect reports for `text`, or None on failure."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (for example, it is
        # empty after cleaning). None is never an accepted language, so such
        # rows are dropped instead of aborting the whole cell.
        return None


# Text used for detection: description, title and tags of each video.
# The fields are joined with spaces so the last word of one field is not
# glued to the first word of the next.
# NOTE(review): assumes 'tags' holds strings, as the concatenation requires.
video_texts = (
    complete_dataframe['description'] + ' '
    + complete_dataframe['title'] + ' '
    + complete_dataframe['tags']
)
detected_languages = video_texts.map(_detect_language)

# Build the filtered copy in one step with a boolean mask instead of dropping
# rows one by one (each drop copied the whole frame). complete_dataframe is
# left untouched and the surviving rows keep their original index labels.
df = complete_dataframe[detected_languages.isin(accepted_languages)].copy()

In [42]:
# Shape of the merged dataset; the language filter works on a copy, so this frame keeps all rows.
complete_dataframe.shape

(12849, 5)

In [43]:
# Shape of the language-filtered copy, i.e. the data that is saved below.
df.shape

(12613, 5)

# Save as json

In [44]:
# Write the language-filtered dataframe to disk, keeping non-ASCII characters as they are.
df.to_json(
    './data.json',
    orient='table',
    index=False,
    force_ascii=False,
)