In [None]:
import json
import re # Regular expresions

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# Language codes that the stopword step handles directly; any other detected
# language goes through the Spanish + English fallback.
accepted_languages = [
    'es',
    'en',
    'it',
]

# Read data

In [None]:
def load_videos(json_path):
    """Read one scraper export and return its ``videos`` list as a DataFrame.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the JSON has been parsed (the previous version left every file open).

    Parameters
    ----------
    json_path : str
        Path to a scraper ``data.json`` file containing a top-level
        ``"videos"`` key.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``videos`` list.

    Raises
    ------
    FileNotFoundError
        If ``json_path`` does not exist.
    KeyError
        If the JSON document has no ``"videos"`` key.
    """
    with open(json_path, encoding="utf8") as json_file:
        return pd.DataFrame(json.load(json_file)['videos'])


# One DataFrame per scraper export
whayner_dataframe = load_videos('../scraper-data-whayner/data.json')
silvia_dataframe = load_videos('../scraper-data-silvia/data.json')
estefany_dataframe = load_videos('../scraper-data-estefany/data.json')
pedro_gomez_dataframe = load_videos('../scraper-data-pedrofelipe/data.json')
andres_dataframe = load_videos('../scraper-data-andres/data.json')
pedro_chaparro_dataframe = load_videos('../scraper-data-pedroandres/data.json')

# Concatenate into a single dataframe; ignore_index=True renumbers the rows
# 0..n-1, which is what the former reset_index(drop=True) call did.
complete_dataframe = pd.concat(
    [
        whayner_dataframe,
        silvia_dataframe,
        estefany_dataframe,
        pedro_gomez_dataframe,
        andres_dataframe,
        pedro_chaparro_dataframe,
    ],
    ignore_index=True,
)


# print(complete_dataframe.shape)

In [None]:
# complete_dataframe.head()

In [None]:
# complete_dataframe.tail()

In [None]:
#complete_dataframe.to_json('./initial.json', force_ascii=False, orient='table', index=False)

# Convert to lowercase letters

In [None]:
# Lowercase every text column in one vectorised pass.
#
# The whole column is assigned back instead of writing cell by cell with the
# chained form ``frame[column][row] = value``: chained assignment triggers
# SettingWithCopyWarning and no longer updates the frame when pandas
# Copy-on-Write is enabled.
for text_column in ('title', 'description', 'tags'):
    complete_dataframe[text_column] = complete_dataframe[text_column].str.lower()

# Remove duplicated entries

In [None]:
# Keep only the last occurrence of every URL, renumber the rows, and print the
# frame shape before and after so the number of dropped rows is visible.
print(complete_dataframe.shape)
complete_dataframe = complete_dataframe.drop_duplicates(
    subset=['url'],
    keep='last',
    ignore_index=True,
)
print(complete_dataframe.shape)

# Remove links 

In [None]:
# Strip links from titles and descriptions.
#
# Two passes, in the same order as before: first every run of non-whitespace
# characters starting at "http", then every run starting at "www".
# re.MULTILINE was dropped because neither pattern uses ^ or $, so the flag
# had no effect.
#
# Whole columns are assigned back rather than using the chained form
# ``frame[column][row] = value``, which raises SettingWithCopyWarning and stops
# updating the frame under pandas Copy-on-Write.
for text_column in ('title', 'description'):
    complete_dataframe[text_column] = (
        complete_dataframe[text_column]
        .str.replace(r'http\S+', '', regex=True)
        .str.replace(r'www\S+', '', regex=True)
    )

# Remove unwanted characters (everything that is not alphanumeric)

In [None]:
# Characters that survive the clean-up (besides the space): ASCII letters,
# digits, and the Spanish ñ / acute-accented vowels.
KEPT_CHARACTERS = 'a-zA-Z0-9ñÑáéíóúÁÉÍÓÚ'


def strip_unwanted_characters(text_series, extra_allowed=''):
    """Remove every character outside the allowed set from a text column.

    Steps, in order:
      1. Every run of whitespace (newlines and tabs included) becomes one
         space. Previously newlines and tabs were simply deleted, which glued
         the last word of one line to the first word of the next.
      2. Every character that is not in ``KEPT_CHARACTERS``, not in
         ``extra_allowed`` and not a space is deleted.
      3. Runs of spaces left behind by step 2 are collapsed and the ends are
         trimmed.

    Parameters
    ----------
    text_series : pandas.Series
        Column of strings to clean.
    extra_allowed : str, optional
        Additional characters to keep, written as they would appear inside a
        regex character class (for example ``','`` for the tags column).

    Returns
    -------
    pandas.Series
        Cleaned copy of ``text_series`` with the same index.
    """
    return (
        text_series
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(f'[^{KEPT_CHARACTERS}{extra_allowed} ]+', '', regex=True)
        .str.replace(' +', ' ', regex=True)
        .str.strip()
    )


# Whole columns are assigned back rather than using the chained form
# ``frame[column][row] = value``, which raises SettingWithCopyWarning and stops
# updating the frame under pandas Copy-on-Write.
complete_dataframe['title'] = strip_unwanted_characters(complete_dataframe['title'])
complete_dataframe['description'] = strip_unwanted_characters(complete_dataframe['description'])
# Commas are kept in tags because they separate the individual tags.
complete_dataframe['tags'] = strip_unwanted_characters(complete_dataframe['tags'], extra_allowed=',')

# Remove entries whose language is not Spanish, English or Italian (currently disabled)

In [None]:
# Working copy of the cleaned frame for the following steps.
# The language filter below is commented out, so ``df`` holds exactly the same
# rows as ``complete_dataframe`` at this point.
df = complete_dataframe.copy(deep=True)

# --- disabled: drop rows whose detected language is not in accepted_languages ---
# removed = []
# for dataframe_index in complete_dataframe.index:
#     video_texts = (complete_dataframe['description'][dataframe_index]
#                    + complete_dataframe['title'][dataframe_index]
#                    + complete_dataframe['tags'][dataframe_index])
#     language = detect(video_texts)
#     if language not in accepted_languages:
#         removed.append({'lang': language, 'url': complete_dataframe['url'][dataframe_index]})
#         df = df.drop(dataframe_index)
# print(removed)

In [None]:
#complete_dataframe.shape

In [None]:
#df.shape

# Save as json

In [None]:
#df.to_json('./data.json', force_ascii=False, orient='table', index=False)

# Count null values, delete videos with empty titles, and clear a fixed repeated tag list

In [None]:
# Uncomment to show every row when inspecting frames:
# pd.set_option('display.max_rows', None)

# Tag string that is blanked out at the end of this cell when it is the entire
# value of the tags column.
UNWANTED_TAGS = 'video, compartir, teléfono con cámara, teléfono con video, gratuito, subir'

# Start from a copy of the latest cleaned frame
update_df = df.copy()

# Row labels whose title is an empty string
empty_title_mask = update_df['title'] == ''
indexes = update_df.loc[empty_title_mask].index.tolist()

# Null count per column
print(update_df.isnull().sum())

print("shape before drop:", update_df.shape)
print("indexes with empty titles:", indexes)
update_df = update_df.drop(indexes, axis='index')
print("original shape:", complete_dataframe.shape, "updated shape:", update_df.shape)

# Sanity check: no empty title should be left after the drop
remaining_empty_titles = int((update_df['title'] == '').sum())
print("check if there is empty titles:", remaining_empty_titles)

# Renumber the rows 0..n-1 after the drop
update_df = update_df.reset_index(drop=True)

# Blank out the tags of rows whose tag string is exactly UNWANTED_TAGS
has_unwanted_tags = update_df['tags'] == UNWANTED_TAGS
update_df.loc[has_unwanted_tags, 'tags'] = ''

# Delete repeated tags

In [None]:
def dedupe_tags(tag_string):
    """Return a comma-separated tag string with duplicates and blanks removed.

    Each tag is stripped of surrounding whitespace. The first occurrence of
    every tag is kept and the original order is preserved, so the output is
    the same on every run. The previous ``set``-based version produced an
    arbitrary order that could change between kernel sessions.

    Parameters
    ----------
    tag_string : str
        Tags separated by commas, e.g. ``"a, b, a"``.

    Returns
    -------
    str
        Unique tags joined with ``", "``; an empty string when there are none.
    """
    stripped_tags = (tag.strip() for tag in tag_string.split(','))
    # dict.fromkeys de-duplicates while keeping insertion order
    unique_tags = dict.fromkeys(tag for tag in stripped_tags if tag)
    return ', '.join(unique_tags)


# Assign the whole column back rather than using the chained form
# ``frame[column][row] = value``, which raises SettingWithCopyWarning and stops
# updating the frame under pandas Copy-on-Write.
update_df['tags'] = update_df['tags'].map(dedupe_tags)

# Update data.json

In [None]:
# Write the cleaned frame to ./data.json in pandas "table" layout, without the
# row index and with non-ASCII characters written as-is.
update_df.to_json(
    path_or_buf='./data.json',
    orient='table',
    index=False,
    force_ascii=False,
)

# Remove stopwords in titles and descriptions

In [None]:
# Stopword lists already read from the NLTK corpus, keyed by corpus name, so
# each list is loaded at most once per kernel session.
_STOPWORD_CACHE = {}


def selecting_stopwords(detected_lang):
    """Return the stopword set to use for a detected language code.

    Parameters
    ----------
    detected_lang : str
        Language code: ``'es'`` selects the Spanish list, ``'en'`` and
        ``'it'`` select the English list.

    Returns
    -------
    set
        A fresh copy of the stopword set, safe for the caller to modify.
        An empty set is returned for any other language code; the previous
        version fell through and returned ``None``, which made the following
        ``word in ...`` membership test raise ``TypeError``.
    """
    # NOTE(review): 'it' is mapped to the English list, as in the original
    # code. NLTK also ships an "italian" stopword list - confirm whether
    # English is intentional here.
    corpus_by_lang = {'es': 'spanish', 'en': 'english', 'it': 'english'}

    corpus_name = corpus_by_lang.get(detected_lang)
    if corpus_name is None:
        return set()

    if corpus_name not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[corpus_name] = frozenset(stopwords.words(corpus_name))
    return set(_STOPWORD_CACHE[corpus_name])

def removing_stopwords(sentence, detected_lang):
    """Return ``sentence`` without the stopwords of ``detected_lang``.

    The sentence is split with ``nltk.word_tokenize`` and the remaining tokens
    are joined back with single spaces.

    The stopword set is fetched once per call. The previous version called
    ``selecting_stopwords`` inside the loop, i.e. once for every token, and
    collected the removed words in a list that was never used.

    Parameters
    ----------
    sentence : str
        Text to filter.
    detected_lang : str
        Language code passed on to ``selecting_stopwords``.

    Returns
    -------
    str
        The filtered text; an empty string when ``sentence`` is empty or
        contains only whitespace.
    """
    if not sentence or not sentence.strip():
        # Nothing to tokenize
        return ''

    # ``or set()`` keeps this working if no stopword list is available for
    # the language: nothing is removed in that case.
    stopword_set = selecting_stopwords(detected_lang) or set()
    kept_words = [
        word for word in nltk.word_tokenize(sentence)
        if word not in stopword_set
    ]
    return ' '.join(kept_words)

# langdetect is non-deterministic unless its seed is fixed; pin it so the same
# text is assigned the same language on every run.
DetectorFactory.seed = 0


def strip_stopwords(text):
    """Detect the language of ``text`` and remove the matching stopwords.

    * Language in ``accepted_languages``: that language's stopwords are removed.
    * Any other language: Spanish stopwords are removed first, then English.
    * Language cannot be detected (for example an empty description):
      ``text`` is returned unchanged.

    Only ``LangDetectException`` is caught. The previous bare ``except:`` also
    hid unrelated failures, such as a missing NLTK tokenizer resource, which
    silently left every row unprocessed.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        detected_lang = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features for detection
        return text

    if detected_lang in accepted_languages:
        return removing_stopwords(text, detected_lang)

    # Unsupported language: apply both lists, Spanish first and then English
    spanish_cleaned = removing_stopwords(text, 'es')
    return removing_stopwords(spanish_cleaned, 'en')


nostopwords = update_df.copy()

# Assign whole columns back rather than using the chained form
# ``frame[column][row] = value``, which raises SettingWithCopyWarning and stops
# updating the frame under pandas Copy-on-Write.
for column in ('title', 'description'):
    nostopwords[column] = nostopwords[column].map(strip_stopwords)

# Saving json without stopwords

In [None]:
nostopwords.to_json('./data_nostopwords2.json', force_ascii=False, orient='table', index=False)

In [None]:
# Number of distinct non-null URLs in the cleaned frame
update_df['url'].nunique()

In [None]:
# (rows, columns) of the cleaned frame
update_df.shape