In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### 1. Use TLD for selection of relevant tables

**table_statistics** files contain the hosts of the datasets. Only tables with possibly english domains are considered.

In [1]:
!pip install fasttext



In [2]:
import pandas as pd
import os
import fasttext
import progressbar
import json
import gzip
import shutil
fasttext.FastText.eprint = lambda x: None # avoid Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.


In [5]:
path_parent = os.path.dirname(os.getcwd())

In [5]:
data_path = os.path.join(path_parent, 'content/drive/My Drive/1_Studium/2_Master/3_Sem/Team Project/Data/')

In [3]:
data_path = ('/Users/estelleweinstock/Google Drive/1_Studium/2_Master/3_Sem/Team Project/Data/')

In [4]:
def remove_irrelevant_tlds():
    """
    moves all files with valid tlds to a new path called "cleaned"
    :return:
    """
    files = [file for file in os.listdir(top_100_path) if file.endswith('.json.gz')]

    valid_tld = ['.com', '.net', '.org', '.uk']
    valid_files = []
    file_valid = 'false'

    for file in files:
        #rint(file)
        file_valid = 'false'
        for tld in valid_tld:
            if tld in file:
                file_valid = 'true'
                #print('true')
        if file_valid == 'true':
            valid_files.append(file)
            # copy only files with valid tlds to cleaned path
        #    shutil.copy(os.path.join(top_100_path, '{}'.format(file)), cleaned_top_100_path, follow_symlinks=True)
        else:
          os.remove(os.path.join(top_100_path, '{}'.format(file)))
          #print(file, 'removed')

    print(valid_files)
    print('Number of files:', len(files))
    print('Number of valid files:',len(valid_files))


### 2. Apply language detection algorithm to rows of the tables

In [5]:
def remove_with_fasttext():
    """
    reads all files from cleaned data path and removes non-english products from the data tables
    :return:
    """
    
    #pretrained_fasttext_path = '/content/drive/My Drive/1_Studium/2_Master/3_Sem/Team Project/Code/lid.176.bin'
    pretrained_fasttext_path ='/Users/estelleweinstock/Google Drive/1_Studium/2_Master/3_Sem/Team Project/Code/lid.176.bin'
    
    model = fasttext.load_model(pretrained_fasttext_path)

    # list all top 100 product files
    files = [file for file in os.listdir(top_100_path) if file.endswith('.json.gz')]

    for file in files:
        #print(file)
        df = pd.read_json(os.path.join(top_100_path, '{}'.format(file)), compression='gzip', lines=True)

        #iterrate over rows and save row_ids of english products
        english_products = []
        count = 0
        with progressbar.ProgressBar(max_value=df.shape[0]) as bar:
            for i in range(df.shape[0]):  # iterate over rows
                row_id = df['row_id'][i]
                for j in range(df.shape[1]): # iterate over columns
                    if df.columns[j].lower() == 'brand': # exclude brand column
                        cell_pred = 'None'
                    else:
                        cell = df.iat[i,j]
                        if type(cell) == str:
                            cell_pred = model.predict([cell])[0]
                        elif type(cell) == list:
                            if type(cell[0]) == str:
                                cell_pred = model.predict(cell)[0]
                            else:
                                cell_pred = 'None'
                        else:
                            cell_pred = 'None'
                    if cell_pred == 'None' or cell_pred == [['__label__en']]:
                        english = 'True'
                    else:
                        english = 'False'
                        break
                if english == 'True':
                    english_products.append(row_id)
                count += 1
                bar.update(count)

        # design new dataframe with english products only
        df_cleaned = df[df['row_id'].isin(english_products)]

        # write to gzip compressed json file
        df_cleaned.to_json(os.path.join(top_100_path, '{}'.format(file)), compression='gzip', orient='records', lines=True)


In [10]:
# this works for not colab
def remove_with_fasttext():
    """
    reads all files from cleaned data path and removes non-english products from the data tables
    :return:
    """
    
    #pretrained_fasttext_path = '/content/drive/My Drive/1_Studium/2_Master/3_Sem/Team Project/Code/lid.176.bin'
    pretrained_fasttext_path ='/Users/estelleweinstock/Google Drive/1_Studium/2_Master/3_Sem/Team Project/Code/lid.176.bin'
    
    model = fasttext.load_model(pretrained_fasttext_path)

    # list all top 100 product files
    files = [file for file in os.listdir(top_100_path) if file.endswith('.json.gz')]

    for file in files:
        #print(file)
        df = pd.read_json(os.path.join(top_100_path, '{}'.format(file)), compression='gzip', lines=True)

        #iterrate over rows and save row_ids of english products
        english_products = []
        count = 0
        #with progressbar.ProgressBar(max_value=df.shape[0]) as bar:
        try:
            for i in range(df.shape[0]):  # iterate over rows
                row_id = df['row_id'][i]
                for j in range(df.shape[1]): # iterate over columns
                    if df.columns[j].lower() == 'brand': # exclude brand column
                        cell_pred = 'None'
                    else:
                        cell = df.iat[i,j]
                        if type(cell) == str:
                            cell_pred = model.predict([cell])[0]
                        elif type(cell) == list:
                            if type(cell[0]) == str:
                                cell_pred = model.predict(cell)[0]
                            else:
                                cell_pred = 'None'
                        else:
                            cell_pred = 'None'
                    if cell_pred == 'None' or cell_pred == [['__label__en']]:
                        english = 'True'
                    else:
                        english = 'False'
                        break
                if english == 'True':
                    english_products.append(row_id)
                count += 1
                    #bar.update(count)

            # design new dataframe with english products only
            df_cleaned = df[df['row_id'].isin(english_products)]

            # write to gzip compressed json file
            df_cleaned.to_json(os.path.join(top_100_path, '{}'.format(file)), compression='gzip', orient='records', lines=True)
        except:
            pass

In [None]:
#Radio
#top_100_path = os.path.join(data_path, 'RadioStation/RadioStation_top100')
#top_100_path = os.path.join(data_path, 'RadioStation/RadioStation_minimum3')
#top_100_path = os.path.join(data_path, 'RadioStation/RadioStation_rest')

In [None]:
# LandmarksOrHistoricalBuildings
#top_100_path = os.path.join(data_path, 'LandmarksOrHistoricalBuildings/LandmarksOrHistoricalBuildings_top100')
#top_100_path = os.path.join(data_path, 'LandmarksOrHistoricalBuildings/LandmarksOrHistoricalBuildings_minimum3')
#top_100_path = os.path.join(data_path, 'LandmarksOrHistoricalBuildings/LandmarksOrHistoricalBuildings_rest')

In [None]:
# Library
#top_100_path = os.path.join(data_path, 'Library/Library_top100')
#top_100_path = os.path.join(data_path, 'Library/Library_minimum3')
#top_100_path = os.path.join(data_path, 'Library/Library_rest')

In [None]:
# SportsTeam
#top_100_path = os.path.join(data_path, 'SportsTeam/SportsTeam_top100')
#top_100_path = os.path.join(data_path, 'SportsTeam/SportsTeam_minimum3')
#top_100_path = os.path.join(data_path, 'SportsTeam/SportsTeam_rest')

In [None]:
# GovernmentOrganization
#top_100_path = os.path.join(data_path, 'GovernmentOrganization/GovernmentOrganization_top100')
#top_100_path = os.path.join(data_path, 'GovernmentOrganization/GovernmentOrganization_minimum3')
#top_100_path = os.path.join(data_path, 'GovernmentOrganization/GovernmentOrganization_rest')

In [None]:
# Dataset
#top_100_path = os.path.join(data_path, 'Dataset/Dataset_top100')
#top_100_path = os.path.join(data_path, 'Dataset/Dataset_minimum3')
#top_100_path = os.path.join(data_path, 'Dataset/Dataset_rest')

In [None]:
# TVEpisode
#top_100_path = os.path.join(data_path, 'TVEpisode/TVEpisode_top100')
#top_100_path = os.path.join(data_path, 'TVEpisode/TVEpisode_minimum3')
#top_100_path = os.path.join(data_path, 'TVEpisode/TVEpisode_rest')

In [None]:
# CollegeOrUniversity
#top_100_path = os.path.join(data_path, 'CollegeOrUniversity/CollegeOrUniversity_top100')
#top_100_path = os.path.join(data_path, 'CollegeOrUniversity/CollegeOrUniversity_minimum3')
#top_100_path = os.path.join(data_path, 'CollegeOrUniversity/CollegeOrUniversity_rest')

In [None]:
# School
#top_100_path = os.path.join(data_path, 'School/School_top100')
#top_100_path = os.path.join(data_path, 'School/School_minimum3')
#top_100_path = os.path.join(data_path, 'School/School_rest')

In [None]:
# Hospital
#top_100_path = os.path.join(data_path, 'Hospital/Hospital_top100')
#top_100_path = os.path.join(data_path, 'Hospital/Hospital_minimum3')
#top_100_path = os.path.join(data_path, 'Hospital/Hospital_rest')

In [None]:
# EducationalOrganization
#top_100_path = os.path.join(data_path, 'EducationalOrganization/EducationalOrganization_top100')
#top_100_path = os.path.join(data_path, 'EducationalOrganization/EducationalOrganization_minimum3')
#top_100_path = os.path.join(data_path, 'EducationalOrganization/EducationalOrganization_rest')

In [None]:
# SportsEvent
#top_100_path = os.path.join(data_path, 'SportsEvent/SportsEvent_top100')
#top_100_path = os.path.join(data_path, 'SportsEvent/SportsEvent_minimum3')
#top_100_path = os.path.join(data_path, 'SportsEvent/SportsEvent_rest')

In [None]:
# Movie
#top_100_path = os.path.join(data_path, 'Movie/Movie_top100')
#top_100_path = os.path.join(data_path, 'Movie/Movie_minimum3')
#top_100_path = os.path.join(data_path, 'Movie/Movie_rest')

In [None]:
# MusicAlbum
#top_100_path = os.path.join(data_path, 'MusicAlbum/MusicAlbum_top100')
#top_100_path = os.path.join(data_path, 'MusicAlbum/MusicAlbum_minimum3')
#top_100_path = os.path.join(data_path, 'MusicAlbum/MusicAlbum_rest')

In [None]:
# MusicRecording
#top_100_path = os.path.join(data_path, 'MusicRecording/MusicRecording_top100')
#top_100_path = os.path.join(data_path, 'MusicRecording/MusicRecording_minimum3')
#top_100_path = os.path.join(data_path, 'MusicRecording/MusicRecording_rest')

In [None]:
# Hotel
#top_100_path = os.path.join(data_path, 'Hotel/Hotel_top100')
#top_100_path = os.path.join(data_path, 'Hotel/Hotel_minimum3')
#top_100_path = os.path.join(data_path, 'Hotel/Hotel_rest')

In [12]:
# JobPosting
#top_100_path = os.path.join(data_path, 'JobPosting/JobPosting_top100')
#top_100_path = os.path.join(data_path, 'JobPosting/JobPosting_minimum3')
#top_100_path = os.path.join(data_path, 'JobPosting/JobPosting_rest')

In [21]:
# Recipe
#top_100_path = os.path.join(data_path, 'Recipe/Recipe_top100')
#top_100_path = os.path.join(data_path, 'Recipe/Recipe_minimum3')
#top_100_path = os.path.join(data_path, 'Recipe/Recipe_rest')

In [32]:
# Event
#top_100_path = os.path.join(data_path, 'Event/Event_top100')
#top_100_path = os.path.join(data_path, 'Event/Event_minimum3')
#top_100_path = os.path.join(data_path, 'Event/Event_rest')

In [36]:
# Book
#top_100_path = os.path.join(data_path, 'Book/Book_top100')
#top_100_path = os.path.join(data_path, 'Book/Book_minimum3')
#top_100_path = os.path.join(data_path, 'Book/Book_rest')

In [40]:
# CreativeWork
#top_100_path = os.path.join(data_path, 'CreativeWork/CreativeWork_top100')
#top_100_path = os.path.join(data_path, 'CreativeWork/CreativeWork_minimum3')
#top_100_path = os.path.join(data_path, 'CreativeWork/CreativeWork_rest')

In [60]:
# LocalBusiness
#top_100_path = os.path.join(data_path, 'LocalBusiness/LocalBusiness_top100')
top_100_path = os.path.join(data_path, 'LocalBusiness/LocalBusiness_minimum3')
#top_100_path = os.path.join(data_path, 'LocalBusiness/LocalBusiness_rest')

In [23]:
# Person
#top_100_path = os.path.join(data_path, 'Person/Person_top100')
#top_100_path = os.path.join(data_path, 'Person/Person_minimum3')
#top_100_path = os.path.join(data_path, 'Person/Person_rest')

In [28]:
# Product 
#top_100_path = os.path.join(data_path, 'Product/Product_top100')
#top_100_path = os.path.join(data_path, 'Product/Product_minimum3')
#top_100_path = os.path.join(data_path, 'Product/Product_rest')

In [56]:
# Place
#top_100_path = os.path.join(data_path, 'Place/Place_top100')
#top_100_path = os.path.join(data_path, 'Place/Place_minimum3')
#top_100_path = os.path.join(data_path, 'Place/Place_rest')

In [52]:
# Restaurant
#top_100_path = os.path.join(data_path, 'Restaurant/Restaurant_top100')
#top_100_path = os.path.join(data_path, 'Restaurant/Restaurant_minimum3')
#top_100_path = os.path.join(data_path, 'Restaurant/Restaurant_rest')

In [61]:
remove_irrelevant_tlds()

['LocalBusiness_fusionpeluqueria.com_September2020.json.gz', 'LocalBusiness_morningpointe.com_September2020.json.gz', 'LocalBusiness_linns.com_September2020.json.gz', 'LocalBusiness_ilprogettoeu.com_September2020.json.gz', 'LocalBusiness_news-herald.net_September2020.json.gz', 'LocalBusiness_tripmatchmaker.com_September2020.json.gz', 'LocalBusiness_lithgowmercury.com.au_September2020.json.gz', 'LocalBusiness_searchonamerica.com_September2020.json.gz', 'LocalBusiness_avoriaz.com_September2020.json.gz', 'LocalBusiness_chemdryofcharleston.com_September2020.json.gz', 'LocalBusiness_norfolkplaces.co.uk_September2020.json.gz', 'LocalBusiness_mikebrewermotors.com_September2020.json.gz', 'LocalBusiness_flexibuy.com.au_September2020.json.gz', 'LocalBusiness_flux-dance.com_September2020.json.gz', 'LocalBusiness_danielschoch.com_September2020.json.gz', 'LocalBusiness_jkerobotics.com_September2020.json.gz', 'LocalBusiness_vistahomeimprovement.com_September2020.json.gz', 'LocalBusiness_opcare.co.uk

In [62]:
remove_with_fasttext()

In [63]:
# list all top 100 product files
files = [file for file in os.listdir(top_100_path) if file.endswith('.json.gz')]
print(len(files))

empty_dfs = 0
for file in files:
        df = pd.read_json(os.path.join(top_100_path, '{}'.format(file)), compression='gzip', lines=True)
        if len(df) == 0:
          os.remove(os.path.join(top_100_path, '{}'.format(file)))


files = [file for file in os.listdir(top_100_path) if file.endswith('.json.gz')]
print('Non empty dfs:', len(files))

31602
Non empty dfs: 14943
