In [None]:
import re
import ast
import numpy as np
import pandas as pd
from glob import glob

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load every scraped CSV export found under `path` and stack them into one frame.
path = '../00_shared_data/Cookie-and-Ads-Extensions/Data/'
frames = []
n_entries = 0
# glob() yields files in arbitrary, OS-dependent order; sorting makes the row order
# (and therefore which duplicate row is kept first later on) reproducible.
for fname in sorted(glob(path+'*.csv')):
    print(f'Number of entries in data: {n_entries}')
    print(fname)
    frames.append(pd.read_csv(fname))
    n_entries += len(frames[-1])
# Concatenate once at the end instead of re-copying the growing frame per file.
# pd.concat() rejects an empty list, so fall back to an empty frame when no file matched.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f'Number of entries in data: {len(data)}')

In [None]:
# Preprocess data
# Drop the unused scraper columns first; the original 'developer' column has to go
# before 'developer-href' is renamed to 'developer', otherwise the names would clash.
data = (
    data
    .drop(columns=['extension-link', 'similars-links-href', 'developer', 'screenshots-src', 'categories-href'])
    .rename(columns={'extension-link-href': 'webstore-url', 'developer-href': 'developer', 'rating_count': 'rating-count', 'similars-links': 'similar-extensions', 'logo-src': 'logo'})
)
# Keep only the part of the URL after the store's '/detail' prefix.
# regex=False: the prefix contains '.' and must be matched literally.
# The .str accessor leaves missing URLs as NaN instead of raising on a float.
data['webstore-url'] = data['webstore-url'].str.replace('https://chromewebstore.google.com/detail', '', regex=False)

# postprocess user-count
# Remove thousands separators, then keep the first run of digits in the text.
# Cells that are missing or contain no digit at all come out of extract() as NaN
# and are counted as 0 users.
user_count_digits = (
    data['user-count']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
)
data['user-count'] = user_count_digits.fillna('0').astype('int64')

# postprocess rating-count
def _parse_rating_count(value):
    """Convert one scraped rating-count cell into an integer.

    Missing values become 0, values that are already numeric are truncated to
    int, plain counts ('87 ratings', '1 rating') are read as-is, and abbreviated
    counts ('1.2K ratings') are scaled by their magnitude suffix.
    """
    if not isinstance(value, str):
        # pd.isna() tests NaN by value; an identity check against np.nan misses
        # NaN objects that are not that exact singleton.
        return 0 if pd.isna(value) else int(value)

    text = value.replace('ratings', '').replace('rating', '').strip()
    if not text:
        return 0

    multiplier = {'K': 1_000, 'M': 1_000_000}.get(text[-1].upper())
    if multiplier is None:
        # No magnitude suffix: a count is a whole number, so '.' and ',' can
        # only be digit-group separators.
        return int(text.replace('.', '').replace(',', ''))

    # Abbreviated count: '.' is a decimal point. Scale the numeric value instead
    # of padding zeros so any number of decimals works ('1.25K' -> 1250).
    return round(float(text[:-1]) * multiplier)


data['rating-count'] = data['rating-count'].apply(_parse_rating_count).astype(int)

def _parse_records(cell):
    """Parse a cell holding the text form of a list of dicts; missing cells give []."""
    # Test NaN by value: an identity check against np.nan misses NaN objects
    # that are not that exact singleton and would pass them to literal_eval.
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return []
    return ast.literal_eval(cell)


def _pluck(column, key, clean=lambda text: text):
    """For every cell of `column`, return the list of `clean(record[key])` values."""
    return column.apply(lambda cell: [clean(record[key]) for record in _parse_records(cell)])


# postprocess screenshots
data['screenshots'] = _pluck(data['screenshots'], 'screenshots-src')
# First screenshot of each row, NaN when the row has none.
data['screenshots_main'] = data['screenshots'].apply(lambda shots: shots[0] if shots else np.nan)

# postprocess similar-extensions
data['similar-extensions'] = _pluck(data['similar-extensions'], 'similars-links-href', lambda href: href.replace('./detail', ''))

# postprocess categories
# 'categories-links' must be derived before 'categories' is overwritten below,
# because both read the raw 'categories' cells.
data['categories-links'] = _pluck(data['categories'], 'categories-href', lambda href: href.lstrip('.'))
data['categories'] = _pluck(data['categories'], 'categories', lambda label: label.lstrip('.'))

print(f'Number of entries before dropping duplicates: {len(data)}')

# The same extension can appear once per search that returned it. Collect the
# start URLs of all its rows into one comma-separated 'query' string (with the
# search prefix removed) before reducing to a single row per webstore URL.
search_prefix = 'https://chromewebstore.google.com/search/'
start_urls_per_extension = data.groupby('webstore-url')['web-scraper-start-url']
data['query'] = start_urls_per_extension.transform(
    lambda urls: ', '.join(urls).replace(search_prefix, '')
)
data = data.drop_duplicates('webstore-url')

# Final column selection and order.
column_order = [
    'name', 'description', 'webstore-url', 'url', 'url-href',
    'categories', 'categories-links', 'similar-extensions',
    'featured', 'logo', 'screenshots', 'screenshots_main',
    'user-count', 'rating', 'rating-count',
    'version', 'last-updated', 'languages', 'size',
    'developer', 'offered_by', 'trader-status', 'privacy-notice',
    'query', 'web-scraper-order',
]
data = data[column_order]

print(f'Number of entries after dropping duplicates: {len(data)}')
print()

data.head(3)

In [None]:
# Duplicates by name: number of rows whose name already occurred in an earlier row
int(data.duplicated('name').sum())

In [None]:
# Rows per extension name, showing the 25 names with the most rows.
rows_per_name = data.groupby('name').size()
rows_per_name.sort_values(ascending=False).head(25)

In [None]:
# Write the preprocessed table without the index column.
# Note: the file is tab-delimited even though its name ends in .csv.
fname_processed = '../00_shared_data/Cookie-and-Ads-Extensions/Chrome-Extensions-Cookie-Ads-Consent_preprocessed.csv'
data.to_csv(path_or_buf=fname_processed, sep='\t', index=False)