In [1]:
import os
import random
import json
from pprint import pprint

In [2]:
class Build_Profiles:
    '''
    Index a data folder of profile and video JSON files and write trimmed copies.

    On construction the input folder is walked to locate the profiles,
    base-videos and related-videos directories, and every profile JSON file is
    loaded into ``self.profiles``. Video files are only read when one of the
    ``output_profiles_*`` methods is called.
    '''

    # Specify data location, and output location.
    def __init__(self, input_path='data', output_path='output'):
        '''
        Input:
            input_path: data dir, default 'data'
            output_path: output dir, default 'output', created if it does not exist
        Raises:
            FileNotFoundError: if input_path does not exist
        '''
        self.input_path = input_path
        self.output_path = output_path
        if not os.path.exists(self.input_path):
            # FileNotFoundError is a subclass of Exception, so callers that
            # catch the previously raised bare Exception keep working.
            raise FileNotFoundError('Input path not found, or /Data folder not exist.')
        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)
        # Profile details, keyed by the part of each profile filename that
        # follows its second underscore (extension removed)
        self.profiles = {}
        # *_dirs are the root directories path for files of profiles, base/related videos
        self.profiles_dir, self.videos_base_dir, self.videos_related_dir = '', '', ''
        # Names of files are stored separately, used together with dir path above
        self.profiles_files, self.videos_base_files, self.videos_related_files = [], [], []
        # Base videos for each profile, keyed by base-video filename without extension.
        # base_videos:          list of urls
        # base_videos_details:  list of one-entry dicts {url: remaining fields of the video}
        # base_videos_summary:  number of urls kept for the key
        self.base_videos, self.base_videos_details, self.base_videos_summary = {}, {}, {}
        # Related videos, same three shapes, keyed by related-video filename without extension
        self.related_videos, self.related_videos_details, self.related_videos_summary = {}, {}, {}
        # Run to parse the input folder
        self._parse_folders()
        # Run to generate profile details
        self._generate_profiles()

    # Parse data folder
    def _parse_folders(self):
        '''
        Walk input_path and record the profiles / base-videos / related-videos
        directories together with the file names found in each.

        Keywords are matched against the directory path *relative to*
        input_path, so a keyword that happens to appear in a parent folder of
        the data directory cannot misclassify every walked directory.
        If several directories match the same category, the last one walked wins.
        '''
        root = self.input_path
        for cur_dirpath, _sub_dirnames, cur_filenames in os.walk(root):
            # The root itself becomes '.', which matches none of the keywords.
            rel_dir = os.path.relpath(cur_dirpath, root)
            # Average profile files
            if 'profiles' in rel_dir:
                self.profiles_dir = cur_dirpath
                self.profiles_files = cur_filenames
            # Base videos, skip file names containing 'ndjson'
            elif 'videos' in rel_dir and 'base' in rel_dir:
                self.videos_base_dir = cur_dirpath
                self.videos_base_files = [
                    each for each in cur_filenames if 'ndjson' not in each]
            # Related videos, skip directories whose relative path contains 'ndjson'
            elif 'videos' in rel_dir and 'related' in rel_dir and 'ndjson' not in rel_dir:
                self.videos_related_dir = cur_dirpath
                self.videos_related_files = cur_filenames

    # Read json file
    def _read_json(self, path):
        '''
        Input:
            path: file path
        Return:
            tuple (parsed json content, its length, its type)
        '''
        # JSON interchange text is UTF-8; do not depend on the platform default.
        with open(path, encoding='utf-8') as f:
            jfile = json.load(f)
        return jfile, len(jfile), type(jfile)

    # Populate profile details
    def _generate_profiles(self):
        '''
        Load every profile file into self.profiles, skipping file names that
        contain 'ndjson' or 'table'.
        '''
        for profile in self.profiles_files:
            if 'ndjson' in profile or 'table' in profile:  # Skip ndjson and table files
                continue
            # Key is whatever follows the second underscore of the file stem,
            # so 'a_b_the_donald.json' is stored under 'the_donald'.
            name = os.path.splitext(profile)[0].split('_', 2)[-1]
            profile_path = os.path.join(self.profiles_dir, profile)
            self.profiles[name] = self._read_json(profile_path)[0]

    # Split a sequence of video records into urls and per-url details
    @staticmethod
    def _split_videos(videos):
        '''
        Input:
            videos: iterable of dicts, each with a 'url' key
        Return:
            (list of urls, list of one-entry dicts {url: other fields of the record})
        '''
        videos_short, videos_details = [], []
        for video in videos:
            url = video['url']
            videos_short.append(url)
            details = {field: value for field, value in video.items() if field != 'url'}
            videos_details.append({url: details})
        return videos_short, videos_details

    # Collect all the videos from a json file
    def _load_videos(self, file):
        '''
        Input:
            file: path of a json file whose top level is a list of video records
        Return:
            (list of urls, list of one-entry dicts {url: other fields of the record})
        '''
        videos = self._read_json(file)[0]
        return self._split_videos(videos)

    # Collect all the base videos for all profiles
    def _build_profiles_base(self, limit):
        '''
        Input:
            limit: int, how many top videos to choose
        '''
        for file in self.videos_base_files:
            path = os.path.join(self.videos_base_dir, file)
            v, v_d = self._load_videos(path)
            key = os.path.splitext(file)[0]
            self.base_videos[key] = v[:limit]
            self.base_videos_details[key] = v_d[:limit]
            self.base_videos_summary[key] = len(self.base_videos[key])

    # Give data and file_path (need include .json extension), write file, overwrite if exists.
    def _write_json(self, data, file):
        '''
        Input:
            data: data to write
            file: file path to write
        '''
        with open(file, 'w', encoding='utf-8') as f:
            # Indent will help json viewer properly display the format
            json.dump(data, f, indent=4)

    # Write one json file per key, optionally shuffled
    def _write_video_files(self, prefix, videos_by_key, shuffle):
        '''
        Input:
            prefix: output file name prefix, the key and '.json' are appended
            videos_by_key: dict mapping key -> list to write
            shuffle: boolean, shuffle a copy of each list before writing
        '''
        for key, videos in videos_by_key.items():
            # Work on a copy so the class attributes keep their original order.
            data = videos[:]
            if shuffle:
                random.shuffle(data)
            self._write_json(data, os.path.join(self.output_path, prefix + key + '.json'))

    # Generate files based on the base output
    # Only the output files are shuffled if shuffle is True, the class attributes will not be shuffled,
    # to prevent uneven shuffle after sampled with related videos
    # Turn details False will stop write detailed version of videos
    def output_profiles_base(self, shuffle=False, details=True, limit=50):
        '''
        Input:
            shuffle: boolean, whether to shuffle the output
                     (url lists and detailed lists are shuffled independently)
            details: boolean, whether to output detailed version
            limit: int, how many to output
        Writes into output_path:
            base_videos_<key>.json, base_summary.json and, when details is True,
            base_videos_details_<key>.json
        '''
        # Run to generate base profile videos
        self._build_profiles_base(limit)
        self._write_video_files('base_videos_', self.base_videos, shuffle)
        self._write_json(self.base_videos_summary,
                         os.path.join(self.output_path, 'base_summary.json'))
        if details:
            self._write_video_files('base_videos_details_', self.base_videos_details, shuffle)

    # Read a json profile file, output content, length, type. For external use.
    def read_json(self, file_path):
        return self._read_json(file_path)

    # Read a json video file, output short list (url only), detailed list. For external use.
    def read_videos(self, file_path):
        return self._load_videos(file_path)

    # Collect all the videos from a related-videos json file
    def _load_related_videos(self, file):
        '''
        Input:
            file: path of a json file whose top level is an object holding the
                  video records under the 'data' key
        Return:
            (list of urls, list of one-entry dicts {url: other fields of the record})
        '''
        payload = self._read_json(file)[0]
        return self._split_videos(payload['data'])

    # Collect all the related videos for all related-video files
    def _build_profiles_related(self, limit):
        '''
        Input:
            limit: int, how many top videos to keep per file
        '''
        for file in self.videos_related_files:
            key = os.path.splitext(file)[0]
            path = os.path.join(self.videos_related_dir, file)
            v, v_d = self._load_related_videos(path)
            self.related_videos[key] = v[:limit]
            self.related_videos_details[key] = v_d[:limit]
            self.related_videos_summary[key] = len(self.related_videos[key])

    def output_profiles_related(self, shuffle=False, details=True, limit=50):
        '''
        Input:
            shuffle: boolean, currently unused
            details: boolean, currently unused
            limit: int, how many videos to keep per related-video file

        Populates related_videos, related_videos_details and
        related_videos_summary. No files are written yet.
        TODO: write the related output files, mirroring output_profiles_base.
        '''
        self._build_profiles_related(limit)

In [3]:
# Instantiate with the defaults (input 'data', output 'output'); the constructor
# scans the input folder and loads the profile JSON files.
BP = Build_Profiles()

In [4]:
# Keys under which the profile JSON files were stored.
BP.profiles.keys()


dict_keys(['enoughtrumpspam', 'feminism', 'incel', 'inceltears', 'mensrights', 'metoo', 'the_donald'])

In [5]:
# File names found in the base-videos directory (names containing 'ndjson' are excluded).
BP.videos_base_files

['enoughtrumpspam.json',
 'feminism.json',
 'incel.json',
 'inceltears.json',
 'mensrights.json',
 'metoo.json',
 'the_donald.json']

In [6]:
# Keys under which the profile JSON files were stored.
# NOTE(review): this repeats an earlier `BP.profiles.keys()` cell; consider removing one.
BP.profiles.keys()

dict_keys(['enoughtrumpspam', 'feminism', 'incel', 'inceltears', 'mensrights', 'metoo', 'the_donald'])

In [4]:
# Write the base-video url lists, the summary and the detailed lists to the
# output folder, unshuffled and capped at 50 videos per profile.
# NOTE(review): the execution counter on this cell is out of sequence with the
# cells around it; confirm the notebook still passes Restart & Run All.
BP.output_profiles_base(shuffle=False, details=True, limit=50)

In [6]:
# Load the related videos into BP.related_videos, BP.related_videos_details and
# BP.related_videos_summary using the default limit of 50 per file.
# NOTE(review): this call only populates those attributes; it writes no files.
BP.output_profiles_related()

In [7]:
# Number of keys (related-video file names without extension) loaded by the call above.
len(BP.related_videos)

40134

In [8]:
# The summary holds one count per key, so this matches len(BP.related_videos).
len(BP.related_videos_summary)

40134

In [12]:
# Print a sample of (key, kept-video count) pairs from the related summary.
# The break fires only once i exceeds 100, i.e. after the pair at index 101 has
# been printed, so up to 102 pairs are shown.
for i, p in enumerate(BP.related_videos_summary.items()):
    print(p)
    if i > 100:
        break

('00mph', 9)
('00sMusic', 5)
('00sRock', 50)
('03greedo', 5)
('07Scape', 50)
('093game', 9)
('09zero', 1)
('0ad', 42)
('0b0t', 1)
('0chain', 5)
('0w0', 4)
('0x2a_personal', 1)
('0xbitcoin', 19)
('0xcert', 15)
('0xProject', 50)
('1000thworldproblems', 4)
('1001Movies', 4)
('100DaysofKeto', 1)
('100kClub', 5)
('100pushups', 25)
('100sets', 4)
('100sexiest', 7)
('100thieves', 50)
('100yearclub', 15)
('100yearsago', 50)
('101Wicca', 8)
('1022', 29)
('1037_Studios', 1)
('1046FM', 7)
('10cloverfieldlane', 16)
('10mm', 9)
('10s', 8)
('112263Hulu', 9)
('117thOSINT', 1)
('11foot8', 30)
('11MRadio', 5)
('1200isfineIGUESSugh', 3)
('1200isjerky', 35)
('1200isplenty', 50)
('1200isplentyketo', 50)
('120db', 21)
('12Monkeys', 46)
('12thMan', 18)
('1337Foundation', 6)
('13451452251849519', 50)
('13or30', 50)
('13ReasonsWhy', 50)
('13reasonswhyhate', 17)
('13thage', 2)
('14ers', 19)
('1500isplenty', 50)
('17776', 12)
('18650', 20)
('18650BatteryPacks', 3)
('18650masterrace', 21)
('18XX', 12)
('18_19', 