In [None]:
!pip install pyyaml
!pip install nbconvert

In [100]:
import datetime
import json
import os
import subprocess
import time

import pandas as pd
import tqdm
import yaml

In [82]:
# Folder that is scanned; every entry found directly inside it is later
# handed to the parser as one Quantlet folder name.
PATH = './data/sorted/python/prepared'
REPOS = os.listdir(PATH)

# Development switch (only guards the assignment of `f` below).
DEV = True

# Columns of the parsed table. 'qname' and 'Sourcecode' get special handling
# in parse_metainfo; the remaining names are looked up as metainfo keys.
COLUMNS = ['qname', 'Description', 'Sourcecode', 'Keywords', 'Author']

if DEV:
    # NOTE(review): `f` is not used by any cell visible here - presumably a
    # single folder name for manual experiments; confirm or remove.
    f = 'DeepLearning'

In [117]:
def parse_metainfo(path: str,
                   columns: list) -> pd.DataFrame:
    """
    Read a metainfo YAML file into a one-row DataFrame.

    Parameters
    ----------
    path : str
        Location of the metainfo file; it is parsed with ``yaml.safe_load``.
    columns : list
        Column names of the returned frame. ``'qname'`` is filled from the
        ``'Name of QuantLet'`` key (falling back to ``'Name of Quantlet'``),
        ``'Sourcecode'`` is initialised to the list ``['BEGINNING OF CODE']``
        and every other name is looked up verbatim as a key of the file.

    Returns
    -------
    pd.DataFrame
        A single row with index label ``0`` and object dtype. A field that is
        missing from the file holds the string ``'ERROR'``.

    Notes
    -----
    Values are stored exactly as parsed. In particular a YAML list (for
    example several keywords) is kept as a list inside the cell instead of
    being broadcast as a column, which used to fail and end up as ``'ERROR'``.
    Errors from opening or parsing the file are not caught here.
    """
    with open(path, 'r') as f:
        metainfo_file = yaml.safe_load(f)

    # An empty file parses to None and a malformed one may parse to a scalar
    # or list; neither has fields, so every lookup below yields 'ERROR'.
    if not isinstance(metainfo_file, dict):
        metainfo_file = {}

    record = {}
    for col in columns:
        if col == 'qname':
            # Two spellings of the name key are accepted.
            qname = metainfo_file.get('Name of QuantLet', 'ERROR')
            if qname == 'ERROR':
                qname = metainfo_file.get('Name of Quantlet', 'ERROR')
            record[col] = qname
        elif col == 'Sourcecode':
            # Fresh list per call; callers append the source files to it.
            record[col] = ['BEGINNING OF CODE']
        else:
            record[col] = metainfo_file.get(col, 'ERROR')

    # Building the row from a record keeps list values as single cells.
    return pd.DataFrame([record], columns=columns, index=[0], dtype=object)


def parse_quantlet(quantlet_name: str,
                    path: str,
                    columns: list) -> pd.DataFrame:
    """
    Parse one Quantlet folder: its metainfo plus the source of every script.

    Parameters
    ----------
    quantlet_name : str
        Name of the folder inside ``path``.
    path : str
        Directory that contains the Quantlet folder.
    columns : list
        Columns forwarded to ``parse_metainfo``; must include ``'Sourcecode'``
        because the script contents are appended to that cell.

    Returns
    -------
    pd.DataFrame
        The one-row frame produced by ``parse_metainfo`` whose ``'Sourcecode'``
        list has one extra element (a list of lines) per script that was read.

    Raises
    ------
    IndexError
        If the folder contains no metainfo file.
    AssertionError
        If an entry that is not skipped is not a regular file, or its name
        does not contain ``'py'``.
    subprocess.CalledProcessError
        If converting a notebook with ``jupyter nbconvert`` fails.

    Notes
    -----
    Notebooks are converted synchronously and the generated ``.py`` file is
    what gets read. Previously the conversion was started without waiting for
    it and the raw ``.ipynb`` JSON was read instead.
    """
    METAINFO_MARKERS = ("metainfo", "Metainfo", "MetaInfo")
    # Entries whose name contains any of these substrings are not source code.
    SKIP_MARKERS = METAINFO_MARKERS + (".DS_Store",
                                       ".git",
                                       ".Rhistory",
                                       ".idea",
                                       ".ipynb_checkpoints")

    QUANTLET_PATH = os.path.join(path, quantlet_name)
    FILES = os.listdir(QUANTLET_PATH)

    metainfo_candidates = [s for s in FILES
                           if any(marker in s for marker in METAINFO_MARKERS)]
    if not metainfo_candidates:
        raise IndexError(f"no metainfo file found in {QUANTLET_PATH}")
    META_PATH = os.path.join(QUANTLET_PATH, metainfo_candidates[0])

    Quantlet = parse_metainfo(META_PATH,
                              columns)

    # A notebook and the script generated from it (e.g. left over from an
    # earlier run) resolve to the same path; read each path only once.
    already_read = set()

    for file_name in FILES:

        if any(marker in file_name for marker in SKIP_MARKERS):
            continue

        FILE_PATH = os.path.join(QUANTLET_PATH, file_name)

        assert os.path.isfile(FILE_PATH), f"not a regular file: {FILE_PATH}"

        if file_name.endswith('.ipynb'):
            # Argument list instead of a shell string: paths with spaces or
            # shell metacharacters are passed through untouched, and run()
            # blocks until the conversion has finished.
            subprocess.run(["jupyter", "nbconvert", "--to", "python", FILE_PATH],
                           check=True)
            # nbconvert writes <notebook stem>.py next to the notebook.
            file_name = os.path.splitext(file_name)[0] + '.py'
            FILE_PATH = os.path.join(QUANTLET_PATH, file_name)

        assert 'py' in file_name, f"unexpected non-Python file: {FILE_PATH}"

        if FILE_PATH in already_read:
            continue
        already_read.add(FILE_PATH)

        with open(FILE_PATH, 'r') as f:
            source_code = f.readlines()

        # The cell holds a list object, so appending mutates it in place.
        Quantlet.loc[0, 'Sourcecode'].append(source_code)

    return Quantlet


def quantlets_to_df(quantlet_list: list,
                path: str,
                columns: list) -> pd.DataFrame:
    """
    Parse several Quantlet folders and stack the results into one DataFrame.

    Parameters
    ----------
    quantlet_list : list
        Folder names inside ``path``; names containing ``'.DS_Store'`` are
        skipped.
    path : str
        Directory that contains the Quantlet folders.
    columns : list
        Columns forwarded to ``parse_quantlet``.

    Returns
    -------
    pd.DataFrame
        One row per parsed folder with ``columns`` plus ``'REPO_NAME'`` (the
        folder name). Row labels are kept as returned by ``parse_quantlet``
        and are not renumbered. When nothing is parsed, an empty frame with
        just ``columns`` is returned.
    """
    # Collect the per-folder frames and concatenate once: concatenating inside
    # the loop recopies the growing frame on every iteration and starts from
    # an empty frame, which newer pandas versions warn about.
    parsed_frames = []
    for quantlet in tqdm.tqdm(quantlet_list):
        if '.DS_Store' in quantlet:
            continue
        parsed_q = parse_quantlet(quantlet,
                                    path,
                                    columns)
        parsed_q['REPO_NAME'] = quantlet
        parsed_frames.append(parsed_q)

    if parsed_frames:
        quantlets_df = pd.concat(parsed_frames, axis=0)
    else:
        # pd.concat rejects an empty list.
        quantlets_df = pd.DataFrame(columns=columns)

    print('Parsing Finished')
    return quantlets_df

In [None]:
# Parse every entry listed in REPOS into one table.
df = quantlets_to_df(quantlet_list=REPOS, path=PATH, columns=COLUMNS)

# Each 'Sourcecode' list starts with the 'BEGINNING OF CODE' marker, so the
# number of scripts is the list length minus that one entry.
df['code_scripts'] = df['Sourcecode'].apply(lambda entries: len(entries) - 1)

In [None]:
# Write the parsed table to CSV; index=False leaves the row labels out of the file.
df.to_csv('./data/python_quantlets.csv', index=False)