In [None]:
import json
import random
from pathlib import Path

import pandas as pd
import plotly_express as px
from IPython.display import display
from plotly.offline import init_notebook_mode
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [None]:
# Enable plotly's offline notebook mode before any figure is shown.
# NOTE(review): `connected=True` presumably loads plotly.js from a CDN instead
# of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

## Format conversion

In [None]:
def read_notebook(json_notebook, orderings):
    """
    Read .json notebook in AI4Code/train folder.
    Args:
        json_notebook: Path to .json notebook.
        orderings: AI4Code/train_orders.csv as `pd.DataFrame` indexed by
            notebook id and having a `cell_order` column.

    Returns:
        `notebook_id`: Notebook id in `json_notebook` ex: '0a0ae595d33e5e'.
        `notebook`: Notebook as dict.
        `ordering`: Value in `cell_order` in train_orders.csv.

    Raises:
        KeyError: If the notebook id is missing from `orderings`.
    """
    notebook_id = Path(json_notebook).stem
    # One `.loc[row, column]` lookup instead of chained indexing.
    ordering = orderings.loc[notebook_id, 'cell_order']
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(json_notebook, encoding='utf-8') as raw:
        notebook = json.load(raw)
    return notebook_id, notebook, ordering


def get_notebook_cells(ordering, notebook):
    """
    Build the ordered list of cells for a notebook, attaching the extra
        fields needed for .ipynb generation.
    Args:
        `ordering`: Value in `cell_order` in train_orders.csv, i.e. cell ids
            separated by whitespace.
        `notebook`: Notebook as dict with `cell_type` and `source` mappings
            keyed by cell id.

    Returns:
        cells: A list of dict, one per cell id, in the given order.
    """
    return [
        {
            'id': cell_id,
            'cell_type': notebook['cell_type'][cell_id],
            'source': notebook['source'][cell_id],
            # Fresh containers per cell so cells never share mutable state.
            'metadata': {},
            'outputs': [],
            # 1-based position of the cell within the notebook.
            'execution_count': position,
        }
        for position, cell_id in enumerate(ordering.split(), start=1)
    ]


def make_displayable(json_notebook, base_notebook, orderings):
    """
    Turn a raw .json notebook into a browser-friendly notebook dict.
    Args:
        json_notebook: Path to .json notebook.
        base_notebook: A dictionary of notebook-level metadata. Its `cells`
            entry is overwritten, so the dict passed in is modified.
        orderings: AI4Code/train_orders.csv as `pd.DataFrame` indexed by
            notebook id and having a `cell_order` column.

    Returns:
        `base_notebook` itself, with `cells` set to the ordered cells.
    """
    # The notebook id (first element) is not needed here.
    _, raw_notebook, cell_order = read_notebook(json_notebook, orderings)
    ordered_cells = get_notebook_cells(cell_order, raw_notebook)
    base_notebook['cells'] = ordered_cells
    return base_notebook


def create_df(src, orderings, ancestors, verbose=True):
    """
    Convert .json notebooks and corresponding data to `pd.DataFrame`
    Args:
        src: Path to `AI4Code/train`
        orderings: Path to `AI4Code/train_orders.csv`
        ancestors: Path to `AI4Code/train_ancestors.csv`
        verbose: If False, progress will not be displayed.

    Returns:
        Converted dataset as `pd.DataFrame` having columns:
        [`cell_id`, `cell_type`, `source`, `order`, `notebook_id`,
            `ancestor_id`, `parent_id`]
        Notebooks appear in sorted file-name order, and each notebook's
        rows keep their own 0-based index.

    Raises:
        ValueError: If `src` contains no .json notebooks.
    """
    orderings = pd.read_csv(orderings).set_index('id')
    ancestors = pd.read_csv(ancestors).set_index('id')
    # Sort so the output row order does not depend on directory listing order.
    json_notebooks = sorted(Path(src).glob('*.json'))
    total_notebooks = len(json_notebooks)
    if not total_notebooks:
        # Fail with an actionable message instead of letting `pd.concat`
        # complain about an empty list further down.
        raise ValueError(f'No .json notebooks found in {src}')
    data = []
    for i, json_notebook in enumerate(json_notebooks, 1):
        notebook_id, notebook, ordering = read_notebook(
            json_notebook.as_posix(), orderings
        )
        ancestor_id, parent_id = ancestors.loc[
            notebook_id, ['ancestor_id', 'parent_id']
        ]
        cells = get_notebook_cells(ordering, notebook)
        df = (
            pd.DataFrame(cells)
            .rename(columns={'id': 'cell_id', 'execution_count': 'order'})
            # Only needed for .ipynb generation, not for the dataset.
            .drop(['metadata', 'outputs'], axis=1)
        )
        df['notebook_id'] = notebook_id
        df['ancestor_id'] = ancestor_id
        df['parent_id'] = parent_id
        data.append(df)
        if verbose:
            # `\r` rewrites the same line so the counter updates in place.
            print(f'\rParsed {i}/{total_notebooks} files', end='')
    if verbose:
        print()
    return pd.concat(data)


def create_nb_samples(src, dest, orderings, metadata, n=50, verbose=True, **kwargs):
    """
    Convert a number of .json notebooks in `AI4Code/train` to browser-friendly
        format and save the results as .ipynb.
    Args:
        src: Path to `AI4Code/train`.
        dest: Path to output directory where the notebooks will be saved.
            Created (including parents) if it does not exist.
        orderings: Path to `AI4Code/train_orders.csv`.
        metadata: Path to `ipynb-metadata.json`.
        n: Maximum number of notebooks to convert. When `src` holds fewer
            than `n` notebooks, all of them are converted.
        verbose: If False, progress will not be displayed.
        **kwargs: kwargs passed to `json.dumps`

    Returns:
        None
    """
    dest_dir = Path(dest)
    dest_dir.mkdir(exist_ok=True, parents=True)
    orderings = pd.read_csv(orderings).set_index('id')
    # Sorted population: with a seeded RNG the drawn sample then does not
    # depend on directory listing order.
    candidates = sorted(Path(src).glob('*.json'))
    # `random.sample` raises if asked for more items than exist, so cap `n`.
    json_files = random.sample(candidates, min(n, len(candidates)))
    total = len(json_files)
    with open(metadata, encoding='utf-8') as base:
        base_notebook = json.load(base)
    for i, f in enumerate(json_files, 1):
        # `make_displayable` overwrites `base_notebook['cells']` on each call,
        # so the dict must be serialised before the next iteration.
        notebook = make_displayable(f.as_posix(), base_notebook, orderings)
        target = dest_dir / f'{f.stem}.ipynb'
        with open(target, 'w', encoding='utf-8') as converted:
            converted.write(json.dumps(notebook, **kwargs))
        if verbose:
            # `\r` rewrites the same line so the counter updates in place.
            print(f'\rSaved {i}/{total} notebooks', end='')
    if verbose:
        print()


### Conversion: JSON --> Parquet, IPYNB

The following section parses the .json files and converts them to a pandas DataFrame with the following columns:

* cell_id
* cell_type
* source
* order
* notebook_id
* ancestor_id
* parent_id

The results are then saved to `ai4code.parquet` for further processing.

In [None]:
%%time

# Parse every training notebook into one long table (one row per cell),
# joined with its ordering and ancestry information.
df = create_df(
    '../input/AI4Code/train',
    '../input/AI4Code/train_orders.csv',
    '../input/AI4Code/train_ancestors.csv',
)
# Persist the table for further processing; the index is not written out.
df.to_parquet('ai4code.parquet', index=False)

In [None]:
# Preview the first rows of the converted dataset.
df.head()

In [None]:
# Summarise the columns of the converted dataset.
df.info()

#### Create sample notebooks that can be viewed in a browser

In [None]:
# Seed the RNG so that re-running the notebook draws the same sample of
# notebooks (given the same directory listing) instead of a new one each time.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

# Convert a random sample of training notebooks to .ipynb files under
# `notebook-samples`.
create_nb_samples(
    '../input/AI4Code/train',
    'notebook-samples',
    '../input/AI4Code/train_orders.csv',
    '../input/ai4code-ipynb-metadata/ipynb-metadata.json',
)

The resulting notebooks are written to the `notebook-samples` directory (packaged as `notebook-samples.zip`).

## Preprocessing & EDA

In [None]:
def plot_source_counts(df, **kwargs):
    """
    Draw a pie chart of how many cells there are of each cell type.
    Args:
        df: ai4code dataset as `pd.DataFrame` having columns:
            [`cell_id`, `cell_type`, `source`, `order`, `notebook_id`,
            `ancestor_id`, `parent_id`]
        **kwargs: kwargs passed to `plotly.express._chart_types.pie`

    Returns:
        plotly.graph_objs._figure.Figure
    """
    cell_type_counts = df['cell_type'].value_counts()
    # One slice per distinct cell type, sized by its number of occurrences.
    pie_options = dict(
        names=cell_type_counts.index,
        values=cell_type_counts.values,
        title='Source frequencies',
    )
    return px.pie(cell_type_counts, **pie_options, **kwargs)


def plot_ngram(frequencies, n=None, **kwargs):
    """
    Chart ngram frequencies, one horizontal bar per ngram.
    Args:
        frequencies: `pd.DataFrame` having columns: `word`, `frequency`.
        n: `n` passed to `pd.DataFrame.head`.
        **kwargs: kwargs passed to plotly.express._chart_types.histogram.

    Returns:
        plotly.graph_objs._figure.Figure
    """
    leading_rows = frequencies.head(n)
    # Ascending sort of the selected rows before handing them to plotly.
    ascending_rows = leading_rows.sort_values(by='frequency')
    return px.histogram(ascending_rows, x='frequency', y='word', **kwargs)


def generate_ngram(df, cell_type, **kwargs):
    """
    Generate ngram word frequencies.
    Args:
        df: ai4code dataset as `pd.DataFrame` processed by
            `cleanup_text` having columns:
            [`cell_id`, `cell_type`, `source`, `order`, `notebook_id`,
            `ancestor_id`, `parent_id`]
        cell_type: str, `code` or `markdown`.
        **kwargs: kwargs passed to `sklearn.feature_extraction.text.CountVectorizer`

    Returns:
        frequencies: `pd.DataFrame` having columns: `word`, `frequency`,
            sorted by `frequency` in descending order.
    """
    corpus = df.loc[df['cell_type'] == cell_type, 'source']
    vec = CountVectorizer(**kwargs)
    # `fit_transform` learns the vocabulary and counts in a single pass;
    # a separate `fit` + `transform` would tokenize the whole corpus twice.
    # Summing over axis 0 gives one total per vocabulary term.
    word_sum = vec.fit_transform(corpus).sum(axis=0)
    # `vocabulary_` maps each term to its column index in the count matrix.
    frequencies = pd.Series(
        {word: word_sum[0, idx] for word, idx in vec.vocabulary_.items()}
    ).reset_index()
    frequencies.columns = ['word', 'frequency']
    return frequencies.sort_values(by='frequency', ascending=False)


def generate_wordcloud(frequencies, **kwargs):
    """
    Build a word cloud from a table of word frequencies.
    Args:
        frequencies: `pd.DataFrame` having columns: `word`, `frequency`
            (in that order; rows are read positionally as pairs).
        **kwargs: kwargs passed to `wordcloud.wordcloud.WordCloud`.

    Returns:
        `wordcloud.wordcloud.WordCloud`
    """
    # Each row of the 2-column table becomes one `word -> frequency` entry.
    word_weights = dict(frequencies.values)
    cloud = WordCloud(**kwargs)
    cloud.fit_words(word_weights)
    return cloud


def cleanup_text(df):
    """
    Cleanup `source` in place.

    The text is lower-cased, typographic apostrophes are turned into plain
    ones, and then the regex rules below are applied (URLs, tags, digits and
    non-ASCII characters removed, contractions expanded, punctuation and
    repeated spaces collapsed).

    Args:
        df: ai4code dataset as `pd.DataFrame` having columns:
            [`cell_id`, `cell_type`, `source`, `order`, `notebook_id`,
            `ancestor_id`, `parent_id`]
    Returns:
        None
    """
    replacements = {
        r'http\S+': '',
        r'<.*?>': '',
        r'\d+': '',
        r'[^\x00-\x7F]+': '',
        r'don\'t': 'do not',
        r'won\'t': 'will not',
        r'can\'t': 'cannot',
        r'i\'m': 'i am',
        r'n\'t': ' not',
        r'\'re': ' are',
        r'\'s': ' is',
        r'\'d': ' would',
        r'\'ll': ' will',
        r'\'ve': ' have',
        r'\'m': ' am',
        r'\r': '',
        # NOTE(review): inside a character class `|` is literal and `\|` is an
        # escaped pipe, so this matches . | , ) ( / but not a backslash —
        # confirm that is the intended set.
        r'[.|,|)|(|\|/]': ' ',
        r'^\s+': '',
        r' +': ' ',
    }
    # `.str.lower()` returns a new Series, so an in-place `replace` on it
    # would be thrown away; keep the result instead. The apostrophe must be
    # normalised before `replacements` runs, because the non-ASCII rule would
    # otherwise delete it and the contraction rules could never match.
    normalised = df['source'].str.lower().replace({r'\’': "\'"}, regex=True)
    # Assign back explicitly rather than mutating a column selection in
    # place, which is not guaranteed to write through to `df`.
    df['source'] = normalised.replace(replacements, regex=True)


def generate_ngrams(df, ranges, labels, cell_type):
    """
    Compute one frequency table per ngram range and save each to parquet.
    Args:
        df: ai4code dataset as `pd.DataFrame` having columns:
            [`cell_id`, `cell_type`, `source`, `order`, `notebook_id`,
            `ancestor_id`, `parent_id`]
        ranges: A list of pairs, each passed to `generate_ngram` as
                `ngram_range`. ex: [(1, 1), (2, 2)]
        labels: A list of titles having the same size as `ranges`,
                used in the output file names. ex: ['unigram', 'bigram']
        cell_type: str, `code` or `markdown`.

    Raises: AssertionError for length mismatch.

    Returns:
        A list of `pd.DataFrame`, one per entry in `ranges`.
    """
    assert len(ranges) == len(labels), (
        f'`ranges` and `labels` should have equal sizes, got '
        f'{len(ranges), len(labels)}'
    )
    tables = []
    for ngram_range, label in zip(ranges, labels):
        # English stop words are always excluded from the counts.
        table = generate_ngram(
            df, cell_type, ngram_range=ngram_range, stop_words='english'
        )
        tables.append(table)
        # One file per (label, cell type) pair in the working directory.
        output_name = f'{label}_{cell_type}_frequencies.parquet'
        table.to_parquet(output_name, index=False)
    return tables

In [None]:
%%time

# Clean the `source` text column; `cleanup_text` works on `df` itself and
# returns None, so there is nothing to assign.
cleanup_text(df)

In [None]:
# Pie chart of code vs. markdown cell counts, also exported as standalone HTML.
fig = plot_source_counts(df, template='plotly_dark')
fig.write_html('source-frequencies.html')
fig.show()

In [None]:
%%time

# Uni-, bi- and trigram frequencies for markdown cells; each table is also
# written to `<label>_markdown_frequencies.parquet` by `generate_ngrams`.
ngram_ranges = (1, 1), (2, 2), (3, 3)
ngram_labels = 'unigram', 'bigram', 'trigram'
frequencies = generate_ngrams(df, ngram_ranges, ngram_labels, 'markdown')
for label, freq_df in zip(ngram_labels, frequencies):
    title = f'Markdown {label} frequencies'
    # Keep only n-grams seen more than 5 times, then plot the first 50 rows
    # (the tables arrive sorted by descending frequency).
    fig = plot_ngram(
        freq_df[freq_df['frequency'] > 5],
        50,
        template='plotly_dark',
        title=title,
    )
    # File name is the title lower-cased with spaces replaced by dashes.
    fig.write_html(f"{title.lower().replace(' ', '-')}.html")
    fig.show()

In [None]:
# `frequencies[0]` is the unigram table; when cells run top to bottom it is
# the markdown one produced by the previous cell.
wc = generate_wordcloud(frequencies[0], width=2500, height=1200)
wc.to_file('word-cloud-markdown.jpg')
display(wc.to_image())

In [None]:
%%time

# Uni- and bigram frequencies for code cells; each table is also written to
# `<label>_code_frequencies.parquet` by `generate_ngrams`.
# Note: this rebinds `ngram_ranges`, `ngram_labels` and `frequencies` from the
# markdown analysis above.
ngram_ranges = (1, 1), (2, 2)
ngram_labels = 'unigram', 'bigram'
frequencies = generate_ngrams(df, ngram_ranges, ngram_labels, 'code')
for label, freq_df in zip(ngram_labels, frequencies):
    title = f'Code {label} frequencies'
    # Keep only n-grams seen more than 5 times, then plot the first 50 rows
    # (the tables arrive sorted by descending frequency).
    fig = plot_ngram(
        freq_df[freq_df['frequency'] > 5],
        50,
        template='plotly_dark',
        title=title,
    )
    # File name is the title lower-cased with spaces replaced by dashes.
    fig.write_html(f"{title.lower().replace(' ', '-')}.html")
    fig.show()

In [None]:
# `frequencies[0]` is the unigram table; when cells run top to bottom it is
# the code one produced by the previous cell.
wc = generate_wordcloud(frequencies[0], width=2500, height=1200)
wc.to_file('word-cloud-code.jpg')
display(wc.to_image())