# AI4Code, A Simple Neural Network
Hello! This notebook will implement a simple neural net to tackle the prediction problem...

## Work in Progress, Come Back Soon ✨

**Competition Description...**

The goal of this competition is to understand the relationship between code and comments in Python notebooks. You are challenged to reconstruct the order of markdown cells in a given notebook based on the order of the code cells, demonstrating comprehension of which natural language references which code.

In [None]:
%%time
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from itertools import islice

examples = 15  # maximum number of input file paths to list

# Flatten the directory walk into one lazy stream of file paths so the limit
# applies across all directories; a `break` inside nested loops would only
# leave the innermost loop and the walk would carry on with the next folder.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

counter = 0  # number of paths printed; stays 0 when nothing is found
for counter, filepath in enumerate(islice(input_files, examples), start=1):
    print(filepath)

print('')
print('Done...')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
# Import the required libraries...
import json
from pathlib import Path

from scipy import sparse
from tqdm import tqdm

# Widen pandas' printed output so long cell sources stay readable when frames are displayed
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Folder that the train notebooks, train_orders.csv and train_ancestors.csv are read from
data_dir = Path('../input/AI4Code')

In [None]:
%%time
NUM_TRAIN = 10_000

# Define a function to read a json file...
def read_notebook(path):
    """
    Read a notebook json file and import it to a dataframe...
    Args:
        path (str or pathlib.Path): The filepath location to be loaded into
            the dataframe. The file name without its extension becomes the
            value of the 'id' column.
    Returns:
        df (dataframe): A dataframe with the json information, indexed by
            'cell_id', with 'cell_type' read as category and 'source' as str,
            plus the added 'id' column.
    """
    # Coerce to Path so plain string paths work too: `.stem` is a Path attribute.
    path = Path(path)

    types = {'cell_type': 'category', 'source': 'str'}
    df = pd.read_json(path, dtype=types).assign(id=path.stem).rename_axis('cell_id')
    return df

# Create a list of the locations of the training files...
# glob() yields entries in filesystem-dependent order, so sort before truncating
# to make the NUM_TRAIN subset the same on every run and machine.
paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]

# Create a list of dataframes from json files...
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')]

In [None]:
%%time
# Stack the per-notebook frames into one frame indexed by (id, cell_id)...
trn_data = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)  # index levels are now (cell_id, id)
    .swaplevel()                   # reorder the levels to (id, cell_id)
    .sort_index(level='id', sort_remaining=False)  # sort on the notebook id level only
)

In [None]:
%%time
# Display the first ten rows of data...

trn_data.head(10)

In [None]:
%%time
# Display one example notebook as it is stored (cells out of order)...

# Get an example notebook
nb_id = trn_data.index.unique('id')[6]  # the seventh notebook id in the index
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = trn_data.loc[nb_id, :]  # every cell of that notebook, in the order stored in trn_data
display(nb)
print()

In [None]:
%%time
# Preview the raw orders file; the path is built from data_dir like every
# other read in this notebook instead of a second hard-coded location.
df_orders = pd.read_csv(data_dir / 'train_orders.csv')
df_orders.head()

In [None]:
%%time
# Read the orders indexed by notebook id and reduce the single remaining column
# to a Series. The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4
# and removed in 2.0; squeezing after the read works on every version.
df_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)
df_orders.head()

In [None]:
%%time
# Get the correct order
cell_order = df_orders.loc[nb_id]  # the example notebook's list of cell_ids, taken from the orders file

print("The ordered notebook:")
nb.loc[cell_order, :]  # select the example notebook's rows in that order

In [None]:
%%time
#...
print(cell_order)

In [None]:
%%time
#...
def get_ranks(base, derived):
    """
    Look up the position in `base` of every element of `derived`...
    Args:
        base (iterable): Reference ordering; its elements must be hashable.
            If an element occurs more than once, its first position is used.
        derived (iterable): Elements whose positions in `base` are wanted.
    Returns:
        ranks (list of int): Zero-based positions in `base`, one per element
            of `derived`, in the order of `derived`.
    Raises:
        ValueError: If an element of `derived` does not occur in `base`.
    """
    # Build the position table once so each lookup is O(1) instead of a linear
    # scan; setdefault keeps the first occurrence, as list.index would.
    first_position = {}
    for position, item in enumerate(base):
        first_position.setdefault(item, position)
    try:
        return [first_position[item] for item in derived]
    except KeyError as err:
        # Keep the exception type that list.index raises for a missing item.
        raise ValueError(f"{err.args[0]!r} is not in list") from None

# Position in the correct order of every cell of the example notebook, listed in the notebook's stored row order
cell_ranks = get_ranks(cell_order, list(nb.index))
nb.insert(0, 'rank', cell_ranks)  # adds 'rank' as the first column, in place; NOTE(review): re-running this cell should fail because the column then already exists
nb

In [None]:
%%time
# Convert the df_orders to a dataframe...
# From here on df_orders is a one-column DataFrame, still indexed by notebook id.
df_orders = df_orders.to_frame()
df_orders.head()

In [None]:
%%time
# Collect, for every notebook id, the list of its cell_ids in the order they are stored in trn_data...
trn_data_grouped = trn_data.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
trn_data_grouped.head()

In [None]:
%%time
# Right join on the notebook id: keep only the notebooks present in trn_data_grouped and put
# the cell order from df_orders next to the cell_ids as they are stored in trn_data
df_orders_ = df_orders.join(trn_data_grouped, how = 'right')
df_orders_.head()

In [None]:
%%time
# Map every notebook id to its stored cell_ids and their ranks in the correct order...
# A comprehension keeps the iteration variables local, so the notebook-level
# `cell_order` of the example notebook is not overwritten by this cell.
ranks = {
    nb_key: {'cell_id': stored_ids, 'rank': get_ranks(ordered_ids, stored_ids)}
    for nb_key, ordered_ids, stored_ids in df_orders_.itertuples()
}

In [None]:
%%time
# Turn the {id: {'cell_id': [...], 'rank': [...]}} dict into a frame with one row per notebook,
# explode both list columns so every cell gets its own row, then index by (id, cell_id)
df_ranks = pd.DataFrame.from_dict(ranks, orient = 'index').rename_axis('id').apply(pd.Series.explode).set_index('cell_id', append = True)
df_ranks.head()

In [None]:
%%time
# Load the ancestors table indexed by notebook id; its 'ancestor_id' column groups the train/validation split
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors

In [None]:
%%time
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

# Every notebook id, together with the ancestor_id recorded for it
ids = trn_data.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']

# One grouped split: notebooks with a common ancestor_id stay on the same side
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
train_positions, valid_positions = next(splitter.split(ids, groups=ancestors))

# split() yields integer positions; translate them back into notebook ids
ids_train = ids[train_positions]
ids_valid = ids[valid_positions]

df_train = trn_data.loc[ids_train, :]
df_valid = trn_data.loc[ids_valid, :]

In [None]:
df_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training features: TF-IDF of every cell's source, with min_df=0.01 as the document-frequency cut-off
tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train['source'].astype(str))

# Training targets, taken from the rank table restricted to the training notebooks
train_ranks = df_ranks.loc[ids_train]
y_train = train_ranks.to_numpy()  # rank of each cell within the notebook
groups = train_ranks.groupby('id').size().to_numpy()  # number of cells in each notebook

In [None]:
tfidf.get_feature_names_out()

In [None]:
X_train.shape

In [None]:
# Add code cell ordering as one extra feature column:
# code cells get their 1-based position among the code cells of their notebook, all other cells get 0
is_code = df_train['cell_type'] == 'code'
position_within_type = df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
code_position = np.where(is_code, position_within_type, 0).reshape(-1, 1)

X_train = sparse.hstack((X_train, code_position))
print(X_train.shape)

In [None]:
y_train

In [None]:
from xgboost import XGBRanker

# Fit a ranking model on the cell features; `group` passes the number of cells per notebook
# so that cells are only ranked against cells of the same notebook
model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

In [None]:
# Validation set
X_valid = tfidf.transform(df_valid['source'].astype(str))
# The metric uses cell ids. df_orders is a one-column DataFrame at this point,
# and iterating a DataFrame yields its column labels rather than its rows, so
# reduce the selection to a Series holding one cell_id list per notebook.
y_valid = df_orders.loc[ids_valid].squeeze('columns')

# Same extra feature as for training: 1-based position among the notebook's
# code cells for code cells, 0 for every other cell.
X_valid = sparse.hstack((
    X_valid,
    np.where(
        df_valid['cell_type'] == 'code',
        df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [None]:
# Score every validation cell, then turn the scores into one predicted cell order per notebook
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])  # Sort the cells in each notebook by their rank.
                                  # The cell_ids are now in the order the model predicted.
    .reset_index('cell_id')  # Convert the cell_id index into a column.
    .groupby('id')['cell_id'].apply(list)  # Group the cell_ids for each notebook into a list.
)
y_pred.head(10)

In [None]:
nb_id = df_valid.index.get_level_values('id').unique()[8]  # the ninth notebook id of the validation set

display(trn_data.loc[nb_id])  # the notebook's cells as stored
display(trn_data.loc[nb_id].loc[y_pred.loc[nb_id]])  # the same cells in the predicted order

In [None]:
from bisect import bisect


def count_inversions(a):
    """
    Count the pairs of elements of `a` that appear out of order...
    Args:
        a (iterable): Mutually comparable values, e.g. a list of ranks.
    Returns:
        total (int): Number of pairs where the earlier element is strictly
            greater than the later one.
    """
    seen = []  # values visited so far, kept in ascending order
    total = 0
    for value in a:
        # Everything right of the insertion point is strictly greater than
        # `value` and was visited earlier, so each such element is one inversion.
        slot = bisect(seen, value)
        total += len(seen) - slot
        seen.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """
    Compute a Kendall tau score pooled over several notebooks...
    Args:
        ground_truth (iterable of list): The correct cell_id order, one list
            per notebook. The cell_ids must be hashable.
        predictions (iterable of list): The predicted cell_id order, one list
            per notebook, paired positionally with `ground_truth`.
    Returns:
        score (float): 1 - 4 * inversions / sum(n * (n - 1)) over all
            notebooks. 1.0 is returned when there is no pair of cells to
            compare (no notebooks, or only notebooks with fewer than two cells).
    Raises:
        ValueError: If a predicted cell_id does not occur in the matching
            ground-truth list.
    """
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Position table built once per notebook: O(1) per lookup instead of a
        # linear gt.index() scan; setdefault keeps the first occurrence.
        position = {}
        for idx, cell_id in enumerate(gt):
            position.setdefault(cell_id, idx)
        try:
            ranks = [position[x] for x in pred]  # rank predicted order in terms of ground truth
        except KeyError as err:
            # Same exception type that gt.index() raises for a missing id.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    if total_2max == 0:
        # No comparable pairs, so nothing can be out of order; avoid dividing by zero.
        return 1.0
    return 1 - 4 * total_inversions / total_2max

In [None]:
# Baseline score: use the cells in the order they are stored in df_valid as the "prediction" (no model involved)
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)