# Imports

In [None]:
import pandas as pd
import numpy as np
import json
import time
import gc
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from glob import glob
from plotly.subplots import make_subplots

In [None]:
# Root folder of the competition data; the train/test folders hang off it.
path = "../input/AI4Code/"
train_path, test_path = path + 'train/', path + 'test/'

# Read the three CSV tables in the same order as before:
# orders (target), sample submission, then notebook ancestry.
train_orders, sample_submission, train_ancestors = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_orders.csv', "sample_submission.csv", 'train_ancestors.csv')
)

<p>Let's see what we have in train_orders.csv file</p>

In [None]:
# Print the shape and preview the first rows of each table,
# with a blank-line separator between the two tables.
tables = (train_orders, sample_submission)
for position, table in enumerate(tables):
    if position:
        print("\n\n")
    print(table.shape)
    display(table.head())

<p>As we can see, the train data contains 139256 notebooks and the test data contains 4 notebooks. train_orders.csv has 2 columns: an "id" column and a "cell_order" column.</p>
<p>The id column contains the id of a notebook, which is available in .json format. The cell_order column contains the order of the cells in that notebook, which is the target column.</p>

In [None]:
# Full path of each notebook's JSON file: <train_path><id>.json
train_orders["path"] = [train_path + notebook_id + ".json" for notebook_id in train_orders['id']]

In [None]:
# Rebuild the first training notebook as a DataFrame so we can compare the
# order of cells stored in the JSON file with the true order from cell_order.
target = train_orders["cell_order"].values[0]
target = target.split(" ")
temp_path = train_orders['path'].values[0]

# Read the notebook explicitly as UTF-8 (the conventional JSON encoding) so the
# result does not depend on the platform's default locale encoding.
with open(temp_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Materialize the ids once and build every column from that single list, so all
# columns are guaranteed to be aligned and no dict views are stored in the frame.
cell_ids = list(data['cell_type'])
temp_df = pd.DataFrame({
    "cell_id": cell_ids,
    "cell_type": [data["cell_type"][cell_id] for cell_id in cell_ids],
    "source": [data['source'][cell_id] for cell_id in cell_ids],
    # position of the cell in the true (target) ordering
    "order": [target.index(cell_id) for cell_id in cell_ids],
})
temp_df

<p>If we look at the order of the code cells, we see that the code cells are already in order but the markdown cells are shuffled.</p>
<p>This means a markdown cell can come at the start of the notebook or at the end of the notebook, it can come between two code cells, or it can come next to another markdown cell too.</p>

In [None]:
def create_df(path,cell_order):
    """
    Load one notebook JSON file and return its cells sorted in the true order.

    path: path of the notebook .json file; it must contain "cell_type" and
          "source" mappings keyed by cell id
    cell_order: space-separated string of cell ids in their correct order
    return: DataFrame with cell_id, cell_type, source and order columns,
            sorted by order
    raises: ValueError if a cell id from the file is absent from cell_order
    """
    # Position of every cell id in the true order, computed in a single pass.
    # setdefault keeps the first occurrence, which is what list.index returns.
    position_of = {}
    for position, cell_id in enumerate(cell_order.split(" ")):
        position_of.setdefault(cell_id, position)

    # Decode explicitly as UTF-8 (the conventional JSON encoding) instead of
    # relying on the platform's default locale encoding.
    with open(path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    cell_types = data["cell_type"]
    sources = data["source"]
    cell_ids = list(cell_types)

    # Keep the exception type a list.index lookup would raise for an unknown id.
    for cell_id in cell_ids:
        if cell_id not in position_of:
            raise ValueError(f"{cell_id!r} is not in cell_order")

    df = pd.DataFrame({
        "cell_id": cell_ids,
        "cell_type": [cell_types[cell_id] for cell_id in cell_ids],
        "source": [sources[cell_id] for cell_id in cell_ids],
        "order": [position_of[cell_id] for cell_id in cell_ids],
    })
    return df.sort_values("order")

# Build one cell-level DataFrame per notebook (row-wise apply over train_orders)
# and store it in the 'metadata' column. Every notebook JSON file is read here,
# so the wall-clock time of this step is measured.
start = time.time()
train_orders['metadata'] =  train_orders.apply(lambda x: create_df(x['path'],x['cell_order']),1)
end = time.time()
# Elapsed seconds for the step above; uncomment the print to show it.
duration = end - start
#print(duration)

In [None]:
# The file paths are no longer needed once every notebook has been loaded;
# remove the column in place and ask the garbage collector to reclaim memory.
train_orders.drop(columns=["path"], inplace=True)
gc.collect()

In [None]:
# Preview the first rows; each 'metadata' entry holds that notebook's cell DataFrame.
train_orders.head()

In [None]:
def c_next_to_c(df):
    """
    Count adjacent (code, code) pairs in a notebook.

    df: notebook in DataFrame form with a cell_type column, rows already
        sorted by the true cell order
    return: number of positions where a code cell is immediately followed
            by another code cell
    """
    kinds = df['cell_type'].values
    # Walk over every (cell, following cell) pair.
    return sum(
        1
        for current, following in zip(kinds[:-1], kinds[1:])
        if current == "code" and current == following
    )
def m_next_to_m(df):
    """
    Count adjacent (markdown, markdown) pairs in a notebook.

    df: notebook in DataFrame form with a cell_type column, rows already
        sorted by the true cell order
    return: number of positions where a markdown cell is immediately
            followed by another markdown cell
    """
    total = 0
    previous_is_markdown = False
    for kind in df['cell_type'].values:
        is_markdown = kind == "markdown"
        # A pair is counted when both this cell and the one before it are markdown.
        if previous_is_markdown and is_markdown:
            total += 1
        previous_is_markdown = is_markdown
    return total
    
def c_next_to_m(df):
    """
    Count adjacent (markdown, code) pairs in a notebook.

    df: notebook in DataFrame form with a cell_type column, rows already
        sorted by the true cell order
    return: number of positions where a markdown cell is immediately
            followed by a code cell
    """
    kinds = df['cell_type'].values
    hits = 0
    # Start at the second cell and look back at the one before it.
    for position in range(1, len(kinds)):
        if kinds[position - 1] == "markdown" and kinds[position] == "code":
            hits += 1
    return hits
def m_next_to_c(df):
    """
    Count adjacent (code, markdown) pairs in a notebook.

    df: notebook in DataFrame form with a cell_type column, rows already
        sorted by the true cell order
    return: number of positions where a code cell is immediately followed
            by a markdown cell
    """
    kinds = df['cell_type'].values
    matching_pairs = [
        (current, following)
        for current, following in zip(kinds[:-1], kinds[1:])
        if current == "code" and following == "markdown"
    ]
    return len(matching_pairs)




In [None]:
# Per-notebook features derived from the cell DataFrames stored in 'metadata'.
notebooks = train_orders['metadata']

# Number of cells of each type.
train_orders["total_code_cell"] = notebooks.apply(lambda nb: (nb['cell_type'] == 'code').sum())
train_orders["total_markdown_cell"] = notebooks.apply(lambda nb: (nb['cell_type'] == 'markdown').sum())

# Raw counts of each kind of adjacent cell pair.
pair_columns = ["c_next_to_c", "m_next_to_m", "c_next_to_m", "m_next_to_c"]
pair_counters = [c_next_to_c, m_next_to_m, c_next_to_m, m_next_to_c]
for pair_column, pair_counter in zip(pair_columns, pair_counters):
    train_orders[pair_column] = notebooks.apply(pair_counter)

# Total number of adjacent pairs, used to normalize the four counts.
train_orders["total_cell_next_to_cell"] = train_orders[pair_columns].sum(axis=1)

# Share of each pair kind within the notebook (the n_* columns).
# NOTE(review): a notebook with a single cell has no adjacent pairs, so its
# shares would presumably come out as NaN (0 / 0) - confirm none exist.
for pair_column in pair_columns:
    train_orders["n_" + pair_column] = train_orders[pair_column] / train_orders["total_cell_next_to_cell"]

# 0/1 flags for the type of the first and of the last cell.
train_orders["startswith_code"] = notebooks.apply(lambda nb: int(nb['cell_type'].iloc[0] == "code"))
train_orders["startswith_markdown"] = notebooks.apply(lambda nb: int(nb['cell_type'].iloc[0] == "markdown"))

train_orders["endswith_code"] = notebooks.apply(lambda nb: int(nb['cell_type'].iloc[-1] == "code"))
train_orders["endswith_markdown"] = notebooks.apply(lambda nb: int(nb['cell_type'].iloc[-1] == "markdown"))

In [None]:
# Total cells per notebook = code cells + markdown cells.
train_orders["total_num_cell"] = train_orders['total_code_cell'].add(train_orders['total_markdown_cell'])


In [None]:
# Preview the table again, now including the per-notebook feature columns.
train_orders.head()

# EDA

In [None]:
# Pie chart of the total number of code vs markdown cells over all train notebooks.
labels = ['code_cells', "markdown_cells"]
count = [train_orders[column].sum() for column in ("total_code_cell", "total_markdown_cell")]
cell_type_pie = go.Pie(labels=labels, values=count, pull=[0, 0.2])
fig = go.Figure(data=[cell_type_pie])
fig.show()

<p>We can see in the above figure that 2/3 of the cells in the train data are code cells and 1/3 are markdown cells. But this is not enough, so let's look at some more details.</p>

In [None]:
total_kernels = 100
n_notebooks = len(train_orders)


def _percent(flags, denominator):
    """Percentage of rows whose 0/1 flag is set, relative to denominator."""
    return sum(flags.values) * 100 / denominator


# Share of all notebooks that start / end with each cell type.
startswith_code = _percent(train_orders['startswith_code'], n_notebooks)
startswith_markdown = _percent(train_orders['startswith_markdown'], n_notebooks)
endswith_code = _percent(train_orders['endswith_code'], n_notebooks)
endswith_markdown = _percent(train_orders['endswith_markdown'], n_notebooks)

# Ending type *within* each starting type: these four values are percentages of
# the code-first (or markdown-first) notebooks, not of all notebooks.
code_first = train_orders[train_orders["startswith_code"] == 1]
markdown_first = train_orders[train_orders["startswith_markdown"] == 1]
sc_ec = _percent(code_first['endswith_code'], len(code_first))
sc_em = _percent(code_first['endswith_markdown'], len(code_first))
sm_ec = _percent(markdown_first['endswith_code'], len(markdown_first))
sm_em = _percent(markdown_first['endswith_markdown'], len(markdown_first))

labels1 = ['code','markdown']
values1 = [startswith_code, startswith_markdown]

labels2 = ["code","markdown"]
values2 = [endswith_code, endswith_markdown]

# labels3 / values3 are prepared here and plotted by a later cell.
labels3 = ["startswith_code_endswith_code","startswith_code_endswith_markdown", "startswith_markdown_endswith_markdown","startswith_markdown_endswith_code"]
values3 = [sc_ec, sc_em, sm_em, sm_ec]

# Two pies side by side: first-cell type on the left, last-cell type on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    start_cell="top-left",
    specs=[[{"type":"pie"},{"type":"pie"}]],
    subplot_titles=["startswith","endswith"],
)
startswith_pie = go.Pie(labels=labels1, values=values1, pull=[0.02,0.02])
endswith_pie = go.Pie(labels=labels2, values=values2, pull=[0.02,0.02])
fig.add_trace(startswith_pie, row=1, col=1)
fig.add_trace(endswith_pie, row=1, col=2)
fig.update_layout(width=800,height=600)
fig.show()

<p>The left figure tells us that 2/3 of the train notebooks start with a markdown cell and 1/3 of the train notebooks start with a code cell. In the right figure we can see that 60.4% of notebooks end with a code cell and 39.6% of notebooks end with a markdown cell.</p>

In [None]:
# Single pie (left slot of a 1x2 grid, right slot left empty) showing the four
# start/end combinations prepared in labels3 / values3.
fig = make_subplots(
    rows=1,
    cols=2,
    start_cell="top-left",
    specs=[[{"type": "pie"}, None]],
    subplot_titles=["distribution"],
)
combination_pie = go.Pie(labels=labels3, values=values3, pull=[0.02,0.02,0.02,0.02])
fig.add_trace(combination_pie, row=1, col=1)
# The figure returned by update_layout is the cell's last expression, so it is displayed.
fig.update_layout(width=800,height=600)

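<p>In the chart, 35.7% is notebooks that start with a code cell and end with a code cell, and 14.3% is notebooks that start with a code cell and end with a markdown cell. 27.3% is notebooks that start with a markdown cell and end with a code cell, and 22.7% is notebooks that start with a markdown cell and end with a markdown cell. Note that each plotted value is a percentage within its starting cell type (for example, the share of code-first notebooks that end with code), so the two slices of each starting type add up to 50% of the pie rather than to that type's share of all notebooks.</p>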

In [None]:
# Average, over all notebooks, of each normalized adjacency share (n_* columns).
# The averages are stored under avg_* names on purpose: binding them to
# c_next_to_c / c_next_to_m / m_next_to_c / m_next_to_m would overwrite the
# counting functions of the same names, and the feature cell that calls those
# functions would then fail when re-run.
# Series.mean() skips NaN by default, so a notebook with an undefined share
# does not turn the whole average into NaN.
avg_c_next_to_c = train_orders["n_c_next_to_c"].mean()
avg_c_next_to_m = train_orders["n_c_next_to_m"].mean()
avg_m_next_to_c = train_orders["n_m_next_to_c"].mean()
avg_m_next_to_m = train_orders["n_m_next_to_m"].mean()

labels = ['c_next_to_c', "c_next_to_m","m_next_to_c","m_next_to_m"]
count = [avg_c_next_to_c, avg_c_next_to_m, avg_m_next_to_c, avg_m_next_to_m]
fig = go.Figure(data=[go.Pie(labels=labels, values=count, pull=[0.02, 0.02, 0.02, 0.02])])
fig.show()

<p>We can see that, on average:</p>
<p>42.5% of the time in notebooks there is a code cell next to a code cell.</p>
<p>26% of the time there is a code cell next to a markdown cell.</p>
<p>24.4% of the time there is a markdown cell next to a code cell.</p>
<p>7.08% of the time there is a markdown cell next to a markdown cell.</p>

In [None]:

# Stacked bars of code vs markdown cell counts for the first 100 notebooks.
ids = train_orders['id'].values
shown_ids = ids[:100]
code_counts = train_orders['total_code_cell'].values[:100].tolist()
markdown_counts = train_orders['total_markdown_cell'].values[:100].tolist()

fig = go.Figure(data=[
    go.Bar(name="num_code_cells", x=shown_ids, y=code_counts),
    go.Bar(name="num_markdown_cells", x=shown_ids, y=markdown_counts),
])
fig.update_layout(width=800,height=500,barmode='stack')
fig.show()

In [None]:
# Summary statistics of the feature columns.
train_orders.describe()

In [None]:
# For each per-notebook size (all cells, code cells, markdown cells) count how
# many notebooks have that size.
count1 = train_orders['total_num_cell'].value_counts()
count2 = train_orders['total_code_cell'].value_counts()
count3 = train_orders['total_markdown_cell'].value_counts()

# One (size, count) table per distribution, sorted by size so that the line
# plots are drawn left to right.
df1 = pd.DataFrame({"size": count1.index.to_numpy(), "count": count1.values}).sort_values("size")
df2 = pd.DataFrame({"size": count2.index.to_numpy(), "count": count2.values}).sort_values("size")
df3 = pd.DataFrame({"size": count3.index.to_numpy(), "count": count3.values}).sort_values("size")

# x = size, y = number of notebooks with that size.
x1 = df1["size"].values
y1 = df1["count"].values

x2 = df2["size"].values
# The code-cell curve must take its counts from df2. Reading them from df3
# would draw the markdown-cell counts against code-cell sizes and make the
# code and markdown curves look identical.
y2 = df2["count"].values

x3 = df3["size"].values
y3 = df3["count"].values

#plt.plot(x1,y1)

### Cell Distributions

In [None]:
# 2x2 grid: total cells (top-left), code cells (bottom-left),
# markdown cells (bottom-right); the top-right panel stays empty.
fig = make_subplots(
    rows=2,
    cols=2,
    start_cell="top-left",
    subplot_titles=["total_num_cell","","num_code","num_markdown"],
)

# (x values, y values, legend name, grid row, grid column) for every curve.
panels = [
    (x1, y1, 'total_num_cell', 1, 1),
    (x2, y2, 'num_code_cell', 2, 1),
    (x3, y3, 'num_markdown_cell', 2, 2),
]
for xs, ys, trace_name, grid_row, grid_col in panels:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode="lines", name=trace_name),
        row=grid_row,
        col=grid_col,
    )

fig.update_layout(width=800,height=800)
fig.show()

<p>In the top left we can see the distribution of total_num_cells in the train data. There is only one notebook which contains 1005 cells, which is the highest number of cells per notebook, and there are 392 notebooks which contain only 2 cells per notebook, which is the lowest number of cells per notebook.</p>
<p>90% of notebooks have fewer than 100 cells per notebook and only 10% of notebooks have more than 100 cells.</p>
<p>The bottom left figure is the distribution of the number of code cells in the train data; we can see ~11k notebooks contain only 1 code cell. The bottom right figure is the distribution of the number of markdown cells in the train data; here too we can see that ~11k notebooks contain only one markdown cell.</p>
<p>Don't you think the distribution of the number of code cells in the train data is the same as that of the number of markdown cells? So let's plot all three distributions in one plot.</p>

In [None]:
# Overlay the three size distributions in a single figure for comparison.
fig = go.Figure()
curves = (
    (x1, y1, 'total_num_cell'),
    (x2, y2, 'num_code_cell'),
    (x3, y3, 'num_markdown_cell'),
)
for xs, ys, trace_name in curves:
    fig.add_trace(go.Scatter(x=xs, y=ys, mode="lines", name=trace_name))
fig.show()

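<p>Boom 🔥🔥🔥</p>
<p>We can see that the distribution line of `number of markdown cell` lies perfectly on the distribution line of `number of code cell` in the train data. (Caveat: before relying on this observation, verify that each curve is drawn from its own counts, since a perfect overlap of two different distributions is suspicious.)</p>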