In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os

# Read the raw python files

In [4]:
# Start from an empty dataframe
df = pd.DataFrame()

# Retrieves a list of filepaths of all ipynb files (pre-converted into json by simply renaming their file extensions from .ipynb to .json)
all_file_paths = glob.glob('../data/sliced-notebooks-full-new/**/*.py',recursive=True)

for file_path in all_file_paths[0:10]:
    # Open the file
    with open(file_path, 'r') as file:
        data = file.read()
#         print(data)
    
    
        temp = pd.DataFrame({"source": data,
                             "filename":  os.path.splitext(os.path.basename(file_path))[0], # the filename is the filename
                             "competition":os.path.basename(os.path.dirname(file_path)) # the competition name is the directory name
                            },
                           index=[0])
        temp['name'] = temp['filename'].str.split("_").str[0].astype(int)
        temp['cell'] = temp['filename'].str.split("_").str[1].astype(int)

         

        df = df.append(temp)

In [5]:
print(df)

                                              source     filename  \
0  import numpy as np \nimport pandas as pd\nimpo...  1805310_100   
0  from subprocess import check_output\nprint(che...    2005202_0   
0  import numpy as np # linear algebra\nimport pa...   1772437_44   
0  import pandas as pd # data processing, CSV fil...    1632422_1   
0  import pandas as pd\ndir_1st_result = "../inpu...   1998847_19   
0  from datetime import date, timedelta\nt2017 = ...   45370411_2   
0  import numpy as np # linear algebra\nimport pa...    2042217_5   
0  import numpy as np \nimport pandas as pd\nimpo...   1805310_66   
0  import pandas as pd\ndf_items     = pd.read_cs...    1704082_7   
0  import pandas as pd\nitems = pd.read_csv('../i...    2252920_3   

                          competition      name  cell  
0  favorita-grocery-sales-forecasting   1805310   100  
0  favorita-grocery-sales-forecasting   2005202     0  
0  favorita-grocery-sales-forecasting   1772437    44  
0  favorita-groc

In [14]:
df.reset_index(drop=True,inplace=True)
df.sort_values(by=["name", "competition", "cell"], ascending=True,inplace=True)
df.to_csv("data/all-notebooks-sliced.csv")

# Read the usage CSVs

In [8]:
# Start from an empty dataframe
df = pd.DataFrame()

# Retrieves a list of filepaths of all ipynb files (pre-converted into json by simply renaming their file extensions from .ipynb to .json)
all_file_paths = glob.glob('../data/sliced-notebooks-full-new/**/*.csv',recursive=True)

# filter "schema.csv" in each directory
all_file_paths = [x for x in all_file_paths if ("schema" not in x)]

for file_path in all_file_paths[0:10]:
    # Open the file
    temp = pd.read_csv(file_path)
    temp["filename"] =  os.path.splitext(os.path.basename(file_path))[0]
    temp["competition"] = os.path.basename(os.path.dirname(file_path))
    temp['name'] = temp['filename'].str.split("_").str[0].astype(int)
    temp['cell'] = temp['filename'].str.split("_").str[1].astype(int)
    

    df = df.append(temp)
print(df)

   CELL                                              USAGE    filename  \
0     2      __builtins__, print, subprocess, check_output   1766093_0   
0    11                                   pandas, read_csv  1805310_19   
1    20              pandas, read_csv, __builtins__, print  1805310_19   
0    27                                       pandas, info  3621215_32   
1    28                                       pandas, head  3621215_32   
2    60  matplotlib.pyplot, figure, matplotlib.pyplot, ...  3621215_32   
3    95      __builtins__, plot_store_transactions_cluster  3621215_32   
4   101                 pandas, groupby, __builtins__, zip  3621215_32   
5   126  __builtins__, len, __builtins__, range, matplo...  3621215_32   
0     3                                   pandas, read_csv   1747966_4   
1     4                            matplotlib.pyplot, hist   1747966_4   
0     4                     pandas, read_csv, pandas, copy   2287641_4   
1    11  pandas, read_csv, pandas, cop

In [11]:
df.reset_index(drop=True,inplace=True)
df.sort_values(by=["name", "competition", "cell"], ascending=True,inplace=True)
df.to_csv("data/all-notebooks-sliced-usage.csv")

# Properly format the usage column

In [14]:
df_test = df
def combinePairs(x):
    l = x.replace(" ", "").split(",")
    return ",".join([i+"."+j for i,j in zip(l[::2], l[1::2])])

df_test["USAGE_CLEAN"] = df_usage['USAGE'].apply(combinePairs)

In [17]:
from collections import Counter
Counter(",".join(df_test["USAGE_CLEAN"]).split(",")).most_common(100)

[('__builtins__.print', 293204),
 ('pandas.read_csv', 249429),
 ('__builtins__.len', 177986),
 ('__builtins__.range', 149204),
 ('__builtins__.list', 77600),
 ('numpy.array', 66301),
 ('pandas.concat', 65324),
 ('__builtins__.enumerate', 65161),
 ('pandas.head', 58768),
 ('__builtins__.str', 58017),
 ('matplotlib.pyplot.subplots', 55543),
 ('__builtins__.int', 47410),
 ('numpy.zeros', 47164),
 ('pandas.merge', 44773),
 ('pandas.drop', 43954),
 ('matplotlib.pyplot.show', 43732),
 ('sklearn.model_selection.train_test_split', 42527),
 ('pandas.groupby', 35892),
 ('matplotlib.pyplot.plot', 35000),
 ('matplotlib.pyplot.title', 32536),
 ('numpy.arange', 31841),
 ('pandas.to_datetime', 28151),
 ('matplotlib.pyplot.figure', 27968),
 ('__builtins__.zip', 24537),
 ('matplotlib.pyplot.subplot', 22531),
 ('__builtins__.dict', 21509),
 ('__builtins__.set', 21284),
 ('numpy.mean', 20874),
 ('sklearn.calibration.CalibratedClassifierCV', 20435),
 ('matplotlib.pyplot.xlabel', 19358),
 ('matplotlib.pypl

In [18]:
df_test.to_csv("../data/all-notebooks-sliced-usage.csv")