In [36]:
%pip install docker pymongo pandas

You should consider upgrading via the '/home/david/.pyenv/versions/3.8.6/envs/python-data-prep/bin/python3.8 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


Set up MongoDB and import data

In [37]:
import os
import docker

# Start (or reuse) a throw-away MongoDB container for this notebook.
docker_client = docker.from_env()
MONGO_PORT = 27018  # host port, mapped below to the container's 27017/tcp
MONGO_CONTAINER_NAME = 'mongo-data-prep'

try:
    # Re-running this cell must not fail with a name conflict: if a container
    # with this name already exists, reuse it instead of creating a new one.
    mongo_container = docker_client.containers.get(MONGO_CONTAINER_NAME)
    if mongo_container.status != 'running':
        mongo_container.start()
except docker.errors.NotFound:
    mongo_container = docker_client.containers.run(
        'mongo:latest',
        detach=True,              # return immediately with a Container handle
        name=MONGO_CONTAINER_NAME,
        remove=True,              # container is deleted once it stops
        ports={'27017/tcp': MONGO_PORT},
        mem_limit='2G',
        volumes={
            # Database files live in ./db (relative to the current working
            # directory), bind-mounted read-write at /data/db.
            os.path.join(os.getcwd(), 'db'): {'bind': '/data/db', 'mode': 'rw'},
        },
    )

In [38]:
from pymongo import MongoClient
# Connect to MongoDB on the local host port MONGO_PORT and select the
# 'data-prep' database used by every cell below.
mongo_client = MongoClient(host='127.0.0.1', port=MONGO_PORT)
db = mongo_client.get_database('data-prep')

In [39]:
# Handle on the 'sales-small' collection; drop it first so the import that
# follows starts from an empty collection.
sales_small_col = db['sales-small']
sales_small_col.drop()

In [40]:
from pathlib import Path
import json
# NOTE(review): absolute, machine-specific path -- consider making it relative
# to a configurable data directory so the notebook runs elsewhere.
sales_small_path = Path('/home/david/dev/toucan-toco/python-data-prep/data/sales-small.json')

# Parse the JSON export; the documents to insert are stored under its 'data' key.
sales_small_data = json.loads(sales_small_path.read_text())
sales_small_col.insert_many(sales_small_data['data'])

<pymongo.results.InsertManyResult at 0x7fda33925380>

In [41]:
import pandas as pd

1. Simulating a simple weaverbird pipeline with just a filter, on a very small domain:

In [42]:
'''
{
    "domain": "sales-small",
    "name": "domain"
},
'''
def get_domain(domain):
    """Return every document of the MongoDB collection `domain` as a DataFrame.

    Uses the notebook-level `db` handle. The empty query `{}` matches all
    documents, so the whole collection is loaded into memory.
    """
    cursor = db[domain].find({})
    return pd.DataFrame(cursor)
# "domain" step: load the whole 'sales-small' collection into a DataFrame.
df = get_domain('sales-small')

In [43]:
'''
{
    "name": "filter",
    "condition": {
      "column": "Payment_Type",
      "operator": "eq",
      "value": "Mastercard"
    }
}
'''
def filter(df, column, value):
    """Return the rows of `df` whose `column` equals `value`.

    Implements the "eq" filter condition described above this function.
    NOTE: the name shadows Python's builtin `filter` for the rest of the
    notebook; it is kept because later cells call it by this name.
    """
    matches = df[column] == value  # boolean mask, one entry per row
    return df.loc[matches]
# "filter" step: keep only the Mastercard rows; `df` itself is left untouched.
df1 = filter(df, 'Payment_Type', 'Mastercard')

In [45]:
%%time

'''
Whole pipeline
'''

# Step 1 (domain): reload the collection so the timing includes the MongoDB fetch.
df = get_domain('sales-small')
# Step 2 (filter): last expression of the cell, so the filtered frame is displayed.
filter(df, 'Payment_Type', 'Mastercard')

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 10.6 ms


Unnamed: 0,_id,Transaction_date,Product,Price,Payment_Type,Name,City,State,Country,Account_Created,Last_Login,Latitude,Longitude
1,5f90516bacd166c1735d9249,1/5/09 4:10,Product1,1200,Mastercard,Nicola,Roodepoort,Gauteng,South Africa,1/5/09 2:33,1/7/09 5:13,-26.166667,27.866667
2,5f90516bacd166c1735d924a,1/4/09 13:17,Product1,1200,Mastercard,Renee Elisabeth,Tel Aviv,Tel Aviv,Israel,1/4/09 13:03,1/4/09 22:10,32.066667,34.766667
3,5f90516bacd166c1735d924b,1/2/09 6:17,Product1,1200,Mastercard,carolina,Basildon,England,United Kingdom,1/2/09 6:00,1/2/09 6:08,51.5,-1.116667
6,5f90516bacd166c1735d924e,1/5/09 8:58,Product2,3600,Mastercard,Marcia,Telgte,Nordrhein-Westfalen,Germany,9/1/08 3:39,1/14/09 2:07,52.333333,7.9
9,5f90516bacd166c1735d9251,1/15/09 12:54,Product1,1200,Mastercard,Annelies,Ile-Perrot,Quebec,Canada,1/15/09 12:22,1/16/09 7:53,45.4,-73.933333
15,5f90516bacd166c1735d9257,1/12/09 15:12,Product1,1200,Mastercard,David,Deptford,NJ,United States,1/12/09 14:07,1/19/09 3:47,39.83806,-75.15306
16,5f90516bacd166c1735d9258,1/19/09 16:10,Product1,1200,Mastercard,Frank,Old Greenwich,CT,United States,1/19/09 15:31,1/19/09 16:00,41.02278,-73.56528
17,5f90516bacd166c1735d9259,1/20/09 6:03,Product1,1200,Mastercard,Andrea,Shreveport,LA,United States,1/20/09 5:13,1/20/09 7:15,32.525,-93.75
26,5f90516bacd166c1735d9262,1/4/09 9:54,Product1,1200,Mastercard,AMY,The Woodlands,TX,United States,12/30/08 20:41,1/25/09 18:23,30.15778,-95.48917
27,5f90516bacd166c1735d9263,1/22/09 15:32,Product1,1200,Mastercard,Tara,Killiney,Dublin,Ireland,2/27/07 11:35,1/26/09 4:32,53.252222,-6.1125


2. Simulating a groupby that keeps the original granularity, on a larger domain

In [58]:
# (Re)create the larger 'sales' collection from the CSV export.
sales_col = db.get_collection('sales')
sales_col.drop()  # start from an empty collection so re-runs do not duplicate rows
# NOTE(review): absolute, machine-specific path -- consider making it relative
# to a configurable data directory so the notebook runs elsewhere.
# orient='records' yields one dict per row, which is what insert_many expects.
# The previous spelling 'rows' only worked through prefix matching of orient
# names in older pandas and raises ValueError in pandas >= 2.0.
data = pd.read_csv('/home/david/dev/toucan-toco/python-data-prep/data/sales.csv').to_dict(orient='records')
sales_col.insert_many(data)




<pymongo.results.InsertManyResult at 0x7fda19b99a80>

In [75]:
%%time

'''
[
  {
    "domain": "sales",
    "name": "domain"
  },
  {
    "name": "aggregate",
    "on": [
      "DATE"
    ],
    "aggregations": [
      {
        "columns": [
          "WEEKLY_SALES"
        ],
        "newcolumns": [
          "WEEKLY_SALES-sum"
        ],
        "aggfunction": "sum"
      }
    ],
    "keepOriginalGranularity": true
  }
]
'''

# "domain" step: load the whole 'sales' collection.
df = get_domain('sales')
# "aggregate" step with keepOriginalGranularity: transform('sum') broadcasts each
# DATE group's total back onto every row of that group, so no rows are collapsed.
df['WEEKLY_SALES-sum'] = df.groupby(by=['DATE'])['WEEKLY_SALES'].transform('sum')
df

CPU times: user 2.9 s, sys: 22.1 ms, total: 2.92 s
Wall time: 3.1 s


Unnamed: 0,_id,STORE_ID,DEPT_ID,DATE,WEEKLY_SALES,WEEKLY_SALES-sum
0,5f90538dacd166c1735d927a,1,1,05/02/2010,24924.50,49750740.50
1,5f90538dacd166c1735d927b,1,1,12/02/2010,46039.49,48336677.63
2,5f90538dacd166c1735d927c,1,1,19/02/2010,41595.55,48276993.78
3,5f90538dacd166c1735d927d,1,1,26/02/2010,19403.54,43968571.13
4,5f90538dacd166c1735d927e,1,1,05/03/2010,21827.90,46871470.30
...,...,...,...,...,...,...
421565,5f905391acd166c173640137,45,98,28/09/2012,508.37,43734899.40
421566,5f905391acd166c173640138,45,98,05/10/2012,628.10,47566639.31
421567,5f905391acd166c173640139,45,98,12/10/2012,1061.02,46128514.25
421568,5f905391acd166c17364013a,45,98,19/10/2012,760.01,45122410.57


In [82]:
%%time
'''Compare with mongo'''

# Same aggregation executed by MongoDB. list() drains the cursor, so the cell
# timing includes fetching every result document.
list(sales_col.aggregate(
    [
        # Group by DATE: keep each source document in an array and sum
        # WEEKLY_SALES over the group.
        {
            "$group": {
                "_id": {"DATE": "$DATE"},
                "_vqbDocsArray": {"$push": "$$ROOT"},
                "WEEKLY_SALES-sum": {"$sum": "$WEEKLY_SALES"},
            }
        },
        # Emit one document per stored source document again.
        {"$unwind": "$_vqbDocsArray"},
        # Merge the source document's fields with the group-level fields.
        {
            "$replaceRoot": {
                "newRoot": {"$mergeObjects": ["$_vqbDocsArray", "$$ROOT"]}
            }
        },
        # Strip the helper fields from the final documents.
        {"$project": {"_id": 0, "_vqbDocsArray": 0}},
    ],
    allowDiskUse=True,
))

CPU times: user 1.1 s, sys: 11.1 ms, total: 1.11 s
Wall time: 3.89 s


[{'STORE_ID': 1,
  'DEPT_ID': 1,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 20398.09,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 2,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 46991.58,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 3,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 8734.19,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 4,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 34451.9,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 5,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 23598.55,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 6,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 3249.27,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 7,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 20144.71,
  'WEEKLY_SALES-sum': 43458991.190000005},
 {'STORE_ID': 1,
  'DEPT_ID': 8,
  'DATE': '01/04/2011',
  'WEEKLY_SALES': 35319.05,
  'WEEKLY_SALES-sum': 