# Tools for big data - week 6 SQL vs NoSQL

Setting up sqlite3 db connections.

In [1]:
import sqlite3
import pprint
from pymongo import MongoClient
# Open (or create, if missing) the SQLite file in the current working directory.
conn = sqlite3.connect('northwind.db')
# TEXT values are handed to text_factory as raw bytes; decode them as Latin-1
# instead of sqlite3's default UTF-8 decoding.
conn.text_factory = lambda raw_bytes: str(raw_bytes, 'latin-1')
# One shared cursor used by all SQL cells below.
c = conn.cursor()

In [2]:
# Smoke test for the connection: fetch the single Customers row for 'ALFKI'.
# execute() returns the cursor itself, so the fetch can be chained onto it.
c.execute("SELECT * FROM Customers WHERE CustomerID = 'ALFKI'").fetchone()

('ALFKI',
 'Alfreds Futterkiste',
 'Maria Anders',
 'Sales Representative',
 'Obere Str. 57',
 'Berlin',
 None,
 '12209',
 'Germany',
 '030-0074321',
 '030-0076545')

## SQL exercise 1

In [21]:
# Product names for every product that appears on an order placed by ALFKI:
# Orders -> [Order Details] -> Products, resolved with nested IN sub-queries.
alfki_product_names_sql = """SELECT ProductName FROM Products WHERE ProductID in 
(SELECT ProductID FROM [Order Details] WHERE orderID in 
(SELECT orderID FROM Orders WHERE CustomerID = 'ALFKI'))"""

c.execute(alfki_product_names_sql).fetchall()

[('Aniseed Syrup',),
 ("Grandma's Boysenberry Spread",),
 ('Rössle Sauerkraut',),
 ('Chartreuse verte',),
 ('Spegesild',),
 ('Escargots de Bourgogne',),
 ('Raclette Courdavault',),
 ('Vegie-spread',),
 ('Flotemysost',),
 ('Lakkalikööri',),
 ('Original Frankfurter grüne Soße',)]

## SQL exercise 2

In [5]:
# Orders placed by customer ALFKI that contain more than one order line.
# Each result row is (number of order lines, OrderID, CustomerID).
#
# Table names are written as identifiers ([Order Details], Products) like the
# other queries in this notebook. Single-quoted names are string literals in
# SQL; SQLite only tolerates them in identifier position for compatibility.
c.execute("""SELECT COUNT(*), CustomerALFKI.OrderID, CustomerALFKI.CustomerID FROM
            ( SELECT * FROM Orders WHERE CustomerID = 'ALFKI' ) CustomerALFKI
            INNER JOIN [Order Details] od
            on CustomerALFKI.OrderID = od.OrderID
            INNER JOIN Products pd
            on od.ProductID = pd.ProductID
            GROUP BY od.OrderID
            HAVING COUNT(*) > 1""")
c.fetchall()

[(3, 10643, 'ALFKI'),
 (2, 10702, 'ALFKI'),
 (2, 10835, 'ALFKI'),
 (2, 10952, 'ALFKI'),
 (2, 11011, 'ALFKI')]

## SQL exercise 3

In [6]:
# Product IDs that occur on at least one order line with Quantity < 2 and that
# also exist in Products (INTERSECT keeps only IDs present in both selects).
rows = c.execute("""SELECT ProductID FROM [Order Details] WHERE Quantity < 2
               INTERSECT
               SELECT ProductID FROM Products""").fetchall()

# Every row is a one-element tuple; unpack it and print the bare ID.
for (product_id,) in rows:
    print(product_id)

4
6
7
10
13
14
19
20
30
31
32
37
40
59
66
69
72


Setting up mongodb client and db connections

In [7]:
# Connect with MongoClient's default connection settings.
client = MongoClient()
# Database that holds the Northwind collections.
db = client['Northwind']
# Handles to the collections used in the exercises below.
customers_collection = db.customers
orders_collection = db.orders
# The hyphen in the name rules out attribute access, so use item access here.
order_details_collection = db['order-details']
products_collection = db.products
employees_collection = db.employees

## Mongodb exercise 1
Aggregation for getting orders for customer ALFKI

In [8]:
# Aggregation pipeline for the orders collection: one output document per
# ALFKI order, listing the IDs of the products on it and how many order lines
# it has.
orders_products_pipeline_for_customer = [
    # 1. Keep only the orders placed by customer ALFKI.
    {"$match": {"CustomerID": "ALFKI"}},
    # 2. Attach the order-details documents that share the order's OrderID.
    {"$lookup": {
        "from": "order-details",
        "localField": "OrderID",
        "foreignField": "OrderID",
        "as": "orders_for_customer",
    }},
    # 3. Attach the product documents referenced by those order lines.
    {"$lookup": {
        "from": "products",
        "localField": "orders_for_customer.ProductID",
        "foreignField": "ProductID",
        "as": "products_for_order",
    }},
    # 4. Shape the output; NumberOfOrders is the number of attached order lines.
    {"$project": {
        "CustomerID": "$CustomerID",
        "OrderID": "$OrderID",
        "ProductsOrdered": "$products_for_order.ProductID",
        "NumberOfOrders": {"$size": "$orders_for_customer"},
        "_id": 0,
    }},
]

In [9]:
# Run the pipeline on the orders collection, materialise the cursor and pretty-print it.
alfki_orders = list(orders_collection.aggregate(orders_products_pipeline_for_customer))
pprint.pprint(alfki_orders)

[{'CustomerID': 'ALFKI',
  'NumberOfOrders': 3,
  'OrderID': 10643,
  'ProductsOrdered': [28, 39, 46]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 1,
  'OrderID': 10692,
  'ProductsOrdered': [63]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10702,
  'ProductsOrdered': [3, 76]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10835,
  'ProductsOrdered': [59, 77]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10952,
  'ProductsOrdered': [6, 28]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 11011,
  'ProductsOrdered': [58, 71]}]


## Mongodb exercise 2
Aggregation for getting the orders that contain at least two unique products.

In [10]:
# Same stages as orders_products_pipeline_for_customer ($match on ALFKI, the two
# $lookup joins and the $project), followed by one extra stage that keeps only
# the orders with at least two order lines.
# The base pipeline is reused rather than copied so the two cannot drift apart.
# A new list is built, so the base pipeline itself is left unmodified.
uniq_orders_products_pipeline_for_customer = [
    *orders_products_pipeline_for_customer,
    {"$match": {"NumberOfOrders": {"$gte": 2}}},
]

In [12]:
# Run the filtered pipeline on the orders collection and pretty-print the matching orders.
alfki_multi_product_orders = list(
    orders_collection.aggregate(uniq_orders_products_pipeline_for_customer)
)
pprint.pprint(alfki_multi_product_orders)

[{'CustomerID': 'ALFKI',
  'NumberOfOrders': 3,
  'OrderID': 10643,
  'ProductsOrdered': [28, 39, 46]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10702,
  'ProductsOrdered': [3, 76]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10835,
  'ProductsOrdered': [59, 77]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 10952,
  'ProductsOrdered': [6, 28]},
 {'CustomerID': 'ALFKI',
  'NumberOfOrders': 2,
  'OrderID': 11011,
  'ProductsOrdered': [58, 71]}]


In [13]:
# Getting customer ALFKI document
#pprint.pprint(customers_collection.find_one({"CustomerID": "ALFKI"}))

In [14]:
#pprint.pprint(list(customers_collection.find()))

## Mongodb exercise 3
Using map-reduce to count the number of customers per country.

In [15]:
from bson.code import Code

In [16]:
# Map step (JavaScript, run by the server): emit the key/value pair
# (Country, 1) for each customer document.
myMap = Code("function () {emit(this.Country, 1);}")

In [17]:
# Reduce step (JavaScript, run by the server): add up the values emitted for one key.
myReduce = Code("function (country, count) {return Array.sum(count)}")

In [18]:
# Run the map-reduce job on the customers collection and write the result to
# the "CountryForCustomer" collection.
# Collection.map_reduce() was removed in PyMongo 4.0, so the mapReduce command
# is sent directly through Database.command(), which also works on PyMongo 3.x.
northwind_db = customers_collection.database
northwind_db.command(
    "mapReduce",
    customers_collection.name,
    map=myMap,
    reduce=myReduce,
    out="CountryForCustomer",
)
# Handle to the output collection, as map_reduce() used to return.
res = northwind_db["CountryForCustomer"]

In [19]:
# Print every document of the map-reduce output collection; each one has the
# emitted key under '_id' and the reduced count under 'value'.
# NOTE(review): the recorded output contains keys such as 8022.0 and 'B-6000'
# that look like postal codes rather than countries. Presumably some customer
# documents have misaligned fields from the data import; verify the collection.
for country_count in res.find():
    print(country_count)

{'_id': 8022.0, 'value': 1.0}
{'_id': 13008.0, 'value': 1.0}
{'_id': 28001.0, 'value': 1.0}
{'_id': 28023.0, 'value': 1.0}
{'_id': 28034.0, 'value': 1.0}
{'_id': 41101.0, 'value': 1.0}
{'_id': 44000.0, 'value': 2.0}
{'_id': 59000.0, 'value': 1.0}
{'_id': 67000.0, 'value': 1.0}
{'_id': 69004.0, 'value': 1.0}
{'_id': 75012.0, 'value': 1.0}
{'_id': 75016.0, 'value': 1.0}
{'_id': 78000.0, 'value': 1.0}
{'_id': '02389-673', 'value': 1.0}
{'_id': '02389-890', 'value': 1.0}
{'_id': '04876-786', 'value': 1.0}
{'_id': '05432-043', 'value': 1.0}
{'_id': '05442-030', 'value': 1.0}
{'_id': '05454-876', 'value': 1.0}
{'_id': '05487-020', 'value': 1.0}
{'_id': '05634-030', 'value': 1.0}
{'_id': '08737-363', 'value': 1.0}
{'_id': 'Argentina', 'value': 3.0}
{'_id': 'Austria', 'value': 2.0}
{'_id': 'B-6000', 'value': 1.0}
{'_id': 'Belgium', 'value': 1.0}
{'_id': 'Canada', 'value': 3.0}
{'_id': 'Denmark', 'value': 2.0}
{'_id': 'Finland', 'value': 2.0}
{'_id': 'France', 'value': 2.0}
{'_id': 'Germany', '