# Extraction

First, we will extract the review ratings and review bodies into a relational (SQLite) database.

In [1]:
import gzip
import re
import sqlite3
from datetime import datetime as dt

In [2]:
# Open (or create) the SQLite file that receives the extracted review data.
conn = sqlite3.connect("reviews.db")
with conn:  # commits on success, rolls back if any statement raises
    # One row per (blank node, source URL) pair. The property columns start
    # out NULL and are filled in by the extraction loop's UPDATE statements.
    conn.execute("""
        CREATE TABLE IF NOT EXISTS
            taxonomies(
                NODE TEXT,
                URL TEXT,
                REVIEWBODY TEXT,
                REVIEWRATING TEXT,
                ITEMREVIEWED TEXT,
                REVIEWASPECT TEXT,
                PRODCATEGORY TEXT,
                PRIMARY KEY (NODE, URL))
    """)

    # CREATE TABLE IF NOT EXISTS leaves an already existing table untouched,
    # so a database created before PRODCATEGORY was part of the schema needs
    # the column added explicitly.
    existing_columns = {row[1] for row in conn.execute("PRAGMA table_info(taxonomies)")}
    if "PRODCATEGORY" not in existing_columns:
        conn.execute("ALTER TABLE taxonomies ADD COLUMN PRODCATEGORY TEXT")

    conn.execute("""
        CREATE TABLE IF NOT EXISTS
            reviews(NODE TEXT,
                URL TEXT,
                PRIMARY KEY (NODE, URL)
                )
    """)

In [3]:
# Stream the gzipped quad dump and copy the objects of a handful of schema.org
# predicates into the `taxonomies` table, one row per (blank node, source URL).

# Predicate IRI -> taxonomies column that stores the quad's object.
# NOTE: category values can only be stored if `taxonomies` has a PRODCATEGORY
# column; if it does not, those UPDATEs fail and are reported by the handler
# in the loop below.
PREDICATE_COLUMNS = {
    "<http://schema.org/Review/reviewBody>": "REVIEWBODY",
    "<http://schema.org/Review/ratingValue>": "REVIEWRATING",
    "<http://schema.org/Review/itemReviewed>": "ITEMREVIEWED",
    "<http://schema.org/Review/reviewAspect>": "REVIEWASPECT",
    "<http://schema.org/Product/category>": "PRODCATEGORY",
}

# Both patterns are built from the same predicate list so they cannot drift
# apart (the hand-written version had lost the leading "<" on two predicates).
predicate_alternatives = "|".join(re.escape(predicate) for predicate in PREDICATE_COLUMNS)

# Cheap pre-filter: does the line mention any predicate of interest at all?
taxo_pattern = re.compile(predicate_alternatives, re.IGNORECASE)

# Full split of a line into (subject, predicate, object, source).
# The predicate alternatives are wrapped in ONE group so that the object and
# source groups apply to every predicate, not only to the last alternative.
split_pattern = re.compile(
    r"^(_:\S+)\s(" + predicate_alternatives + r")\s(.*)\s<(.*)>\s\.$",
    re.IGNORECASE,
)

# The patterns match case-insensitively, so predicates are looked up lower-cased.
# Column names come from the fixed mapping above, never from the input data,
# which makes building the statement text by concatenation safe.
update_statements = {
    predicate.lower(): "UPDATE taxonomies SET " + column + " = ? WHERE NODE = ? AND URL = ?;"
    for predicate, column in PREDICATE_COLUMNS.items()
}

# Committing after every row forces a disk sync per matched line; committing
# in batches keeps the run feasible on a dump of this size.
COMMIT_EVERY = 100000

with gzip.open("../Data/schema_Review.gz", "rt", encoding="utf-8") as f:
    i = 0              # lines read so far
    skipped_lines = 0  # lines that passed the pre-filter but could not be split
    inserted = 0       # property values successfully written
    try:
        for line in f:
            if i % 5000000 == 0:
                print(str(dt.now()) + ": processed " + str(i / 1000000) + "/6321 million lines so far")
            i += 1
            if not taxo_pattern.search(line):
                continue
            match = split_pattern.match(line)
            if match is None:
                skipped_lines += 1
                continue
            props = match.groups()
            subject, predicate, obj, source = props
            try:
                # Make sure the row exists, then fill in the one column this quad carries.
                conn.execute("INSERT OR IGNORE INTO taxonomies (NODE, URL) VALUES (?, ?)", (subject, source))
                conn.execute(update_statements[predicate.lower()], (obj, subject, source))
            except sqlite3.Error as err:
                # Report and keep going: one bad row must not abort a multi-hour run.
                print("failed to execute for params " + str(props) + ": " + str(err))
                continue
            inserted += 1
            if inserted % COMMIT_EVERY == 0:
                conn.commit()
    finally:
        # Persist the last (partial) batch, also when the run is interrupted.
        conn.commit()

    print("inserted " + str(inserted) + " lines and skipped " + str(skipped_lines) + " lines")
    print("Done processing the review file")


2019-10-08 17:40:15.988124: processed 0.0/6321 million lines so far
2019-10-08 17:43:22.658230: processed 5.0/6321 million lines so far
2019-10-08 17:47:42.838536: processed 10.0/6321 million lines so far
2019-10-08 17:52:03.642134: processed 15.0/6321 million lines so far
2019-10-08 17:55:31.238323: processed 20.0/6321 million lines so far
2019-10-08 17:58:18.541122: processed 25.0/6321 million lines so far
2019-10-08 18:00:57.949343: processed 30.0/6321 million lines so far
2019-10-08 18:02:49.033059: processed 35.0/6321 million lines so far
2019-10-08 18:06:45.774066: processed 40.0/6321 million lines so far
2019-10-08 18:08:31.255944: processed 45.0/6321 million lines so far
2019-10-08 18:10:07.395088: processed 50.0/6321 million lines so far
2019-10-08 18:13:03.895640: processed 55.0/6321 million lines so far
2019-10-08 18:15:37.719227: processed 60.0/6321 million lines so far
2019-10-08 18:18:17.518201: processed 65.0/6321 million lines so far
2019-10-08 18:21:51.948708: processe