In [1]:
import logging, os, sys
import sqlite3
import gzip
import re
from langdetect import detect
import pandas as pd
from urllib.parse import urlparse
import tldextract

In [2]:
# Input: gzip-compressed dump of schema.org Review statements.
reviews_path = '../Data/schema_Review.gz'

# Output: log file and SQLite database written by this notebook.
phonereviewLog_path = '../Logs/phoneReviews.log'
phonereviews_path = '../Data/input/phonereviews.db'

# Create the output directories up front; otherwise FileHandler (below) and
# sqlite3.connect (next cell) fail on a fresh checkout.
for output_path in (phonereviewLog_path, phonereviews_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The root logger is configured so that the module-level logging.debug(...)
# calls used further down end up in the log file.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Re-running this cell must not stack a second handler on the root logger
# (every message would then be written twice), so drop any handler that
# already targets the same file before attaching a fresh one.
log_file = os.path.abspath(phonereviewLog_path)
for existing_handler in list(logger.handlers):
    if (isinstance(existing_handler, logging.FileHandler)
            and existing_handler.baseFilename == log_file):
        logger.removeHandler(existing_handler)
        existing_handler.close()

fhandler = logging.FileHandler(filename=phonereviewLog_path, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

In [3]:
# Open the SQLite database file and keep one connection/cursor pair that the
# following cells reuse.
conn = sqlite3.connect(phonereviews_path)
c = conn.cursor()

# One row per (NODE, URL) pair; every column is stored as TEXT. The statement
# is a no-op when the table already exists.
create_table_sql = """
        CREATE TABLE IF NOT EXISTS
            phonereviews(
                NODE TEXT,
                URL TEXT,
                REVIEWBODY TEXT,
                RATING TEXT,
                REVIEWRATING TEXT,
                BESTRATING TEXT,
                WORSTRATING TEXT,
                PRIMARY KEY (NODE, URL))
    """
c.execute(create_table_sql)

<sqlite3.Cursor at 0x230490a21f0>

In [4]:
# schema.org predicates this notebook stores in the phonereviews table.
taxo_predicates = (
    "http://schema.org/Review/description",
    "http://schema.org/Review/reviewBody",
    "http://schema.org/Review/reviewRating",
    "http://schema.org/Rating/worstRating",
    "http://schema.org/Rating/bestRating",
    "http://schema.org/Rating/ratingValue",
)

# Matches any of the predicates above in "<iri>" form, case-insensitively.
# re.escape keeps the "." in "schema.org" literal instead of "any character".
taxo_pattern = re.compile(
    "|".join(re.escape("<" + predicate_iri + ">") for predicate_iri in taxo_predicates),
    re.IGNORECASE,
)

# Splits one statement line of the form
#     _:subject <predicate> object <source> .
# into the groups (subject, predicate, object, source).
#  * raw string: "\s" and "\." are invalid escape sequences in a normal literal
#  * subject is "\S+" and the two IRIs are "[^>]*" rather than greedy ".*", so
#    a literal that itself contains " <...>" (e.g. HTML markup inside a review
#    text) can no longer be swallowed into the subject/predicate groups
#  * the object stays greedy: it extends up to the last " <source> ." on the line
split_pattern = re.compile(r"^(_:\S+)\s+<([^>]*)>\s+(.*)\s+<([^>]*)>\s+\.$", re.IGNORECASE)

# Terms that mark a review text as phone-related. They are searched as
# case-insensitive substrings of the statement's object.
phone_lst = ['smartphone', 'phone',
             'phone case', 'phone cable', 'phone charger', 'phone mount',
             'cell phone']

In [None]:
# Scan the review dump and store every English statement whose object mentions
# a phone-related term and whose predicate is one of the stored schema.org
# properties.
#
# Changes compared to the previous version of this cell:
#  * one pass with a combined term pattern instead of one full pass per term;
#    the rows written are the same because INSERT OR IGNORE + UPDATE is
#    idempotent and a row was written as soon as any term matched
#  * a text langdetect cannot classify is skipped; it used to `break` and
#    silently abort the remainder of the scan
#  * the line limit no longer raises StopIteration on files shorter than the
#    limit and no longer materialises the lines in a list
#  * the predicate filter runs before language detection (cheap check first)
#    and the column is looked up from the predicate, so a statement can no
#    longer produce an UPDATE with an empty SET clause
#  * one commit after the scan instead of one per row

# Only the first MAX_LINES lines are scanned; set to None for a full run.
MAX_LINES = 150000

# Lower-cased predicate IRI -> column of the phonereviews table.
PREDICATE_COLUMNS = {
    iri.lower(): column_name
    for iri, column_name in (
        ("http://schema.org/Review/reviewBody", "REVIEWBODY"),
        ("http://schema.org/Review/description", "REVIEWBODY"),
        ("http://schema.org/Review/reviewRating", "RATING"),
        ("http://schema.org/Rating/ratingValue", "REVIEWRATING"),
        ("http://schema.org/Rating/bestRating", "BESTRATING"),
        ("http://schema.org/Rating/worstRating", "WORSTRATING"),
    )
}

# Case-insensitive alternation over all phone-related terms.
item_pattern = re.compile("|".join(re.escape(item) for item in phone_lst), re.IGNORECASE)

i = 0               # lines read
detected = 0        # lines that passed every filter
skipped = 0         # lines split_pattern could not parse
not_taxo = 0        # phone-related lines with a predicate that is not stored
not_phone = 0       # lines whose object mentions no phone-related term
undetected = 0      # objects langdetect could not classify
inserted = 0        # successful writes
failed_updates = 0  # writes rejected by SQLite

# N-Quads files are UTF-8 by specification; do not rely on the platform default.
with gzip.open(reviews_path, "rt", encoding="utf-8") as f:
    for line in f:
        if MAX_LINES is not None and i >= MAX_LINES:
            break
        i += 1

        match = split_pattern.match(line)
        if match is None:
            skipped += 1
            continue
        props = match.groups()
        subj, predicate, obj, source = props

        if not item_pattern.search(obj):
            not_phone += 1
            continue

        column = PREDICATE_COLUMNS.get(predicate.lower())
        if column is None:
            not_taxo += 1
            continue

        try:
            lang = detect(obj)
        except Exception:
            # langdetect raises when the text has no usable features
            # (e.g. digits or punctuation only); skip just this line.
            undetected += 1
            continue
        if lang != 'en':
            continue
        detected += 1

        # The column name comes from the fixed mapping above, never from the
        # data, so concatenating it into the statement is safe.
        update_query = "UPDATE PHONEREVIEWS SET " + column + " = ? WHERE NODE = ? AND URL = ?;"
        try:
            c.execute("INSERT OR IGNORE INTO PHONEREVIEWS (NODE, URL) VALUES (?,?);", (subj, source))
            c.execute(update_query, (obj, subj, source))
            inserted += 1
        except sqlite3.Error:
            logging.debug("failed to execute for params " + str(update_query) + str(props), exc_info=True)
            failed_updates += 1

conn.commit()

logging.debug("detected " + str(detected) + ", inserted: " + str(inserted) +" lines out of " + str(i) + "; not in taxo: " + str(not_taxo))
logging.debug("skipped: " + str(skipped) + ", not phone related: " + str(not_phone) + ", language not detectable: " + str(undetected) + ", failed updates: " + str(failed_updates))
logging.debug("Done getting Review entries with Reviewbodies, descriptions or websites at the Object having phone or related words")

In [5]:
# Read the complete phonereviews table through the open connection and show
# its first five rows as the cell output.
df = pd.read_sql_query(sql="SELECT * from phonereviews", con=conn)
df.head(n=5)

Unnamed: 0,NODE,URL,REVIEWBODY,RATING,REVIEWRATING,BESTRATING,WORSTRATING
0,_:nodee31f94c7f56633383fffc75cba6d84e,https://fineartamerica.com/featured/44-sunsets...,"""I ordered this while overseas and I haven't s...",,,,
1,_:node68224ca24826992d53b8e6cc9e6544,https://fineartamerica.com/featured/eiffel-tow...,"""I ordered this while overseas and I haven't s...",,,,
2,_:node992d5be13d557f4fa467996b83b5305a,https://fineartamerica.com/featured/21l334-red...,"""I ordered this while overseas and I haven't s...",,,,
3,_:node24be2eb09e2851ebcca3cd18dd55b585,https://fineartamerica.com/featured/statue-of-...,"""I ordered this while overseas and I haven't s...",,,,
4,_:nodea2cd6cc380106ada70a5dee7b57329,https://fineartamerica.com/featured/2-gondolas...,"""I ordered this while overseas and I haven't s...",,,,


In [8]:
# Follow-up scan: collect the stored schema.org properties of one review node.
# This is meant to move into the framework and run once per value:
# for node in df.NODE:
node_pattern = re.compile('_:nodee31f94c7f56633383fffc75cba6d84e',re.IGNORECASE)

# Lower-cased predicate IRI -> column of the phonereviews table.
PREDICATE_COLUMNS = {
    iri.lower(): column_name
    for iri, column_name in (
        ("http://schema.org/Review/reviewBody", "REVIEWBODY"),
        ("http://schema.org/Review/description", "REVIEWBODY"),
        ("http://schema.org/Review/reviewRating", "RATING"),
        ("http://schema.org/Rating/ratingValue", "REVIEWRATING"),
        ("http://schema.org/Rating/bestRating", "BESTRATING"),
        ("http://schema.org/Rating/worstRating", "WORSTRATING"),
    )
}

# Termination condition: the scan stops once a value has been seen for the
# rating, review rating, best rating and worst rating columns. The review
# text is not part of the condition.
# NOTE(review): ratingValue/bestRating/worstRating are Rating properties. If
# this dump attaches them to a separate rating node (the object of
# reviewRating) rather than to the review node itself, they are never seen
# here and the scan always runs to the end of the file - verify against the data.
pending_columns = {"RATING", "REVIEWRATING", "BESTRATING", "WORSTRATING"}

i = 0               # lines read
detected = 0        # statements of the node that are written to the table
skipped = 0         # candidate lines that were not usable
inserted = 0        # successful writes
failed_updates = 0  # writes rejected by SQLite

# N-Quads files are UTF-8 by specification; do not rely on the platform default.
with gzip.open(reviews_path, "rt", encoding="utf-8") as f:
    for line in f:
        if not pending_columns:
            break
        i += 1

        # Cheap pre-filters before the line is split.
        if not node_pattern.search(line):
            continue
        if not taxo_pattern.search(line):
            continue

        match = split_pattern.match(line)
        if match is None:
            skipped += 1
            continue
        props = match.groups()
        subj, predicate, obj, source = props

        # The pre-filters match anywhere in the line. Require that the node is
        # the whole subject (not a prefix of a longer id, not the object) and
        # that the predicate itself is one of the stored properties.
        column = PREDICATE_COLUMNS.get(predicate.lower())
        if column is None or not node_pattern.fullmatch(subj):
            skipped += 1
            continue
        detected += 1
        pending_columns.discard(column)

        # The column name comes from the fixed mapping above, never from the
        # data, so concatenating it into the statement is safe.
        update_query = "UPDATE PHONEREVIEWS SET " + column + " = ? WHERE NODE = ? AND URL = ?;"
        params = [obj, subj, source]
        try:
            c.execute("INSERT OR IGNORE INTO PHONEREVIEWS (NODE, URL) VALUES (?,?);", (subj, source))
            c.execute(update_query, params)
            inserted += 1
        except sqlite3.Error:
            logging.debug("failed to execute for params " + str(update_query) + str(props), exc_info=True)
            failed_updates += 1
        else:
            # Previously `print(update_query + params)` raised TypeError
            # (str + list) after the UPDATE had run, so every successful
            # update was counted and logged as a failure.
            print(update_query, params)

# Persist the changes; without a commit they are lost when the connection closes.
conn.commit()

logging.debug("detected: " + str(detected)  + ", inserted: " + str(inserted) +" lines out of " + str(i))
logging.debug("Done getting related Review entries")