In [1]:
import logging
import sqlite3
import re
import gzip
import pandas as pd

In [2]:
# Input
movies_path = '../Data/input/schema_Movie.gz'
reviews_path= '../Data/input/schema_Review.gz'
inputlist = [movies_path,reviews_path]

# Output
moviereviewLog_path = '../Logs/movieReviews1.log'
moviereviews_path = '../Data/output/moviereviews.db'

logger = logging.getLogger()
fhandler = logging.FileHandler(filename=moviereviewLog_path, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [3]:
conn = sqlite3.connect(moviereviews_path)
c = conn.cursor()
c.execute("""
        CREATE TABLE IF NOT EXISTS
            moviereviews(
                NODE TEXT,
                URL TEXT,
                REVIEWBODY TEXT,
                RATING TEXT,
                REVIEWRATING TEXT,
                BESTRATING TEXT,
                WORSTRATING TEXT,
                PRIMARY KEY (NODE, URL))
    """)

<sqlite3.Cursor at 0x297cd8f5a40>

In [4]:
taxo_pattern = re.compile("<http://schema.org/Review/description>|"\
                          "<http://schema.org/Review/reviewBody>|"\
                          "<http://schema.org/Review/reviewRating>|"\
                          "<http://schema.org/Rating/worstRating>|"\
                          "<http://schema.org/Rating/bestRating>|"\
                          "<http://schema.org/Rating/ratingValue>", re.IGNORECASE)
split_pattern = re.compile("^(_:.*)\s<(.*)>\s(.*)\s<(.*)>\s\.$", re.IGNORECASE)

url_lst = ['https://in.bookmyshow.com', 'https://thereviewmonk.com',
           'https://www.noopler.com', 'http://reviewschview.com',
           'https://www.flickfilosopher.com', 'https://www.rogerebert.com',
           'https://deepfocusreview.com', 'https://www.telegraph.co.uk',
           'https://nationalpost.com', 'https://www.imdb.com']

for input in inputlist:
    for url in url_lst:
        url_pattern = re.compile(url,re.IGNORECASE)
        with gzip.open(input,"rt") as f:
            i = 0
            detected = 0
            skipped = 0
            not_taxo = 0
            not_url = 0
            inserted = 0
            failed_updates = 0
            head = [next(f) for x in range(5000000)]
            # for line in f:
            for line in head:
                i += 1
                if not url_pattern.search(line):
                    not_url += 1
                    continue
                if not taxo_pattern.search(line):
                    not_taxo
                    continue
                match = split_pattern.match(line)
                if match is None:
                    skipped += 1
                    continue
                detected += 1
                props = match.groups()
                subj = props[0]
                predicate = props[1]
                obj = props[2]
                source = props[3]
                c.execute("INSERT OR IGNORE INTO MOVIEREVIEWS (NODE, URL) VALUES (?,?);",(subj, source))
                update_query = "UPDATE moviereviews SET "
                params = [obj, subj, source]
                # print(line)
                if predicate.lower() == "http://schema.org/Review/reviewBody".lower():
                    update_query += "REVIEWBODY = ? "
                if predicate.lower() == "http://schema.org/Review/description".lower():
                    update_query += "REVIEWBODY = ? "
                if predicate.lower() == "http://schema.org/Review/reviewRating".lower():
                    update_query += "RATING = ? "
                if predicate.lower() == "http://schema.org/Rating/ratingValue".lower():
                    update_query += "REVIEWRATING = ? "
                if predicate.lower() == "http://schema.org/Rating/bestRating".lower():
                    update_query += "BESTRATING = ? "
                if predicate.lower() == "http://schema.org/Rating/worstRating".lower():
                    update_query += "WORSTRATING = ? "
                update_query += "WHERE NODE = ? AND URL = ?;"
                try:
                    # print(update_query, params)
                    c.execute(update_query, params)
                    inserted += 1
                except:
                    logging.debug("failed to execute for params " + str(props))
                    failed_updates += 1

logging.debug("detected " + str(detected) + ", inserted: " + str(inserted) +" lines out of " + str(i) + "; not in taxo: " + str(not_taxo))
logging.debug("Done processing the movie reviews file from Movie.gz and Review.gz after the database got deleted")

conn.commit()
conn.close()

KeyboardInterrupt: 

In [8]:
conn.commit()
conn.close()

In [4]:
df = pd.read_sql_query("SELECT * from moviereviews", conn)
df.head()

Unnamed: 0,NODE,URL,REVIEWBODY,RATING,REVIEWRATING,BESTRATING,WORSTRATING
0,_:nodea456ab92b2669279485e1ddc64944bf0,https://in.bookmyshow.com/amritsar/movies/hous...,"""Housefull 4 is about the mistaken identities,...",_:nodee7b066aef1465ef6796152d5b23eefd,,,
1,_:nodee7b066aef1465ef6796152d5b23eefd,https://in.bookmyshow.com/amritsar/movies/hous...,,,"""Null""@en","""Null""@en","""2""@en"
2,_:node3b52118bf545246233aa662aad8a194,https://in.bookmyshow.com/amritsar/movies/hous...,"""Overall, Housefull 4 ends up as a complete ma...",_:nodef82d8af1d090997015dbe94ab45ad,,,
3,_:nodef82d8af1d090997015dbe94ab45ad,https://in.bookmyshow.com/amritsar/movies/hous...,,,"""Null""@en","""Null""@en","""2""@en"
4,_:nodeecc0f025103514ba011df1542adef7b,https://in.bookmyshow.com/amritsar/movies/hous...,"""HOUSEFULL 4 is a major disappointment and suf...",_:node5e5e1ef8427cb3f95ead0badefcd4c8,,,


In [11]:
merge_query = "UPDATE MOVIEREVIEWS T, "\
    "(SELECT DISTINCT NODE, URL, RATING, REVIEWRATING, BESTRATING, WORSTRATING FROM MOVIEREVIEWS) T1 "\
    "SET T.REVIEWRATING = T1.REVIEWRATING,"\
    "T.BESTRATING = T1.BESTRATING,"\
    "T.WORSTRATING = T1.WORSTRATING"\
    "WHERE T.RATING = T1.NODE AND T.URL = T1.URL;"

# merge_query = "UPDATE MOVIEREVIEWS T, (SELECT DISTINCT NODE, URL, RATING, REVIEWRATING, BESTRATING, WORSTRATING) T1 SET T.REVIEWRATING = T1.REVIEWRATING, T.BESTRATING = T1.BESTRATING, T.WORSTRATING = T1.WORSTRATING WHERE T.RATING = T1.NODE AND T.URL = T1.URL;"
c.execute(merge_query)


OperationalError: near "T": syntax error