In [1]:
import sqlite3
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from langdetect import detect
# from zipfile import ZipFile
# from collections import Counter
import re

In [2]:
# Input
# db_path = '../Data/moviewreviews.db'
# db_name = 'moviereviews'
db_path = '../Data/phonereviews.db'
db_name = 'phonereviews'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

df = pd.read_sql_query("SELECT * from " + str(db_name), conn)
print("ACTION: raw data:")
print(df.describe())

print("ACTION: without duplicate rows: ")
df.dropna()
print(df.describe())

print("ACTION: drop rows where any of the values is null or invalid")
df = df[df.REVIEWBODY.notnull()&df.RATING.notnull()&df.REVIEWRATING.notnull()&df.BESTRATING.notnull()&df.WORSTRATING.notnull()]
df = df[~df['REVIEWRATING'].str.contains("ull")]
df = df[~df['BESTRATING'].str.contains("ull")]
df = df[~df['WORSTRATING'].str.contains("ull")]
df = df[~df['REVIEWRATING'].str.contains("editorReview.rating")]
df = df[~df['REVIEWRATING'].str.contains("missing value")]
df = df[~df['URL'].str.contains("johnpacker")]
df = df[~df['URL'].str.contains("dougshop")]
print(df.describe())

print("ACTION: drop duplicates on Reviewbody")
df = df.drop_duplicates(subset='REVIEWBODY', keep="first")
print(df.describe())

print("ACTION: filter out only as english detected reviewbody")
# detect the language of the reviewbody
def detectLang(row):
    try:
        return detect(row['REVIEWBODY'])
    except:
        print("exception:", row['REVIEWBODY'])
    else:
        print("sad:", row['REVIEWBODY'])
df['LANGUAGE'] = df.apply(detectLang, axis=1)

df = df[df['LANGUAGE'] == "en"]
df.describe()

ACTION: raw data:
                                          NODE  \
count                                   809937   
unique                                  809853   
top     _:node8161c9599d5063951e373c3ff7e77db1   
freq                                         2   

                                          URL           REVIEWBODY     RATING  \
count                                  809937               316424     375232   
unique                                  88282                73748     368907   
top     https://my-phone-finder.com/find-imei  "Cancel this order"  \n\n5 / 5   
freq                                     3590                31383       2898   

       REVIEWRATING BESTRATING WORSTRATING  
count        735137     283704      190129  
unique         2086        210         158  
top             "5"        "5"         "0"  
freq         387649      66637       53927  
ACTION: without duplicate rows: 
                                          NODE  \
count            

Unnamed: 0,NODE,URL,REVIEWBODY,RATING,REVIEWRATING,BESTRATING,WORSTRATING,LANGUAGE
count,24302,24302,24302,24302,24302,24302,24302,24302
unique,24302,7041,24302,21040,495,75,74,1
top,_:node8556e2242f414883e88fca4411e4e13b,https://www.proporta.com/ipod-nano-7g-case-lea...,The Good: Sharp design. Excellent WP7 user exp...,\n\n5 / 5,"""5""@en-US","""5""@en","""1""@en",en
freq,1,376,1,1436,5714,6734,6920,24302


In [3]:
df['REVIEWRATING'].value_counts().index.tolist()
# df['BESTRATING'].value_counts().index.tolist()
# df['WORSTRATING'].value_counts().index.tolist()

['"5"@en-US',
 '"5"@en',
 '"5"',
 '5',
 '"4"@en',
 '3.5',
 '"4"',
 '4',
 '"5"@en-us',
 '"4"@en-us',
 '3',
 '"8"',
 '"1"',
 '"7"',
 '" 5 "',
 '"3"',
 '"3"@en',
 '"2"',
 '4.5',
 '"9"',
 '"4"@en-US',
 '"6"',
 '" 4 "',
 '2.5',
 '"10"@en',
 '"5"@vi',
 '"8"@en-US',
 '"3"@en-US',
 '"1"@en',
 '"100%"@en',
 '"0"@en-US',
 '0',
 '"9"@en-US',
 '"5"@en-gb',
 '"5"@de-DE',
 '"5"@en-GB',
 '"95"@en-US',
 '"10"',
 '"7"@en-US',
 '"2"@en',
 '"4.5"@en',
 '"5"@fr',
 '"8"@en',
 '"9"@en',
 '"80%"@en',
 '89',
 '90',
 '"1"@fr',
 '"94"@en-US',
 '92',
 '"5"@de',
 '"4"@fr',
 '"4.5"@en-US',
 '"4.5"',
 '82',
 '"6"@en-US',
 '88',
 '91',
 '"5.0"@en',
 '2',
 '"5"@pt-br',
 '"100"@en',
 '"\\n\\t\\t\\t\\t5\\n\\t\\t\\t"@en-US',
 '"\\n\\t\\t\\t\\t5\\n\\t\\t\\t"@en-GB',
 '93',
 '"5"@en-AU',
 '86',
 '"5"@fi',
 '87',
 '"3.8"',
 '" 5"@es',
 '84',
 '"96"@en-US',
 '80',
 '"5"@da',
 '"7"@en',
 '85',
 '"5"@tr',
 '"80%"',
 '83',
 '"90%"@en',
 '"4.2"',
 '"1"@en-US',
 '"5.0"@en-US',
 '"60%"@en',
 '"0"',
 '"75%"@en',
 '"5"@es',
 '"6"@e

In [4]:
def chForm(value):
    value = re.sub(r'[a-zA-Z]|@|%| |"|-|\\', '', value)
    # return value
    if value != "":
        value = float(value)
    else:
        value = float(0)
    return round(value)
df['REVIEWRATING_new'] = df.apply(lambda row: chForm(row['REVIEWRATING']), axis=1)
df['BESTRATING_new']= df.apply(lambda row: chForm(row['BESTRATING']), axis=1)
df['WORSTRATING_new']= df.apply(lambda row: chForm(row['WORSTRATING']), axis=1)
# try:
#         int(value)
#     except:
#         # print(value)

In [5]:
df['REVIEWRATING_new'].value_counts().index.tolist()
# df['BESTRATING_new'].value_counts().index.tolist()
# df['WORSTRATING_new'].value_counts().index.tolist()

[5,
 4,
 3,
 8,
 1,
 2,
 7,
 9,
 6,
 10,
 0,
 100,
 80,
 90,
 95,
 89,
 92,
 91,
 94,
 82,
 87,
 88,
 86,
 93,
 85,
 84,
 83,
 96,
 78,
 75,
 79,
 60,
 81,
 68,
 77,
 73,
 70,
 72,
 74,
 76,
 97,
 98,
 65,
 67,
 71,
 64,
 18,
 63,
 69,
 55,
 58,
 45,
 50,
 59,
 99,
 56,
 57,
 19,
 51,
 66,
 62,
 52,
 14,
 38,
 54,
 40,
 61,
 16,
 44]

In [56]:
df_special = df[(df['WORSTRATING_new']==5) & (df['BESTRATING_new']==5)]
print(df_special)

                                          NODE  \
691770  _:node6c4fd1e64e842ee999b038e1726441e4   
692561   _:node97a5355b9f12943ebbba2c63e807b4c   

                                                      URL  \
691770  http://landscape-photography-blog.com/Christma...   
692561  http://landscape-photography-blog.com/Beautifu...   

                                               REVIEWBODY  \
691770  "\u27A2 HDQ Cover Images CollPection: Christma...   
692561  "30/03/2017 Beautiful Wallpapers For Iphone \u...   

                                        RATING REVIEWRATING BESTRATING  \
691770   _:node040e432477113fe82d4a665703636ba    "5"@en-US  "5"@en-US   
692561  _:node1ea4555792615ff43136e1cea5b1b221    "5"@en-US  "5"@en-US   

       WORSTRATING LANGUAGE  REVIEWRATING_new  BESTRATING_new  \
691770   "5"@en-US       en                 5               5   
692561   "5"@en-US       en                 5               5   

        WORSTRATING_new  REVIEWRATING_adj  
691770            

In [66]:
def adjust_rating(n, range1, range2):
    try:
        if n > range1[1]:
            return range2[1]
        delta1 = range1[1] - range1[0]
        delta2 = range2[1] - range2[0]
        result =  (round(delta2 * (n - range1[0]) / delta1) + range2[0])
        if result == 0:
            result = 1
        return result
    except ZeroDivisionError:
        if range1 == [5,5]:
            return  5


df['REVIEWRATING_adj'] = df.apply(lambda row:
                                  adjust_rating(row['REVIEWRATING_new'], [row['WORSTRATING_new'],row['BESTRATING_new']], [1.0,5.0]), axis=1)

In [None]:
def renormalize(n, range1, range2):
    if n > range1[1]:
        return range2[1]
    delta1 = range1[1] - range1[0]
    delta2 = range2[1] - range2[0]
    return (delta2 * (n - range1[0]) / delta1) + range2[0]

print(round(renormalize(20.5, [1.0, 100.0], [1.0, 5.0])))

In [62]:
df['REVIEWRATING_new'].value_counts()

5     17828
4      3229
3       635
8       371
1       281
      ...  
54        1
40        1
61        1
16        1
44        1
Name: REVIEWRATING_new, Length: 69, dtype: int64

In [28]:
(df['REVIEWRATING_adj'] == 2).sum()

134

In [75]:
print(df['REVIEWRATING_adj'].value_counts(normalize=True))
print(df['REVIEWRATING_adj'].value_counts())

5.0    0.764423
4.0    0.174595
3.0    0.040861
1.0    0.010123
2.0    0.009999
Name: REVIEWRATING_adj, dtype: float64
5.0    18577
4.0     4243
3.0      993
1.0      246
2.0      243
Name: REVIEWRATING_adj, dtype: int64


In [9]:
df.describe()

Unnamed: 0,REVIEWRATING_new,BESTRATING_new,WORSTRATING_new,REVIEWRATING_adj
count,24302.0,24302.0,24302.0,19560.0
mean,7.911859,9.001811,0.778372,4.835174
std,15.586544,18.563918,0.418804,0.591476
min,0.0,1.0,0.0,1.0
25%,5.0,5.0,1.0,5.0
50%,5.0,5.0,1.0,5.0
75%,5.0,5.0,1.0,5.0
max,100.0,100.0,6.0,5.0


In [69]:
df.to_csv('../Data/phone_reviews.csv')

In [None]:
# finding out where the reviews come from (domain specific)
# def getNetloc(row):
#     try:
#         return urlparse(row['URL']).netloc
#     except:
#         print("expection: ", row['URL'])
#     else:
#         print("sad", row['URL'])
#
# df['netloc'] = df.apply(getNetloc, axis = 1)
# df.head()
# df['netloc'].value_counts().index.to_list()
# phone_lst = ['smartphone', 'phone',
#              'phone case', 'phone cable', 'phone charger', 'phone mount',
#              'cell phone', 'mobile phone']
#
#
# # df_phone = df_en[df_en['REVIEWBODY'].isin(phone_lst)]
# # print(df_phone.head())
# # LANGUAGE'] == "en"]
# df_en.head()

In [15]:
df['REVIEWBODY'].head()

32    "\n\n\nWhere to buy?Online Chinese retailer, G...
42    "Another useful gadget which has yet to be ful...
44    "Perfect for holding mobile phone on dashboard...
46    "Car easy use<br />\n<br />\nRecommend to all ...
56    "Got this for my girl friend as she always use...
Name: REVIEWBODY, dtype: object

In [20]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

text ='"\n\n\nWhere to buy?Online Chinese retailer, Gearbest, sent us the sample and sells the Vernee V2 Pro for \u00A3237 (around $305) at the time of writing. Note that, while this price includes delivery, it is exclusive of any taxes that may be levied by HMRC or the courier companies on behalf of the vendor. Want to buy tech from online Chinese retailers? Read this first.\u00A0Rugged phones and elegant edge-to-edge displays used to be mutually exclusive, but with the Vernee V2 Pro, you can finally have both. The expansive 5.99-inch screen has a very slight Samsung-style curve at the edges and is protected by a slim, but strong magnalium bezel. Strong enough for the Chinese manufacturer to promise six-sided drop resistance. However, Vernee doesn\u2019t actually say from what height.\u00A0There is no doubting that the V2 Pro packs a lot of features for a phone costing only around \u00A3240 and when you add an IP68 rating, you have a tempting proposition for anyone who relies heavily on their phone while working outdoors.The battery is unusually large at 6.2Ah, which means you can keep working for around four days of regular use without recharging. It\u2019s faster than your typical tough phone too and has the latest version Android (Oreo) on board.\u00A0DesignThe sleek Vernee V2 Pro doesn\u2019t look like a rugged phone from the front at all, but that tough magnalium case is apparently enough to earn an IP68 certificate. That means it can survive 1.5-meters under water for two hours, is impervious to dust, and work in temperatures ranging from -30 to 60-degrees Centigrade. Vernee says it is also drop resistant on all sides, though there is no certificate for that. It is definitely available in red, or black, as seen here though.\u00A0The distinctively dimpled back panel is textured to enhance your grip and Vernee claims the material also repels dirt, although as you can see in the photo, we managed to find some that stuck. It certainly makes it easier to hold and so does the 18:9 ratio screen. A six-inch phone in 16:9 ratio would be harder to reach your fingers around. Also on the back are a dual-lens camera, LED flash/torch, a fingerprint reader and a heart rate monitor. This last is a welcome addition for an action phone and it works well.\u00A0There is only one socket at the side because the type C USB port works for sharing data, recharging and also as a headphone port. A USB-C to minijack adapter is included. This versatile port also allows for fast 9V/2A charging, which almost makes up for the fact that the Vernee V2 Pro does not support wireless charging.\u00A0There are only three buttons on the Vernee V2 Pro, an on/off button (of course), a volume rocker (naturally) and a button that takes a screenshot (why?). \u00A0We can only assume that this is a mistake that somehow made it all the way to production, because there are so many things that third button could have been: a camera shutter button perhaps, or PTT (push to talk), or an emergency SOS sender, or a torch app. When you\u2019re working outside there are plenty of functions that you\u2019d love to access without having to take off your gloves and navigate an onscreen menu, but taking a screenshot is NOT one of them. The problem is exacerbated by the fact that this button is on a hair trigger and you can\u2019t help but press it almost every time you hold the handset, so by the end of the review we had to delete hundreds of accidental screenshots. All that\u2019s needed is a firmware update, or an app to reassign this rogue function button, but at the time of review, there was no solution.SpecificationsSpec SheetHere are the full specs of the Vernee V2 Pro:\u00A0CPU:\u00A0Helio P23 MTK6763\u00A0GPU:\u00A0Mali G71 \u00A0\u00A0RAM:\u00A06GBStorage:\u00A064GBScreen size:\u00A05.9-inch\u00A0Resolution:\u00A02160 x 1080Weight:\u00A0259g\u00A0Dimensions:\u00A0164 x 79 x 12mmRear\u00A0camera:\u00A0\u00A016MP + 5MP\u00A0Front camera:\u00A08MP + 5MPOS:\u00A0Android 8.1.0Battery:\u00A06.2AhThe Vernee V2 Pro ticks most of the boxes on our feature list for a rugged phone and its technical specifications trump most of the other models in this category. The screen is a luxurious six inches across (or as near as makes no difference) and its Full HD, unlike the Blackview BV5800, which is only HD. The main camera is 21MP with a 5MP companion lens for getting that bokeh effect. Sadly there\u2019s no optical zoom.\u00A0In addition to the biometric fingerprint reader, you can also sign in with Face ID, which is less secure, but preferable when you\u2019ve got your hands full. The SIM card tray will actually hold two nano SIMs and a TF card, which could expand the phone\u2019s memory by128GB.\u00A0The processor is a modest Helio P23, but teamed with a generous 6GB of RAM and 64GB of ROM, it produced respectable results in our processing bench tests. It is certainly powerful enough to run Android Oreo 8.1 smoothly and drive apps that make good use of the phone\u2019s compass, pedometer and gyroscope.\u00A0At 6200 mAh the battery is both big and clever. It makes the Vernee V2 Pro feel as heavy as two Samsung Galaxy S9s, but it makes it last for 35 days in standby. Curiously, the specifications are in some key cases slightly lower than those of the Vernee Active, which was launched first, even though this model is more expensive. It means that the slightly larger battery and screen are what you are paying for here.\n"@en'
# TODO: alle \n und @en.. müssen weg -> preprocessing
print(text)
custom_sent_tokenizer = PunktSentenceTokenizer(text)
tokenized = custom_sent_tokenizer.tokenize(text)
# def process_content():
#     try:
#         for i in tokenized[5:]:
#             words = nltk.word_tokenize(i)
#             tagged = nltk.pos_tag(words)
#             namedEnt = nltk.ne_chunk(tagged, binary=True)
#             namedEnt.draw()
#     except Exception as e:
#         print(str(e))
# process_content()
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent
sent = preprocess(text)

"


Where to buy?Online Chinese retailer, Gearbest, sent us the sample and sells the Vernee V2 Pro for £237 (around $305) at the time of writing. Note that, while this price includes delivery, it is exclusive of any taxes that may be levied by HMRC or the courier companies on behalf of the vendor. Want to buy tech from online Chinese retailers? Read this first. Rugged phones and elegant edge-to-edge displays used to be mutually exclusive, but with the Vernee V2 Pro, you can finally have both. The expansive 5.99-inch screen has a very slight Samsung-style curve at the edges and is protected by a slim, but strong magnalium bezel. Strong enough for the Chinese manufacturer to promise six-sided drop resistance. However, Vernee doesn’t actually say from what height. There is no doubting that the V2 Pro packs a lot of features for a phone costing only around £240 and when you add an IP68 rating, you have a tempting proposition for anyone who relies heavily on their phone while working outdoo

In [26]:
sent

[('``', '``'),
 ('Where', 'WRB'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('?', '.'),
 ('Online', 'NNP'),
 ('Chinese', 'NNP'),
 ('retailer', 'NN'),
 (',', ','),
 ('Gearbest', 'NNP'),
 (',', ','),
 ('sent', 'VBD'),
 ('us', 'PRP'),
 ('the', 'DT'),
 ('sample', 'NN'),
 ('and', 'CC'),
 ('sells', 'VBZ'),
 ('the', 'DT'),
 ('Vernee', 'NNP'),
 ('V2', 'NNP'),
 ('Pro', 'NNP'),
 ('for', 'IN'),
 ('£237', 'NNP'),
 ('(', '('),
 ('around', 'IN'),
 ('$', '$'),
 ('305', 'CD'),
 (')', ')'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('time', 'NN'),
 ('of', 'IN'),
 ('writing', 'VBG'),
 ('.', '.'),
 ('Note', 'NN'),
 ('that', 'IN'),
 (',', ','),
 ('while', 'IN'),
 ('this', 'DT'),
 ('price', 'NN'),
 ('includes', 'VBZ'),
 ('delivery', 'NN'),
 (',', ','),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('exclusive', 'JJ'),
 ('of', 'IN'),
 ('any', 'DT'),
 ('taxes', 'NNS'),
 ('that', 'WDT'),
 ('may', 'MD'),
 ('be', 'VB'),
 ('levied', 'VBN'),
 ('by', 'IN'),
 ('HMRC', 'NNP'),
 ('or', 'CC'),
 ('the', 'DT'),
 ('courier', 'JJR'),
 ('companies', 'NNS'

In [None]:
# brand: Vernee, Samsung, Apple, Huwei, HTC, Windows, Nokia, Blackberry, LG, Sony
# synonyms to phone accessories: gadget
# key words: mobile

In [71]:
df.describe(include = 'all')
# new_df=df[df.REVIEWRATING_adj.notnull()]


Unnamed: 0,NODE,URL,REVIEWBODY,RATING,REVIEWRATING,BESTRATING,WORSTRATING,LANGUAGE,REVIEWRATING_new,BESTRATING_new,WORSTRATING_new,REVIEWRATING_adj
count,24302,24302,24302,24302,24302,24302,24302,24302,24302.0,24302.0,24302.0,24302.0
unique,24302,7041,24302,21040,495,75,74,1,,,,
top,_:node8556e2242f414883e88fca4411e4e13b,https://www.proporta.com/ipod-nano-7g-case-lea...,The Good: Sharp design. Excellent WP7 user exp...,\n\n5 / 5,"""5""@en-US","""5""@en","""1""@en",en,,,,
freq,1,376,1,1436,5714,6734,6920,24302,,,,
mean,,,,,,,,,7.911859,9.001811,0.778372,4.673196
std,,,,,,,,,15.586544,18.563918,0.418804,0.695134
min,,,,,,,,,0.0,1.0,0.0,1.0
25%,,,,,,,,,5.0,5.0,1.0,5.0
50%,,,,,,,,,5.0,5.0,1.0,5.0
75%,,,,,,,,,5.0,5.0,1.0,5.0


In [73]:
df['REVIEWRATING_adj'].count()


24302