In [2]:
from langdetect import detect
import re
from Scripts import loading as dl, profiling as pf, cleaning as cl
import matplotlib.pyplot as plt

db_path = '../Data/phonereviews.db'
db_name = 'phonereviews'
df = dl.load_from_db(db_path, db_name)


print('######## FIRST GENERAL INSIGHT')
print(df.head())
pf.get_review_count(df)
pf.get_descr(df)


print('######## DATA COMPLETENESS')
pf.get_missing_text(df)
pf.get_missing_label(df)
df = pf.get_missing_label_implicit(df)
df = cl.proceed_data_completion(df)


print('######## DUPLICATE DETECTION')
pf.get_duplicates(df)
df = cl.drop_duplicates(df)


print('######### OTHER HEURISTICS')
# johnpacker - musical instrument shop
df = df[~df['URL'].str.contains("johnpacker")]
df = df[~df['REVIEWBODY'].str.contains("saxophone")] #14 entries
# Remove entries from unwanted domains connected to phone
df = df[~(df['URL'].str.contains('repair'))]   #1641
df = df[~(df['URL'].str.contains('travel'))]   #31
df = df[~(df['URL'].str.contains('clinic'))]   #2
df = df[~(df['URL'].str.contains('service'))]
df = df[~(df['URL'].str.contains('apps'))]
df = df[~(df['URL'].str.contains('phoneky'))]  #31846
df = df[~(df['URL'].str.contains('iphoneringtones'))] #73
df = df[~(df['URL'].str.contains('leonardoverona.it'))] #1
df = df[~(df['URL'].str.contains('restauranterosildos.com'))] #1
df = df[~(df['URL'].str.contains('www.grosbill.com'))] #7
df = df[~(df['URL'].str.contains('www.thataesportes.com.br/'))] #1
df = df[~(df['URL'].str.contains('www.alltrails.com'))] #27
df = df[~(df['URL'].str.contains('www.cabinet-fredericandco.fr'))] #1
df = df[~(df['URL'].str.contains('hargeysa.org'))] #1
df = df[~(df['URL'].str.contains('insiderpages.com'))]
df = df[~(df['URL'].str.contains('ringtone'))]
print("After removing other related mistakes: ", len(df))

######## FIRST GENERAL INSIGHT
                                     NODE  \
0    _:node89ce3018196554d41bceaeac30a31e   
1   _:nodec8b9fba77ad36d9d11460c11f9151d9   
2  _:node46ac847e2080dc78cf9ae447647ac8d4   
3     _:nodef160bce2dcba31bd02d1a072e6ad2   
4  _:nodef4bf4b94afaabab1bbb342d3ccc284af   

                                                 URL  \
0  https://www.techworld.com/review/smartphones/l...   
1  https://www.technospain.es/fundas-smartphones-...   
2  https://www.technospain.es/fundas-smartphones-...   
3  https://www.technospain.es/fundas-smartphones-...   
4  https://www.technospain.es/fundas-smartphones-...   

                                                text REVIEWRATING BESTRATING  \
0  "The smartphone market is as competitive as ev...         None       None   
1                                               None    "5"@es-es  "5"@es-es   
2                                               None    "5"@es-es  "5"@es-es   
3                                        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mis_rat'] = df.apply(checkmisrat, axis=1)


KeyError: 'REVIEWBODY'

In [None]:
# LINGUISTIC AFFILIATION
print('######## LINGUISTIC AFFILIATION')
def detectLang(row):
    try:
        return detect(row['REVIEWBODY'])
    except:
        print("exception:", row['REVIEWBODY'])
    else:
        print("sad:", row['REVIEWBODY'])
df['LANGUAGE'] = df.apply(detectLang, axis=1)
print(df['LANGUAGE'].value_counts())

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
x = df['LANGUAGE'].value_counts().index
y = df['LANGUAGE'].value_counts()
ax.bar(x,y)
plt.xlabel('Language')
plt.ylabel('Number of reviews')
plt.title(f'Language distribution')
plt.rcParams.update({"figure.facecolor": "white"})
fig.savefig("../Figures/schema_phone_lang_dist.png", bbox_inches='tight', dpi=300)
plt.show()

df = df[df['LANGUAGE'] == "en"]
print("After removing non-english text:", len(df))

# RATING SCALE ADJUSTMENT
print('######## RATING SCALE ADJUSTMENT')


def chForm(value):
    value = re.sub(r'[a-zA-Z]|@|%| |"|-|\\', '', value)
    if value != "":
        value = float(value)
    else:
        print('did not work for: ', value)
    return value
df['REVIEWRATING_new'] = df.apply(lambda row: chForm(row['REVIEWRATING']), axis=1)
df['BESTRATING_new']= df.apply(lambda row: chForm(row['BESTRATING']), axis=1)
df['WORSTRATING_new']= df.apply(lambda row: chForm(row['WORSTRATING']), axis=1)


# def adjust_rating(n, range1, range2):
#     try:
#         if n > range1[1]:
#             return range2[1]
#         delta1 = range1[1] - range1[0]
#         delta2 = range2[1] - range2[0]
#         result =  (round(delta2 * (n - range1[0]) / delta1) + range2[0])
#         if result == 0:
#             result = 1
#         return result
#     except ZeroDivisionError:
#         if range1 == [5,5]:
#             return  5



df['REVIEWRATING_adj'] = df.apply(lambda row: adjust_rating(row['REVIEWRATING_new'], [row['WORSTRATING_new'],row['BESTRATING_new']], [1.0,5.0]), axis=1)

print(df['REVIEWRATING_adj'].value_counts(normalize=True))
print(df['REVIEWRATING_adj'].value_counts())

print('######### LAST CHECK')
print(df.head())
print(df.describe(include='all'))

print('######## STORING')
df = df[['NODE','URL', 'REVIEWBODY','REVIEWRATING_adj']]
df.columns=['NODE', 'URL','text', 'label']
df.to_pickle('../Data/schema_phone.pkl')
print('to pickle done')