In [1]:
import re
from Scripts import loading as dl, profiling as pf, cleaning as cl, preprocessing as prep

# Location of the review database and the name handed to the loader alongside it.
# NOTE(review): db_name is presumably the table name inside the database --
# confirm against Scripts.loading.load_from_db.
db_path = '../Data/phonereviews.db'
db_name = 'phonereviews'
df = dl.load_from_db(db_path, db_name)


# First look at the raw data: head of the frame plus the profiling helpers'
# own reports (their output format is defined in Scripts.profiling).
print('######## FIRST GENERAL INSIGHT')
print(df.head())
pf.get_review_count(df)
pf.get_descr(df)


# Completeness checks. The first two helpers are called for their side effects
# only (return values are discarded); the last two return the frame that the
# rest of the script continues with.
# NOTE(review): presumably rows without text/rating are flagged and then
# dropped here -- verify in Scripts.profiling / Scripts.cleaning.
print('######## DATA COMPLETENESS')
pf.get_missing_text(df)
pf.get_missing_label(df)
df = pf.get_missing_label_implicit(df)
df = cl.proceed_data_completion(df)


# Report duplicates first, then continue with the frame returned by the
# cleaning helper.
print('######## DUPLICATE DETECTION')
pf.get_duplicates(df)
df = cl.drop_duplicates(df)

print('######### OTHER HEURISTICS')
# Heuristic removal of rows that are not phone reviews.
# All matching is done on literal substrings (regex=False or re.escape) so
# that the dots in domain names are not treated as regex wildcards, and with
# na=False so that a missing URL/text yields False (row kept) instead of a
# NaN mask that cannot be negated with `~`.

# johnpacker - musical instrument shop
df = df[~df['URL'].str.contains('johnpacker', regex=False, na=False)]
df = df[~df['text'].str.contains('saxophone', regex=False, na=False)]  # 14 entries

# Remove entries from unwanted domains connected to phone.
# The trailing numbers are the row counts noted by the original author for
# each fragment (where recorded).
unwanted_url_fragments = [
    'repair',                        # 1641
    'travel',                        # 31
    'clinic',                        # 2
    'service',
    'apps',
    'phoneky',                       # 31846
    'iphoneringtones',               # 73
    'leonardoverona.it',             # 1
    'restauranterosildos.com',       # 1
    'www.grosbill.com',              # 7
    'www.thataesportes.com.br/',     # 1
    'www.alltrails.com',             # 27
    'www.cabinet-fredericandco.fr',  # 1
    'hargeysa.org',                  # 1
    'insiderpages.com',
    'ringtone',
]
# One escaped alternation -> a single pass over the URL column; a row is
# dropped when its URL contains any of the fragments, which is the same
# outcome as filtering fragment by fragment.
unwanted_url_pattern = '|'.join(re.escape(fragment) for fragment in unwanted_url_fragments)
df = df[~df['URL'].str.contains(unwanted_url_pattern, regex=True, na=False)]
print("After removing other related mistakes: ", len(df))


print('######## LINGUISTIC AFFILIATION')
# Continue with the frame returned by the cleaning helper.
# NOTE(review): presumably keeps only reviews detected as English -- the
# detection method lives in Scripts.cleaning; confirm there.
df = cl.remove_non_english(df)
# Optional language-distribution plots, currently disabled.
# NOTE(review): the titles mention Amazon data although this script loads
# 'phonereviews' -- looks copied from another notebook; confirm before re-enabling.
# pf.show_lang_dist(df, 'amazon_phone_lang_non_eng_dist', 'non-English Language Distribution (Amazon Cell Phones & Accessories)', 0)
# pf.show_lang_dist(df, 'amazon_phone_lang_all_dist', 'Language Distribution (Amazon Cell Phones & Accessories)', 1)


# Section banner for the rating parsing / rescaling code that follows.
print('######## RATING SCALE ADJUSTMENT')

# Characters stripped from raw rating strings: ASCII letters, '@', '%',
# space, double quote, hyphen and backslash (e.g. '"5"@es-es' -> '5').
_RATING_NOISE = re.compile(r'[a-zA-Z@% "\\-]')


def chForm(value):
    """Convert a raw rating string into a float.

    Letters, '@', '%', spaces, double quotes, hyphens and backslashes are
    removed and the remainder is parsed with ``float``. Because hyphens are
    stripped, a leading minus sign is lost.

    Args:
        value: Raw rating string, e.g. ``'"5"@es-es'``.

    Returns:
        The rating as ``float``. If nothing is left after stripping, the
        empty string is returned and the original input is reported on
        stdout.

    Raises:
        TypeError: If ``value`` is not a string.
        ValueError: If the stripped remainder is not parseable by ``float``.
    """
    cleaned = _RATING_NOISE.sub('', value)
    if cleaned != "":
        return float(cleaned)
    # Report the ORIGINAL input: the cleaned string is always empty here, so
    # printing it would not tell which rating could not be parsed.
    print('did not work for: ', value)
    return cleaned
# Parse the three raw rating columns into '<name>_new' helper columns.
# The frame was produced by boolean-mask filtering above, so take an explicit
# copy before adding columns; this avoids pandas' "value is trying to be set
# on a copy of a slice" warning.
df = df.copy()
# Series.apply hands each cell straight to chForm; a row-wise
# DataFrame.apply(axis=1) would build a full row Series just to read one value.
for raw_rating_col in ('REVIEWRATING', 'BESTRATING', 'WORSTRATING'):
    df[raw_rating_col + '_new'] = df[raw_rating_col].apply(chForm)

def adjust_rating(value, source_min, source_max, bins):
    """Rescale a rating from [source_min, source_max] onto the labels 1..bins.

    The source range is cut into ``bins`` equally wide intervals and the
    1-based index of the interval containing ``value`` is returned.

    * ``value >= source_max`` maps to ``bins``; ``value < source_min`` maps to 1.
    * If ``source_min`` is 0 the intervals are open on the left and closed on
      the right, with 0 itself mapped to 1.
    * Otherwise the intervals are closed on the left and open on the right.

    Args:
        value: Rating to rescale.
        source_min: Worst rating of the source scale.
        source_max: Best rating of the source scale.
        bins: Number of target labels.

    Returns:
        Label between 1 and ``bins``, or ``None`` when no interval matches
        (e.g. when ``value`` is NaN).
    """
    # Values at or beyond the ends of the scale go to the outermost labels.
    if value >= source_max:
        return bins
    if value < source_min:
        return 1

    width = (source_max - source_min) / bins
    zero_based = source_min == 0

    for idx in range(bins):
        low_edge = (idx * width) + source_min
        high_edge = ((idx + 1) * width) + source_min
        if zero_based:
            if low_edge < value <= high_edge:
                return idx + 1
            if value == source_min:
                # Exactly 0 lies in no left-open interval; it gets label 1.
                return 1
        elif low_edge <= value < high_edge:
            return idx + 1
    return None

def _label_for_row(row):
    """Rescale one row's parsed rating onto a 5-point label via adjust_rating."""
    return adjust_rating(
        row['REVIEWRATING_new'], row['WORSTRATING_new'], row['BESTRATING_new'], 5)


df['label'] = df.apply(_label_for_row, axis=1)

# Label distribution: relative shares first, then absolute counts.
for as_share in (True, False):
    print(df['label'].value_counts(normalize=as_share))


# Text preprocessing: every step takes the frame and returns the frame the
# next step works on, so the order of the tuples below is significant.
print('######## PREPROCESSING')
print('######## Web Data Specific')
for web_step in (
        prep.remove_html,
        prep.remove_hyperlinks,
        prep.remove_between_square_brackets,
        prep.remove_between_angle_brackets,
        prep.unescape,
        prep.remove_whitespaces,
        prep.remove_lang_ind,
):
    df = web_step(df)

print('######## Text Harmonization')
for tokenizing_step in (
        prep.replace_contractions,
        prep.remove_accented_chars,
        prep.to_token,
):
    df = tokenizing_step(df)

# Token count right after tokenization; handed to the summary report below.
total_token_count = pf.get_total_token_count(df)
print(total_token_count)

for harmonizing_step in (
        prep.transform_abbr,
        prep.remove_numbers,
        prep.remove_punct_and_nonascii,
        prep.to_lower,
        prep.remove_stopwords,
        prep.get_pos,
):
    df = harmonizing_step(df)

print('######## Text Canonicalization')
df = prep.stem(df)

pf.get_prep_summary(df, total_token_count)

df = prep.detokenize(df)

print('######## DATA COMPLETION AFTER PREPROCESSING')
# Row count before the two clean-up steps; both percentages below are
# relative to this number.
total_prep = len(df)
df_empty = df[df.text_prep == '']
print("Empty preprocessed text:", len(df_empty))
# Guard the ratio so an empty frame reports 0.00% instead of raising
# ZeroDivisionError.
empty_share = len(df_empty) / total_prep if total_prep else 0.0
print("Empty preprocessed text: {:.2%}".format(empty_share))
df = df[df.text_prep != '']
print("After removing empty preprocessed texts: ", len(df))

print('######## DEDUPLICATION AFTER PREPROCESSING')
# keep='last': within each group of identical text_prep values, every row
# except the last one is marked as duplicate and dropped.
df_dup = df[df.duplicated(subset=['text_prep'], keep='last')]
print("Duplicate preprocessed text:", len(df_dup))
duplicate_share = len(df_dup) / total_prep if total_prep else 0.0
print("Duplicate preprocessed text: {:.2%}".format(duplicate_share))
df = df.drop_duplicates(subset=['text_prep'], keep='last')
print("After removing duplicate preprocessed texts: ", len(df))


print('######### LAST CHECK')
# Final look at the cleaned data: first rows, then summary statistics over
# all columns.
for final_view in (df.head(), df.describe(include='all')):
    print(final_view)


print('######## STORING')
# Columns written to the pickle, in output order.
output_columns = ['NODE', 'URL', 'text', 'label', 'text_prep', 'token_count', 'upper', 'pos']
df = df[output_columns]
# Re-applies the same names to the selected columns (kept from the original).
df.columns = list(output_columns)
df.to_pickle('../Data/schema_phone.pkl')
print('to pickle done')

######## FIRST GENERAL INSIGHT
                                     NODE  \
0    _:node89ce3018196554d41bceaeac30a31e   
1   _:nodec8b9fba77ad36d9d11460c11f9151d9   
2  _:node46ac847e2080dc78cf9ae447647ac8d4   
3     _:nodef160bce2dcba31bd02d1a072e6ad2   
4  _:nodef4bf4b94afaabab1bbb342d3ccc284af   

                                                 URL  \
0  https://www.techworld.com/review/smartphones/l...   
1  https://www.technospain.es/fundas-smartphones-...   
2  https://www.technospain.es/fundas-smartphones-...   
3  https://www.technospain.es/fundas-smartphones-...   
4  https://www.technospain.es/fundas-smartphones-...   

                                                text REVIEWRATING BESTRATING  \
0  "The smartphone market is as competitive as ev...         None       None   
1                                               None    "5"@es-es  "5"@es-es   
2                                               None    "5"@es-es  "5"@es-es   
3                                        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mis_rat'] = df.apply(checkmisrat, axis=1)
