In [1]:
from Scripts import loading as dl, profiling as pf, cleaning as cl, preprocessing as prep


# Path of the gzipped JSON review file, relative to the working directory.
input_link = '../Data/reviews_Cell_Phones_and_Accessories_5.json.gz'
df = dl.load_amazon_full(input_link)


print('######## FIRST GENERAL INSIGHT')
print(df.head())
# The profiling helpers are invoked for their reporting only; whatever they
# return is discarded here.
for show in (pf.get_review_count, pf.get_descr):
    show(df)


print('######## DATA COMPLETENESS')
# Report missing texts and labels first, then let the cleaning module
# return the frame to continue with.
for report in (pf.get_missing_text, pf.get_missing_label):
    report(df)
df = cl.proceed_data_completion(df)


print('######## DUPLICATE DETECTION')
# Profile duplicates before they are dropped so the report reflects the
# frame as it was going into this step.
pf.get_duplicates(df)
df = cl.drop_duplicates(df)


print('######## LINGUISTIC AFFILIATION')
df = cl.remove_non_english(df)
# NOTE(review): the disabled plots below still use "Amazon Movies & TV"
# titles and file names, while this script loads the Cell Phones and
# Accessories file - rename them before re-enabling.
# pf.show_lang_dist(df, 'amazon_movie_lang_non_eng_dist', 'non-English Language Distribution (Amazon Movies & TV)', 0)
# pf.show_lang_dist(df, 'amazon_movie_lang_all_dist', 'Language Distribution (Amazon Movies & TV)', 1)


print('######## OTHER HEURISTICS')
# Drop reviews whose text starts with a numeric HTML character reference of
# the form "&#15..", "&#16.." or "&#20.." (two or three further digits).
# NOTE(review): presumably these are reviews in non-Latin scripts that got
# past the language filter - confirm against the data.
# The groups are non-capturing so pandas does not emit its "This pattern has
# match groups" UserWarning; the set of matched rows is unchanged.
# na=False treats a missing text as "no match" (row is kept) instead of
# putting NaN into the mask, which `~` cannot invert.
df = df[~df.text.str.contains(r'^&#(?:15|16|20)[0-9]{2,3};*', na=False)]
print('After removing from other cleaning heuristics: ', len(df))


print('######## PREPROCESSING')
# Every step below takes the frame and hands back the frame to carry
# forward, so each stage is applied as an ordered sequence of callables.
# The order inside each tuple is significant and mirrors the pipeline.
print('######## Web Data Specific')
for step in (
    prep.remove_html,
    prep.remove_hyperlinks,
    prep.remove_between_square_brackets,
    prep.remove_between_angle_brackets,
    prep.unescape,
    prep.remove_whitespaces,
    prep.remove_lang_ind,
):
    df = step(df)

print('######## Text Harmonization')
for step in (
    prep.replace_contractions,
    prep.remove_accented_chars,
    prep.to_token,
):
    df = step(df)
# Token total taken right after tokenization; it is passed to the summary
# report once the remaining steps have run.
total_token_count = pf.get_total_token_count(df)
print(total_token_count)
for step in (
    prep.transform_abbr,
    prep.remove_numbers,
    prep.remove_punct_and_nonascii,
    prep.to_lower,
    prep.remove_stopwords,
    prep.get_pos,
):
    df = step(df)

print('######## Text Canonicalization')
df = prep.stem(df)

pf.get_prep_summary(df, total_token_count)

df = prep.detokenize(df)

print('######## DATA COMPLETION AFTER PREPROCESSING')
# Row count before any post-preprocessing filtering; both percentages
# printed below are reported relative to this baseline.
total_prep = len(df)
# Rows whose preprocessed text is the empty string.
df_empty = df[df.text_prep == '']
print("Empty preprocessed text:", len(df_empty))
# Label fixed: this ratio is the share of empty texts, not of duplicates.
print("Empty preprocessed text: {:.2%}".format(len(df_empty) / total_prep))
df = df[df.text_prep != '']
print("After removing empty preprocessed texts: ", len(df))

print('######## DEDUPLICATION AFTER PREPROCESSING')
# Rows sharing a preprocessed text with a later row; keep='last' means the
# last occurrence of each text is the one that is retained.
df_dup = df[df.duplicated(subset=['text_prep'], keep='last')]
print("Duplicate preprocessed text:", len(df_dup))
print("Duplicate preprocessed text: {:.2%}".format(len(df_dup) / total_prep))
df = df.drop_duplicates(subset=['text_prep'], keep='last')
print("After removing duplicate preprocessed texts: ", len(df))


print('######### LAST CHECK')
print(df.head())
print(df.describe(include='all'))


print('######## STORING')
# Keep only the columns that are persisted. Selecting by this list already
# yields exactly these column names in this order, so the former follow-up
# assignment of the identical list to df.columns was redundant and is gone.
df = df[['text', 'label', 'text_prep', 'token_count', 'upper', 'pos']]
df.to_pickle('../Data/amazon_phone.pkl')
print('to pickle done')




######## FIRST GENERAL INSIGHT
                                                text  label
0  They look good and stick good! I just don't li...    4.0
1  These stickers work like the review says they ...    5.0
2  These are awesome and make my phone look so st...    5.0
3  Item arrived in great time and was in perfect ...    4.0
4  awesome! stays on, and looks great. can be use...    5.0
######## Total: 
Amount of reviews:  194439
######## DESCRIPTION
          text          label
count   194439  194439.000000
unique  194186            NaN
top                       NaN
freq        99            NaN
mean       NaN       4.129912
std        NaN       1.222499
min        NaN       1.000000
25%        NaN       4.000000
50%        NaN       5.000000
75%        NaN       5.000000
max        NaN       5.000000
######## DATA COMPLETENESS
Missing/Empty review text: 99
Missing review text as percentage: 0.05% 
Missing rating information 0
Missing rating information as percentage: 0.00%
After re

  return func(self, *args, **kwargs)
