In [1]:
from Scripts import loading as dl, cleaning as cl, profiling as pf
from langdetect import detect

In [2]:
# Path to the review dump, relative to the notebook's working directory.
input_link = '../Data/reviews_Movies_and_TV_5.json.gz'
# `dl` is the project's Scripts.loading module; the frames displayed further down
# show that the loaded DataFrame carries a 'text' and a 'label' column.
df = dl.load_amazon_full(input_link)

In [20]:
# Preview the first rows. The saved output contains a LANGUAGE column, which is
# only added by the language-detection cell further down, so this cell was last
# executed after that step (out of document order).
df.head()

Unnamed: 0,text,label,LANGUAGE
0,This is a charming version of the classic Dick...,4.0,en
1,It was good but not as emotionally moving as t...,3.0,en
2,"Don't get me wrong, Winkler is a wonderful cha...",3.0,en
3,Henry Winkler is very good in this twist on th...,5.0,en
4,This is one of the best Scrooge movies out. H...,4.0,en


In [4]:
# Project profiling helper; presumably the source of the "Amount of reviews" line in the output.
pf.get_review_count(df)
# Row count before any cleaning; the completeness cell below uses it as the
# denominator for the missing-value percentages.
total = len(df)
# Summary statistics for all columns, not only the numeric ones.
df.describe(include='all')

Amount of reviews:  1697533


Unnamed: 0,text,label
count,1697533.0,1697533.0
unique,1696352.0,
top,,
freq,62.0,
mean,,4.110648
std,,1.197615
min,,1.0
25%,,4.0
50%,,5.0
75%,,5.0


In [5]:
# Rows whose rating (`label`) is null, and how many there are.
# NOTE(review): the variable is called `df_no_text` but the mask tests `label`,
# not `text` — confirm which column was meant.
df_no_text = df[df.label.isnull()]
print(len(df_no_text))

0


In [6]:
# DATA COMPLETENESS
# Report the share of rows without a review text and without a rating, then
# delete incomplete entries (rows where either value is missing).
# `total` is the unfiltered row count computed in the profiling cell above.
print('Missing review data as percentage: {:.2%} '.format(df.text.isnull().sum()/total))
# Count every row with a missing rating. The previous mask additionally required
# the text to be missing, so ratings missing on their own were not counted.
print('Missing rating information as percentage: {:.2%}'.format(df.label.isnull().sum()/total))
df = df[df.text.notnull()&df.label.notnull()]
print("After removing missing text and ratings: ", len(df))

Missing review data as percentage: 0.00% 
Missing rating information as percentage: 0.00%
After removing missing text and ratings:  1697533


In [7]:
# Delete implicitly incomplete entries: rows whose review text is the empty string
df = df.loc[df['text'] != '']
print("After removing empty text: ", len(df))

After removing empty text:  1697471


In [8]:
# Using language detection for detecting invalid texts
# NOTE(review): langdetect documents its detection as non-deterministic unless
# DetectorFactory.seed is set — consider seeding it so the counts are reproducible.
def detectLang(row):
    """Return the language code langdetect assigns to ``row['text']``.

    Parameters
    ----------
    row : mapping with a 'text' entry (a DataFrame row when this function is
        passed to ``df.apply(..., axis=1)``).

    Returns
    -------
    Whatever ``detect`` returns for the text, or ``None`` when detection raises.
    In the failure case the offending text is printed for later inspection.
    """
    try:
        return detect(row['text'])
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and a long-running
        # apply over the whole frame can be interrupted.
        print("exception:", row['text'])
        return None
# Run the detector row by row (axis=1 passes each row to detectLang). Rows for
# which detection raised end up with None, because detectLang returns no value
# on that path.
df['LANGUAGE'] = df.apply(detectLang, axis=1)
# Frequency of every detected language code.
df['LANGUAGE'].value_counts()

exception: :)
exception: ********* **** ********* *** ***** ****** ***** ****** ****** ******* ********** *********** ****** ****** *****  ***** **** **** ****** ****** *******
exception: ........ ...... .. . .... ... ...... ..... ... .... .... ..... .. .. .. .. .. .. .. .. .. ... .. .. .. .. .. .. .. . ... .. ... .... ... ... . ..... ... ... ... ... ... .


en    1693575
de        778
es        706
af        577
fr        242
ro        197
so        155
cy        129
ca        121
it        110
no        109
pt        100
sl         93
sk         91
da         68
nl         57
tl         57
et         48
vi         45
id         39
pl         38
sq         28
hr         24
fi         14
tr         14
hu         12
sv         11
sw         10
cs          8
lt          6
lv          6
Name: LANGUAGE, dtype: int64

In [16]:
# Rows that are repeated later in the frame (keep='last' leaves the final copy unmarked):
#   df_dup  -> same text AND same label
#   df_dup2 -> same text, regardless of label
df_dup = df.loc[df.duplicated(subset=['text', 'label'], keep='last')]
df_dup2 = df.loc[df.duplicated(subset=['text'], keep='last')]

In [17]:
# Word-count profile of the exact (text + label) duplicates.
# Should be empty when this cell is run after the duplicates have been deleted.
# create_word_count is a project helper; the following line relies on it having
# added a 'word_count' column to the frame it was given.
pf.create_word_count(df_dup)
df_dup['word_count'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: word_count, dtype: object

In [13]:
# Word-count profile of the rows whose text is repeated later in the frame.
# create_word_count is a project helper; the following line relies on it having
# added a 'word_count' column to the frame it was given.
pf.create_word_count(df_dup2)
df_dup2['word_count'].describe()

count    1120.000000
mean       41.163393
std       118.620139
min         1.000000
25%         2.000000
50%         2.000000
75%         3.000000
max      2210.000000
Name: word_count, dtype: float64

In [14]:
# Share of rows flagged as duplicates, relative to the current size of df
print("Duplicate text and label from reviews without missing information: {:.2%}".format(df_dup.shape[0] / df.shape[0]))
print("Duplicate text: {:.2%}".format(df_dup2.shape[0] / df.shape[0]))

Duplicate text and label from reviews without missing information: 0.06%
Duplicate text: 0.07%


In [15]:
# Keep only the last occurrence of every review text.
# One pass on 'text' is enough: the last row for a given text is necessarily the
# last row of its (text, label) pair too, so an extra preceding pass on
# ['text', 'label'] with keep='last' never changes the final result.
# NOTE(review): when the same text appears with different labels, only the label
# of the last occurrence survives — confirm that is acceptable.
df = df.drop_duplicates(subset=['text'], keep='last')
print("After removing duplicate entries and texts: ", len(df))

After removing duplicate entries and texts:  1696351


In [19]:
# Delete every review that was not recognised as English
df = df.loc[df.LANGUAGE == "en"]
print("After removing non-english text:", len(df))

After removing non-english text: 1693041


In [18]:
# Other heuristics: spot-checking the texts that made langdetect raise an exception
# Tests for invalid texts (kept commented out for reference)
# df[df.text.str.contains('&#1575;')]   # -> del 6536
# df[df.text.str.contains('&#1576;')]   # 6536
# df[df.text=='#2']
# df[df.text.str.contains('&#1588')]    # 6536
# df[df.text.str.contains('&#20294;')]
# df[df.text.str.contains('&#1603;')]   # 6536
# df=df[~df.text.str.contains('&#1575;')]
# len(df)

Unnamed: 0,text,label,LANGUAGE


In [21]:
# Last step: restrict the frame to the two columns that are still needed
df = df.loc[:, ['text', 'label']]

In [22]:
# Persist the cleaned frame as a pickle file next to the input data.
df.to_pickle('../Data/amazon_movie.pkl')


