# Data Cleaning

## Load the data

In [6]:
from toolbox import ToolBox

In [7]:
# Create the ToolBox instance whose load_data_sql method is used to load the reviews.
tb = ToolBox()

In [92]:
# Load the English-language rows of the user_reviews table
# (use_cache=True: presumably reuses a previously saved result — confirm in ToolBox).
df = tb.load_data_sql(
    table='user_reviews',
    where="lang = 'en'",
    use_cache=True,
)
# Keep only the rows that actually carry review text.
df = df[df['review'].notnull()]
df.head()

Loaded from cache


Unnamed: 0,game,username,date,grade,review,lang,helpful_nb,helpful_nb_total
0,007-legends-pc,Bughyman1000,2013-01-19,30,"Oh, my dear God! What is it with developers th...",en,0,0
1,007-legends-pc,DiabloZiri,2014-06-23,30,Seriously if you want to have a HUGE laugh wit...,en,0,0
2,007-legends-pc,DrugsMeazureTim,2014-12-11,0,godawful port of the xbox 360 version and ps3....,en,0,0
3,007-legends-pc,DustEater,2012-11-03,0,Agree. Worst game ever. Its a full copy of Cal...,en,8,8
4,007-legends-pc,evry1isacritic,2012-11-04,0,DO NOT BUY THIS GAME for the PC (or for any ot...,en,2,2


In [56]:
# Row count of the loaded frame, i.e. reviews left after dropping missing text.
print('Amount of reviews in English: ', len(df))

Amount of reviews in English:  211773


## Pre-processing
* Removing invalid records: docs with fewer than 200 characters.
* Removing invalid records: docs that do not correspond to our sample.
* Cleaning: special characters, numbers, emojis, URLs, email addresses, words in other languages, words with fewer than three characters.
* Stopwords
* Normalization
* Tokenize
* Sparse terms

### Count characters of review

In [28]:
def add_character_count_to_df(dataframe):
    '''
    Return a copy of ``dataframe`` with an extra ``characters`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains a ``review`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``characters``, the
        length of each review. Missing reviews give NaN. The frame passed
        in is left unmodified.

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``review`` column.
    '''
    # Measure the frame that was passed in rather than the notebook-level
    # ``df``, so the function works for any frame. ``assign`` returns a new
    # frame, which keeps the caller's data untouched and the cell idempotent.
    return dataframe.assign(characters=dataframe['review'].str.len())

# Build the frame that carries the 'characters' column (length of each review).
new_df = add_character_count_to_df(df)

### Remove special characters
* Emojis
* `<U+0080>` ==> Impact of 0.001 on 2/46 reviews, but nothing on the compound.

### Filter reviews with X amount of characters

In [None]:
def review_with_x_amount_characters(dataframe, column, maximum, minimum=0):
    '''
    Keep the rows whose text in ``column`` is strictly longer than
    ``minimum`` and strictly shorter than ``maximum`` characters.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the string column whose length is measured.
    maximum : int
        Exclusive upper bound on the character count.
    minimum : int, optional
        Exclusive lower bound on the character count (default 0).

    Returns
    -------
    pandas.DataFrame
        The matching rows, with an added ``<column>_character_count``
        column. The frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``dataframe``.
    '''
    count_column = column + '_character_count'

    # ``assign`` builds a new frame, so the helper column is not written
    # onto the caller's frame as a hidden side effect.
    counted = dataframe.assign(**{count_column: dataframe[column].str.len()})

    # Both bounds are exclusive. Rows with missing text have a NaN length,
    # which fails both comparisons and is therefore dropped.
    lengths = counted[count_column]
    return counted[(lengths > minimum) & (lengths < maximum)]

In [None]:
# Short reviews only: lengths strictly between 0 and 50 characters.
my_new_df = review_with_x_amount_characters(df, 'review', maximum=50, minimum=0)
# Pull the review texts out into a plain list for manual inspection.
check = list(my_new_df['review'])
check

In [104]:
# (rows, columns) of the short-review subset.
my_new_df.shape

(38593, 9)