In [1]:
import pandas as pd
import numpy as np
import pymongo 
from pymongo import MongoClient
import warnings
warnings.filterwarnings('ignore')

import emoji
import contractions
from langdetect import detect
from deep_translator import GoogleTranslator
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from autocorrect import Speller

In [2]:
def read_data(mongodb_client, database_name, collection_name):
    """Load scraped reviews from a MongoDB collection into a DataFrame.

    Args:
        mongodb_client: Client object that is indexed by database name.
        database_name: Name of the database holding the reviews.
        collection_name: Name of the collection to read.

    Returns:
        pandas.DataFrame built from every document that has more than
        five keys; shorter documents are skipped.
    """
    review_collection = mongodb_client[database_name][collection_name]

    # More than five keys is taken to mean every required field is present
    complete_documents = [
        doc for doc in review_collection.find() if len(doc) > 5
    ]

    return pd.DataFrame(complete_documents)

In [3]:
# Pull every stored review document out of MongoDB
total_df = read_data(
    mongodb_client=MongoClient("mongodb://localhost:27017/"),
    database_name='TrustPilotDatabase',
    collection_name='ReviewCollection',
)

# Keep only rows where every scraped attribute was actually found,
# then retain just the review text and its rating
reviews_df = total_df.loc[
    (total_df[['Username', 'Location', 'Review', 'Rating', 'Title']] != 'NOT FOUND').all(axis=1),
    ['Review', 'Rating'],
].reset_index(drop=True)

In [4]:
# Function to write dataframe to mongodb
def write_dataframe(df, databaseName, collectionName, mongo_uri="mongodb://localhost:27017/"):
    """Store every row of a DataFrame as a document in a MongoDB collection.

    Args:
        df: DataFrame to persist; each row becomes one document keyed by
            column name.
        databaseName: Name of the target database.
        collectionName: Name of the target collection.
        mongo_uri: Connection string of the MongoDB server. Defaults to the
            local instance used throughout this notebook.

    Returns:
        The string 'Completed' once the rows have been inserted (or when
        there was nothing to insert).
    """
    # to_dict builds all records in one pass instead of one .iloc lookup per
    # cell, and yields plain Python scalars rather than NumPy ones
    list_records = df.to_dict(orient='records')

    # insert_many rejects an empty list, so an empty frame is a no-op
    if not list_records:
        return 'Completed'

    mongo_client = MongoClient(mongo_uri)
    try:
        mongo_client[databaseName][collectionName].insert_many(list_records)
    finally:
        # Release the connection even when the insert fails
        mongo_client.close()
    return 'Completed'

# Data Pre-Processing

##### Replacing ratings from 1-5 to Positive & Negative

In [5]:
def split_rating_string(input_string):
    """Convert a rating string into a 'Negative' / 'Positive' label.

    The second space-separated token of ``input_string`` is parsed as an
    integer; values 1, 2 and 3 map to 'Negative', anything else to
    'Positive'.
    """
    star_count = int(input_string.split(' ')[1])
    return 'Negative' if star_count in (1, 2, 3) else 'Positive'

# Label each rating; missing ratings are left as NaN
reviews_df['NewRating'] = reviews_df['Rating'].map(split_rating_string, na_action='ignore')

##### Language Detection & Translation

In [7]:
# Using langdetect to detect language of input review
def detect_language(text):
    """Return the language code detected for ``text``.

    Args:
        text: Review text to classify.

    Returns:
        Whatever ``detect`` returns for the text, or the sentinel string
        'CheckEN' when detection raises an error.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort: a review that cannot be classified is flagged rather
        # than aborting the run. Catching Exception (not a bare except)
        # still lets KeyboardInterrupt / SystemExit stop a long loop.
        return 'CheckEN'

# Detect the language of every review and persist the annotated frame
review_list = reviews_df['Review'].tolist()
reviews_df['Language'] = [detect_language(review) for review in review_list]
write_dataframe(
    reviews_df,
    databaseName="TrustPilotDatabase",
    collectionName="LangDetectCollection",
)

'Completed'

In [8]:
# Reviews already detected as English
english_df = reviews_df.loc[reviews_df['Language'] == "en"].reset_index(drop=True)

# Everything else gets translated, except rows flagged with the CheckEN code
non_english_df = reviews_df.loc[
    ~reviews_df['Language'].isin(["CheckEN", "en"])
].reset_index(drop=True)

In [9]:
# Using deep_translator to perform language translation
def language_translation(text, translator_instance):
    """Translate a review to English sentence by sentence.

    Args:
        text: Review text to translate.
        translator_instance: Object exposing ``translate(sentence)``.

    Returns:
        The translated sentences joined by single spaces and stripped of
        surrounding whitespace. A sentence whose translation fails, or does
        not come back as a string, is kept in its original form.
    """
    translated_sentences = []
    for sentence in sent_tokenize(text):
        try:
            translated_sentence = translator_instance.translate(sentence)
        except Exception:
            # Best-effort: keep the original sentence on failure. Exception
            # (not a bare except) lets KeyboardInterrupt abort a long run.
            translated_sentence = sentence
        if not isinstance(translated_sentence, str):
            # A non-string result cannot be joined; fall back to the source text
            translated_sentence = sentence
        translated_sentences.append(translated_sentence)
    return " ".join(translated_sentences).strip()

In [10]:
# Instantiating Google Translator (source language auto-detected, target English)
translator_instance = GoogleTranslator(source='auto', target='en')
non_english_df['Translated_Reviews'] = non_english_df['Review'].apply(
    language_translation, args=(translator_instance,)
)

# Persisting the translated reviews in MongoDB
write_dataframe(
    non_english_df,
    databaseName="TrustPilotDatabase",
    collectionName="NonEnglishTranslatedCollection",
)

'Completed'

In [12]:
# Stack the English reviews on top of the translated ones; the translated
# text takes over the 'Review' column so both parts share one schema
translated_reviews = pd.concat(
    [
        english_df,
        non_english_df[['Translated_Reviews', 'Rating', 'Language']].rename(
            columns={'Translated_Reviews': 'Review'}
        ),
    ],
    axis=0,
    ignore_index=True,
)

# Persisting the combined frame in MongoDB
write_dataframe(
    translated_reviews,
    databaseName="TrustPilotDatabase",
    collectionName="TranslatedCollection",
)

'Completed'

##### Handling Emoticons

In [13]:
def encoding_emoticons(input_text):
    """Return ``input_text`` after passing it through ``emoji.demojize``.

    This swaps emoji characters for textual tokens so their meaning
    survives later text processing.
    """
    demojized_text = emoji.demojize(input_text)
    return demojized_text

# Encode the emoji of every review
translated_reviews['EmojiEncoded'] = [
    encoding_emoticons(review) for review in translated_reviews['Review']
]

# Persisting the emoji-encoded frame in MongoDB
write_dataframe(
    translated_reviews,
    databaseName="TrustPilotDatabase",
    collectionName="EmoticonEncodedCollection",
)

'Completed'

##### Expanding contractions

In [15]:
def fix_contractions(input_text):
    """Expand contractions in ``input_text`` one sentence at a time.

    The text is split with ``sent_tokenize``, each sentence is passed
    through ``contractions.fix``, and the results are joined with single
    spaces and stripped of surrounding whitespace.
    """
    expanded_sentences = [
        contractions.fix(sentence) for sentence in sent_tokenize(input_text)
    ]
    return ' '.join(expanded_sentences).strip()

# Fixing contractions: expand them in the emoji-encoded text.
# (Previously this mapped encoding_emoticons again, so fix_contractions
# was never applied and the column was a copy of 'EmojiEncoded'.)
translated_reviews['FixedContraction'] = list(map(fix_contractions, translated_reviews['EmojiEncoded']))

##### Spelling Correction

In [16]:
def autospell(text):
    """Spell-correct ``text`` word by word.

    The text is split with ``word_tokenize``, each token is passed to the
    module-level ``spell`` callable, and the corrected tokens are joined
    with single spaces.
    """
    return " ".join(spell(token) for token in word_tokenize(text))

# Fixing spelling errors in input reviews.
# `spell` is the module-level speller that autospell() calls for each token.
spell = Speller(lang='en')
# (Previously this mapped encoding_emoticons, so autospell was never applied
# and the column was a copy of 'FixedContraction'.)
translated_reviews['SpellingCorrected'] = list(map(autospell, translated_reviews['FixedContraction']))
write_dataframe(translated_reviews, databaseName = "TrustPilotDatabase", collectionName = "SpellingCorrectedCollection")

'Completed'

##### Storing processed reviews

In [27]:
# Recompute the sentiment label for every row so that rows lacking a
# 'NewRating' value are filled in; missing ratings stay NaN
translated_reviews['NewRating'] = translated_reviews['Rating'].map(
    split_rating_string, na_action='ignore'
)
write_dataframe(
    translated_reviews,
    databaseName="TrustPilotDatabase",
    collectionName="ProcessedReviewCollection",
)

'Completed'