In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import pymongo 
from pymongo import MongoClient
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import mlflow
from mlflow import log_metric, log_param, log_artifacts
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, brier_score_loss, log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import os, re
import warnings
warnings.filterwarnings('ignore')
stop_words = set(stopwords.words('english'))

In [2]:
def read_data(mongodb_client, database_name, collection_name):
    """ Reading scrapped reviews from MongoDB """
    # Creating list to store customer reviews
    customer_reviews = []

    # Initialize database 
    db = mongodb_client[database_name]
    collection = db[collection_name]

    # Reading collection
    for document in collection.find():
        if len(document) > 5: # Indicating that all the required keys are present in the dictionary
            customer_reviews.append(document)
        continue

    return pd.DataFrame(customer_reviews)

In [3]:
# Reading processed reviews from MongoDB
total_df = read_data(
    mongodb_client = MongoClient("mongodb://localhost:27017/"), 
    database_name = 'TrustPilotDatabase', collection_name = 'ProcessedReviewCollection'
)

# Renaming column names
df = total_df[['SpellingCorrected','Rating']]
required_df = df.rename(columns = {'SpellingCorrected' : 'Reviews'}).reset_index(drop=True)

# Filtering 'None' & 'N/A' reviews from the corpus
required_df = required_df[(required_df['Reviews']!= 'None') & (required_df['Reviews']!= 'N/A')].reset_index(drop=True)

In [4]:
# Extracting Class Label from Rating
def split_rating(input_rating):
    ''' Function to split an input rating & store it's integer class label '''
    return int(input_rating.split()[1])

# Removing extra whitespaces within reviews
def remove_whitespaces(text):
    """ Removing additional whitespaces"""
    return ' '.join(sent_tokenize(text.strip()))

required_df['Reviews'] = required_df['Reviews'].apply(lambda x : remove_whitespaces(x))
required_df['Rating'] = required_df['Rating'].apply(lambda x : split_rating(x))

### Machine Learning Modelling
In this section, we will be performing a `train-test split` on the review dataset, in order to evaluate the fit of our machine learning models.
Additionally, I will be using MLFlow as an experiment tracker. `MLflow` is a platform to streamline `machine learning development`, including `tracking experiments`, packaging code into reproducible runs, and `sharing and deploying models`. 

We will begin by assigning a `local server` with a suitable local URL: `http://127.0.0.1:5000` (in my case, but this is configurable as per the user's requirements). We will then create an` experiment` which will be visible to the developer, using the interactive MLFlow UI. 
Within, this experiment, we will be creating multiple MLFlow runs for `tracking, storing & visualizing machine learning artifacts, models & evaluation metrics`

In [5]:
# Setting up MLFlow Server for experiment tracking 
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Model Building - Without Augmentation")

<Experiment: artifact_location='mlflow-artifacts:/926171415805343055', creation_time=1720605883658, experiment_id='926171415805343055', last_update_time=1720605883658, lifecycle_stage='active', name='Model Building - Without Augmentation', tags={}>

#### MLOPS - Logging dataset artifacts

In [6]:
def logging_datasets(data_dict, destination_folder_name):
    '''
    Functions for logging & storing features & target labels of training & test datasets as ML Artifacts.
    Input : 
        @data_dict : Dictionary which stores the name & their respective dataframe as a key value pair
        @destination_folder_name : Name of the folder where the datasets will be stored. (Run ID must specified, optionally)
    '''
    # Creating MLflow run & logging artifact to the MLflow run
    active_directory = r"C:\Users\smathur\Desktop\Projects\TrustPilot"
    with mlflow.start_run():
        print("========= Initializing logging =========")
        # Iterating over data_dict to store each dataframe as an artifact
        for key, value in data_dict.items():
            print(f'Logging {key} as an artifact...')
            value.to_csv(key, index=False)
            mlflow.log_artifact(key, destination_folder_name)

            # Deleting the file from the current working directory 
            os.remove(os.path.join(active_directory, key))
            print('Current working directory cleanup completed. Logging successful')
        print("========= Artifact logging completed =========")

In [7]:
# Splitting reviews into train & test sets
X, y = required_df[['Reviews']], required_df[['Rating']]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, shuffle = True)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Creating a dictionary to store filenames & their respective dataframe as key value pairs
data_info = {
    'X_train.csv' : X_train, 
    'X_test.csv' : X_test, 
    'y_train.csv' : y_train, 
    'y_test.csv' : y_test
}

# Creating MLflow run & logging artifact to the MLflow run.
#logging_datasets(data_dict = data_info, destination_folder_name= 'WithoutAugmentation')

#### Pre-Processing for Bag of Words

In [8]:
# Creating vocabulary
def create_vocabulary(review_list):
    '''
    Function to create the vocabulary from an input list of reviews
    Input:
        @review_list : List of input reviews, used to create the vocabulary
    Output:
        vocabulary : Resulting vocabulary of the review sentences
    '''
    vocabulary = []
    for review in review_list:
        words = word_tokenize(review)
        for word in words:
            vocabulary.append(word)
    return vocabulary

# Building training vocabulary 
train_reviews = X_train['Reviews'].tolist()
train_vocab = create_vocabulary(train_reviews)
print(f'Number of tokens in training vocabulary : {len(train_vocab)}')
print(f'Number of unique tokens in training vocabulary : {len(set(train_vocab))}')

Number of tokens in training vocabulary : 1509350
Number of unique tokens in training vocabulary : 35255


In [9]:
# Lemmatization & Stop Word removal
def perform_lemmatization_stop_word_removal(input_review, lemmatizer, stop_words):
    ''' Function to perform lemmatization & removing stop words from the input review '''
    words = word_tokenize(input_review)
    final_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(final_words).strip()

# Performing lower casing
def lower_case(input_review):
    ''' Function to convert an input review into lower casing '''
    return input_review.lower()

# Removing Punctuation
def remove_punctuation(input_review):
    ''' Removing punctuation marks from reviews'''
    # Define the pattern to match punctuation characters
    pattern = r"[^\w\s]"
    # Remove punctuations using regex substitution
    text_without_punctuation = re.sub(pattern, "", input_review)

    # Removing additional whitespaces 
    text_words = text_without_punctuation.split()
    return ' '.join(text_words).strip()

In [10]:
# Instanciating lemmatizer for word normalization
lemmatizer = WordNetLemmatizer()
processed_sentences = []

# Pre-Processing train_sentences 
for sentence in train_reviews:
    lemmatized_sentence = perform_lemmatization_stop_word_removal(sentence,lemmatizer, stop_words)
    removed_punctuations = remove_punctuation(lemmatized_sentence)
    lower_cased = lower_case(removed_punctuations)
    processed_sentences.append(lower_cased)

# Building vocabulary of pre-processed training reviews
processed_train_vocab = create_vocabulary(processed_sentences)
print(f'Number of tokens in new vocabulary : {len(processed_train_vocab)}')
print(f'Number of unique tokens in new vocabulary : {len(set(processed_train_vocab))}')

Number of tokens in new vocabulary : 791077
Number of unique tokens in new vocabulary : 24739


#### Model Building: Bag-of-Words

In [11]:
## Creating a Bag-of-Words model
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,1))
bow_X_train = vectorizer.fit_transform(processed_sentences)

# Converting to array
transformed_bow_X_train = bow_X_train.toarray()
train_labels = np.array(y_train['Rating']).reshape(-1,1)

In [12]:
# Applying Pre-Processing steps to test set
test_reviews = X_test['Reviews'].tolist()
processed_test_sentences = []

# Pre-Processing train_sentences 
for sentence in test_reviews:
    lemmatized_sentence = perform_lemmatization_stop_word_removal(sentence,lemmatizer, stop_words)
    removed_punctuations = remove_punctuation(lemmatized_sentence)
    lower_cased = lower_case(removed_punctuations)
    processed_test_sentences.append(lower_cased)

# Transforming pre-processed test reviews
bow_X_test = vectorizer.transform(processed_test_sentences)

# Converting to array
transformed_bow_X_test = bow_X_test.toarray()
test_labels = np.array(y_test['Rating']).reshape(-1,1)

In [13]:
# Evaluating fit of the model
def evaluate_model(y_true , y_pred):
    ''' 
    Function to calculate various evaluation metrics for evaluating fit of the model
    Input:
        @y_true : Truth labels
        @y_pred : Predicted labels, which is the result of a machine learning algorithm
    Output:
        @eval_metrics : A dictionary containing evaluation metrics for the generated predictions
    '''
    
    print('Calculating global evaluation metrics...')
    CLASS_LABELS = ['1', '2','3','4','5']
    METRICS = ['precision', 'recall', 'f1-score'] 
    
    eval_metrics = {
        'Overall Accuracy' : np.round(accuracy_score(y_true, y_pred), 4) ,
        'Overall Precision' : np.round(precision_score(y_true, y_pred,average='macro'), 5),
        'Overall Recall' : np.round(recall_score(y_true, y_pred,average='macro'), 5),
        'Overall F1 Score' : np.round(f1_score(y_true, y_pred,average='macro'), 5),
    }
    print('Calculating evaluation metrics by class label')
    report = classification_report(y_true, y_pred, output_dict=True)
    for class_key in report.keys():
        if class_key in class_labels:
            for metric in report[class_key].keys():
                if metric in METRICS:
                    key = str(class_key)+'-' + metric.capitalize()
                    eval_metrics[key] = np.round(report[class_key][metric],5)
    print('Evaluation metrics calculated!')
    return eval_metrics

In [20]:
# Fitting scikit learn model 
def fit_sklearn_model(model_name ,model_instance , X_train, y_train , X_test, y_test, base_run_name):
    ''' 
    Utility function which accepts a machine learning algorithm as input, fits the algorithm to the training data, generates predictions,
    calculates evaluation metrics using MLFlow runs for tracking, logging & visualizing.
    
    Input:
        @model_name : Name of machine learning model
        @model_instance : Instance of machine learning model
        @X_train : Train feature matrix
        @y_train : Training class labels
        @X_test : Test feature matrix
        @y_test : Test class labels
        @base_run_name : Base name assigned to each MLFlow run
    '''
    print(f'########## {model_name} ##########')
    assigned_run_name = base_run_name + '-' + model_name
    with mlflow.start_run(run_name = assigned_run_name):
        print('Fitting on training data....')
        model_instance.fit(X_train, y_train)
        print('Fitting successful...Generating predictions...')
        model_preds = model_instance.predict(X_test)
        print('Predictions generated.')
        eval_metrics = evaluate_model(y_true = test_labels , y_pred = logit_preds)
        print('Logging evaluation metrics...')
        for key , value in eval_metrics.items():
            mlflow.log_metric(key, value)
        print('Logging completed. Logging model...')
        signature = infer_signature(
            X_train, 
            model_instance.predict(X_train)
        )
        mlflow.sklearn.log_model(
            model_instance, 
            model_name, 
            signature=signature
        )
        print('Model logging completed. Experiment successful.\n')

In [15]:
# fit_sklearn_model(
#     model_name = "Logistic Regression",
#     model_instance = LogisticRegression(),
#     X_train = transformed_bow_X_train,
#     y_train = train_labels,
#     X_test = transformed_bow_X_test,
#     y_test = test_labels,
#     base_run_name = "BOW"
# )

In [26]:
# Doc2Vec
# GloVe

#### Pre-Processing for TF-IDF

In [22]:
# Pre-Processing required for creating TF-IDF matrix differs slightly than the pre-processing required for creating a bag-of-words model. 
# We will not be removing stop words or punctuations from the training reviews. This is due to the fact that TF-IDF matrix will assign less weight 
# to frequently occuring tokens & more weight to uniquely occuring tokens.

# Lemmatization & Stop Word removal
def perform_lemmatization(input_review, lemmatizer, stop_words):
    ''' Function to perform lemmatization only on the input review '''
    words = word_tokenize(input_review)
    final_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(final_words).strip()

lemmatizer = WordNetLemmatizer()
tf_idf_processed_train_sentences = []

# Pre-Processing train_sentences for TF-IDF 
for sentence in train_reviews:
    lemmatized_sentence = perform_lemmatization(sentence,lemmatizer, stop_words) # Lemmatization 
    lower_cased = lower_case(lemmatized_sentence) # Lower Casing
    tf_idf_processed_train_sentences.append(lower_cased)

# Building vocabulary of TF-IDF pre-processed training reviews
processed_tfidf_train_vocab = create_vocabulary(tf_idf_processed_train_sentences)
print(f'Number of tokens in new vocabulary : {len(processed_tfidf_train_vocab)}')
print(f'Number of unique tokens in new vocabulary : {len(set(processed_tfidf_train_vocab))}')


## Creating a TF-IDF model & transforming processed tf-idf train reviews
tf_idf_vectorizer = TfidfVectorizer()
tfidf_X_train = tf_idf_vectorizer.fit_transform(tf_idf_processed_train_sentences)
transformed_tfidf_X_train = tfidf_X_train.toarray() # Converting to array

Number of tokens in new vocabulary : 1509729
Number of unique tokens in new vocabulary : 26427


In [23]:
## Pre-Processing Test reviews & converting to TF-IDF Matrix
processed_test_sentences = []
for sentence in test_reviews:
    lemmatized_sentence = perform_lemmatization(sentence,lemmatizer, stop_words) # Lemmatization 
    lower_cased = lower_case(lemmatized_sentence) # Lower Casing
    processed_test_sentences.append(lower_cased)

# Transforming pre-processed test reviews
tfidf_X_test = tf_idf_vectorizer.transform(processed_test_sentences)
transformed_tfidf_X_test = tfidf_X_test.toarray()

#### Model Building: Term Frequency - Inverse Document Frequency

In [25]:
# fit_sklearn_model(
#     model_name = "Logistic Regression",
#     model_instance = LogisticRegression(),
#     X_train = transformed_tfidf_X_train,
#     y_train = train_labels,
#     X_test = transformed_tfidf_X_test,
#     y_test = test_labels,
#     base_run_name = "TFIDF"
# )

#### Pre-Processing for GloVE Embeddings