#### ***Example usage of the DistilBERT sentiment analysis model***

In [1]:
from transformers import pipeline

#Build a ready-to-use sentiment pipeline on top of the SST-2 fine-tuned DistilBERT checkpoint
classifier = pipeline(
    "sentiment-analysis",
    model = "distilbert-base-uncased-finetuned-sst-2-english"
)

#Classify one sentence: the pipeline returns a list holding a {"label", "score"} dict
text = "This product works amazingly well!"
result = classifier(text)
print(result)

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9998623132705688}]


#### ***Importing the libraries***

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from tqdm import tqdm
import gc
import logging
from typing import List
import numpy as np
import pymongo
import pandas as pd
import math

##### ***Logger***

In [3]:
#Send INFO-level (and above) records to the default handler and create the notebook's logger
logging.basicConfig(
    level = logging.INFO
)
logger = logging.getLogger(__name__)

In [4]:
class SentimentAnalyzer:
    """Binary sentiment classifier that streams its predictions to a CSV file.

    Loads a sequence-classification checkpoint and its tokenizer, moves the model
    to the GPU (in half precision) when CUDA is available, and labels the rows of
    a DataFrame in batches. Class index 1 of the model output is treated as the
    "positive" label.
    """

    def __init__(self, model_name, batch_size = 128, max_length = 128):
        """Load the tokenizer and the model and put the model in evaluation mode.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            batch_size: Number of texts sent through the model per forward pass.
            max_length: Maximum number of tokens per text; longer texts are truncated.
        """
        self.batch_size = batch_size
        self.max_length = max_length
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        logger.info(f"Using device: {self.device}")
        
        #Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)

        #Half precision is only enabled on the GPU
        if self.device == "cuda":
            self.model = self.model.half()
        
        self.model.eval()

    def _batch_tokenize(self, texts: List[str]):
        """Tokenize ``texts`` into padded, truncated PyTorch tensors."""
        return self.tokenizer(texts, max_length = self.max_length, padding = True, truncation = True, return_tensors = "pt")

    def _predict_batch(self, batch_df):
        """Classify one batch and return a DataFrame with review_id, sentiment and confidence.

        The DataFrame index supplies the review ids and the "text" column the inputs.
        """
        texts = batch_df["text"].tolist()
        review_ids = batch_df.index.tolist()

        #Tokenize
        encoded = self._batch_tokenize(texts)
        input_ids = encoded["input_ids"].to(self.device)  #Token ids must stay integer
        attention_mask = encoded["attention_mask"].to(self.device)

        #Get predictions
        with torch.no_grad():
            if self.device == "cuda":
                #Only the attention mask is cast to half precision, never the token ids
                attention_mask = attention_mask.half()

            outputs = self.model(input_ids, attention_mask = attention_mask)
            #Upcast before softmax so the stored confidence is not rounded to fp16 on the GPU
            probs = torch.nn.functional.softmax(outputs.logits.float(), dim = -1)
            batch_preds = (probs[:, 1] > 0.5).cpu().numpy()
            batch_confs = probs.max(dim = 1)[0].cpu().numpy()

        return pd.DataFrame({
            "review_id" : review_ids,
            "sentiment" : ["positive" if pred else "negative" for pred in batch_preds],
            "confidence" : batch_confs
        })

    def process_reviews(self, df, output_file, chunk_size):
        """Classify every row of ``df`` and write the predictions to ``output_file``.

        Rows are processed in chunks of ``chunk_size``; each chunk is split into
        batches of ``self.batch_size`` and flushed to disk once it is complete, so
        only one chunk of results is held in memory at a time.

        Args:
            df: DataFrame indexed by review id with a "text" column.
            output_file: Path of the CSV to write. An existing file is overwritten
                by the first chunk, so a re-run never appends duplicate rows or a
                second header line to old results.
            chunk_size: Number of rows per chunk written to disk.
        """
        total_rows = len(df)
        
        #Process in chunks
        for start_idx in tqdm(range(0, total_rows, chunk_size)):
            chunk_df = df.iloc[start_idx : start_idx + chunk_size]
            chunk_results = []
            
            #Process each batch within the chunk
            for batch_start in range(0, len(chunk_df), self.batch_size):
                batch_df = chunk_df.iloc[batch_start : batch_start + self.batch_size]
                chunk_results.append(self._predict_batch(batch_df))
                
                #Clear GPU memory
                if self.device == "cuda":
                    torch.cuda.empty_cache()
            
            #Save chunk results: the first chunk starts a fresh file with the header, later chunks append
            is_first_chunk = start_idx == 0
            pd.concat(chunk_results).to_csv(output_file, mode = "w" if is_first_chunk else "a", header = is_first_chunk, index = False)
            gc.collect()

#### ***Connect to MongoDB and load the reviews***

In [5]:
#Open the local MongoDB instance and select the "yelp" database
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["yelp"]

#Pull only the review id and the text of every review ("_id" is returned as well, since it is not excluded),
#then index the frame by "review_id" so the id travels with each prediction
reviews = pd.DataFrame(
    db["reviews"].find({}, {"review_id" : 1, "text" : 1})
).set_index("review_id")
reviews

Unnamed: 0_level_0,_id,text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
KU_O5udG6zpxOg-VcAEodg,675167d1b78f0be484408abb,"If you decide to eat here, just be aware it is..."
BiTunyQ73aT9WBnpR9DZGw,675167d1b78f0be484408abc,I've taken a lot of spin classes over the year...
saUsX_uimxRlCVr67Z4Jig,675167d1b78f0be484408abd,Family diner. Had the buffet. Eclectic assortm...
AqPFMleE6RsU23_auESxiA,675167d1b78f0be484408abe,"Wow! Yummy, different, delicious. Our favo..."
Sx8TMOWLNuJBWer-0pcmoA,675167d1b78f0be484408abf,Cute interior and owner (?) gave us tour of up...
...,...,...
H0RIamZu0B0Ei0P4aeh3sQ,67516883b78f0be484ab345d,Latest addition to services from ICCU is Apple...
shTPgbgdwTHSuU67mGCmZQ,67516883b78f0be484ab345e,"This spot offers a great, affordable east week..."
YNfNhgZlaaCO5Q_YJR4rEw,67516883b78f0be484ab345f,This Home Depot won me over when I needed to g...
i-I4ZOhoX70Nw5H0FwrQUA,67516883b78f0be484ab3460,For when I'm feeling like ignoring my calorie-...


##### ***Make predictions using the model***

In [None]:
analyzer = SentimentAnalyzer("distilbert-base-uncased-finetuned-sst-2-english", batch_size = 512, max_length = 512)

#The predictions are binary (positive / negative) and are written to the file that the reload cell reads back
analyzer.process_reviews(reviews, "sentiment_results.csv", 10000)

INFO:__main__:Using device: cuda
  0%|          | 0/700 [00:00<?, ?it/s]

In [None]:
#Reload the predictions from file: the MongoDB update cells below read "sentiment_dict", so run this cell before them
sentiment = pd.read_csv("sentiment_results.csv").set_index("review_id")
sentiment_dict = sentiment.to_dict(orient = "index")

#Preview a few rows only: echoing the whole dictionary would render one entry per review
sentiment.head()

##### ***Loading the results into MongoDB (```reviews``` collection)***

In [None]:
#We'll use batches to speed up the update process
batch_size = 10000

sentiment_list = list(sentiment_dict.items())
num_batches = math.ceil(len(sentiment_list) / batch_size)

for batch_idx in tqdm(range(num_batches)):
    start = batch_idx * batch_size
    batch_items = sentiment_list[start : start + batch_size]

    #To update all the documents efficiently, we use "bulk_write", which minimizes the number of db round trips.
    #Each operation targets a different review, so the batch does not need to be applied in order:
    #"ordered = False" lets the server attempt every update instead of running them serially
    db["reviews"].bulk_write(
        [
            pymongo.UpdateOne({"review_id" : review_id}, {"$set" : data})
            for review_id, data in batch_items
        ],
        ordered = False
    )

##### ***Loading the results into MongoDB (```businesses_merged``` collection)***

In [None]:
#Build a {review_id: business_id} lookup, fetching only those two fields from the "reviews" collection
review_to_business_map = (
    pd.DataFrame(
        db["reviews"].find({}, {"_id" : 0, "review_id" : 1, "business_id" : 1})
    )
    .set_index("review_id")["business_id"]
    .to_dict()
)

In [None]:
#To speed up the update process, we create an index on "reviews.review_id".
#It is unique, and "sparse" because some documents might not have the embedded "reviews" array.
#NOTE(review): the index is meant to be temporary, but nothing in this notebook drops it afterwards
db["businesses_merged"].create_index(
    "reviews.review_id",
    unique = True,
    sparse = True
)

In [None]:
batch_size = 2500

sentiment_list = list(sentiment_dict.items())
num_batches = math.ceil(len(sentiment_list) / batch_size)

for batch_idx in tqdm(range(num_batches)):
    start = batch_idx * batch_size
    batch_items = sentiment_list[start : start + batch_size]

    #To update all the documents efficiently, we use "bulk_write", which minimizes the number of db round trips.
    #Each operation targets a different embedded review, so "ordered = False" lets the server
    #attempt every update instead of running them serially
    db["businesses_merged"].bulk_write(
        [
            pymongo.UpdateOne(
                {
                    #First find the document using "business_id", then the element of the embedded array using "review_id"
                    "business_id" : review_to_business_map[review_id],
                    "reviews.review_id" : review_id
                },
                {
                    "$set" : {
                        #"$" is the positional operator: it refers to the array element matched by the filter above
                        "reviews.$.sentiment" : data["sentiment"],
                        "reviews.$.confidence" : data["confidence"]
                    }
                }
            )
            for review_id, data in batch_items
        ],
        ordered = False
    )