#### ***Example usage of the DistilBERT sentiment analysis model***

In [5]:
from transformers import pipeline

#Build a sentiment-analysis pipeline backed by the DistilBERT SST-2 checkpoint
classifier = pipeline(
    "sentiment-analysis",
    model = "distilbert-base-uncased-finetuned-sst-2-english",
)

#Classify one sample sentence and print the predicted label together with its score
text = "This product works amazingly well!"
result = classifier(text)
print(result)

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9998623132705688}]


#### ***Importing the libraries***

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from tqdm import tqdm
import gc
import logging
from typing import List
import numpy as np
import pymongo
import pandas as pd
import math

##### ***Logger***

In [7]:
#Configure the root logger to emit INFO-level (and more severe) records,
#then create the logger used by the rest of the notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

##### ***Model usage***

In [4]:
class SentimentAnalyzer:
    """
    Batch sentiment classifier built on a pre-trained sequence-classification model.

    Texts are tokenized and scored in batches on the GPU when one is available
    (with the model in half precision), otherwise on the CPU. Predictions are
    written to a CSV file chunk by chunk so the full result set never has to be
    held in memory.
    """

    def __init__(self, model_name, batch_size = 128, max_length = 128):
        """
        Initialize the sentiment analyzer with a pre-trained model
        
        Args:
            model_name (str): Name/path of the pre-trained model to load
            batch_size (int): Number of texts to process at once
            max_length (int): Maximum length of input sequences

        Raises:
            ValueError: If batch_size is not positive
        """
        #Fail before the (slow) model download/load rather than in the first batch loop
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.batch_size = batch_size
        self.max_length = max_length
        #Determine if GPU is available, otherwise use CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        logger.info(f"Using device: {self.device}")
        
        #Load pre-trained model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)  # Move model to GPU if available

        #Convert model to half precision if using GPU to reduce memory usage
        if self.device == "cuda":
            self.model = self.model.half()
        
        # Set model to evaluation mode
        self.model.eval()

    def _batch_tokenize(self, texts: List[str]):
        """
        Tokenize a batch of texts
        
        Args:
            texts (List[str]): List of input texts to tokenize
            
        Returns:
            dict: Tokenized inputs including input_ids and attention_mask
        """
        return self.tokenizer(texts, 
                            max_length=self.max_length, 
                            padding=True, 
                            truncation=True, 
                            return_tensors="pt")

    def process_reviews(self, df, output_file, chunk_size):
        """
        Process reviews in chunks and batches to handle large datasets efficiently

        The output CSV has the columns "review_id" (taken from the DataFrame index),
        "sentiment" ("positive"/"negative") and "confidence" (highest class probability).
        An existing file at output_file is overwritten, so re-running the method
        never leaves duplicated rows or a second header line in the results.
        
        Args:
            df (pd.DataFrame): DataFrame containing reviews to analyze; must have a
                "text" column, and its index is used as the review identifier
            output_file (str): Path to save results
            chunk_size (int): Number of reviews to process before saving to disk

        Raises:
            ValueError: If chunk_size is not positive
        """
        #A negative step would make the range below empty and silently produce no output
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        total_rows = len(df)
        
        #Process DataFrame in chunks to manage memory
        for start_idx in tqdm(range(0, total_rows, chunk_size)):
            #iloc clamps the slice end, so the last (shorter) chunk needs no special case
            chunk_df = df.iloc[start_idx : start_idx + chunk_size]
            results = []
            
            #Process each chunk in smaller batches
            for batch_start in range(0, len(chunk_df), self.batch_size):
                batch_df = chunk_df.iloc[batch_start : batch_start + self.batch_size]

                #Extract text and IDs from batch. Missing texts are scored as empty strings
                #so a single null review cannot abort a multi-hour run
                texts = batch_df["text"].fillna("").astype(str).tolist()
                review_ids = batch_df.index.tolist()
                
                #Tokenize the batch
                encoded = self._batch_tokenize(texts)
                input_ids = encoded["input_ids"].to(self.device)
                attention_mask = encoded["attention_mask"].to(self.device)
                
                #Generate predictions
                with torch.no_grad():  # Disable gradient calculation for inference
                    if self.device == "cuda":
                        #Convert only the attention mask to half precision (input_ids must stay integer)
                        attention_mask = attention_mask.half()

                    #Get model outputs and convert to probabilities. The softmax is computed in
                    #float32 so confidences are not rounded to half-precision resolution
                    outputs = self.model(input_ids, attention_mask=attention_mask)
                    probs = torch.nn.functional.softmax(outputs.logits.float(), dim=-1)
                    #Get binary predictions (positive/negative) and confidence scores.
                    #NOTE(review): assumes class index 1 is the positive label - confirm for other checkpoints
                    batch_preds = (probs[:, 1] > 0.5).cpu().numpy()
                    batch_confs = probs.max(dim=1)[0].cpu().numpy()
                
                #Store batch results in DataFrame
                batch_results = pd.DataFrame({
                    "review_id": review_ids,
                    "sentiment": ["positive" if pred else "negative" for pred in batch_preds],
                    "confidence": batch_confs
                })
                results.append(batch_results)
                
                #Free up GPU memory after each batch
                if self.device == "cuda":
                    torch.cuda.empty_cache()
            
            #Save results for current chunk: the first chunk truncates the file and writes
            #the header, every later chunk appends without one
            is_first_chunk = start_idx == 0
            pd.concat(results).to_csv(output_file,
                                      mode="w" if is_first_chunk else "a",
                                      header=is_first_chunk,
                                      index=False)

            #Run garbage collection to free memory
            gc.collect()

##### ***Connect to MongoDB and load the reviews***

In [2]:
#Open a connection to the local MongoDB instance and select the "yelp" database
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["yelp"]

#Fetch only the identifier and the text of every review (the Mongo "_id" is excluded),
#materialize the cursor as a DataFrame and index it by "review_id"
reviews = (
    pd.DataFrame(db["reviews"].find({}, {"_id" : 0, "review_id" : 1, "text" : 1}))
    .set_index("review_id")
)
reviews

Unnamed: 0_level_0,text
review_id,Unnamed: 1_level_1
KU_O5udG6zpxOg-VcAEodg,"If you decide to eat here, just be aware it is..."
BiTunyQ73aT9WBnpR9DZGw,I've taken a lot of spin classes over the year...
saUsX_uimxRlCVr67Z4Jig,Family diner. Had the buffet. Eclectic assortm...
AqPFMleE6RsU23_auESxiA,"Wow! Yummy, different, delicious. Our favo..."
Sx8TMOWLNuJBWer-0pcmoA,Cute interior and owner (?) gave us tour of up...
...,...
H0RIamZu0B0Ei0P4aeh3sQ,Latest addition to services from ICCU is Apple...
shTPgbgdwTHSuU67mGCmZQ,"This spot offers a great, affordable east week..."
YNfNhgZlaaCO5Q_YJR4rEw,This Home Depot won me over when I needed to g...
i-I4ZOhoX70Nw5H0FwrQUA,For when I'm feeling like ignoring my calorie-...


##### ***Make predictions using the model***

In [11]:
#Load the SST-2 DistilBERT classifier with a batch of 512 texts, each truncated to 128 tokens
analyzer = SentimentAnalyzer(
    model_name = "distilbert-base-uncased-finetuned-sst-2-english",
    batch_size = 512,
    max_length = 128,
)

#Score every review, flushing the predictions to the CSV file every 10000 reviews
analyzer.process_reviews(
    df = reviews,
    output_file = "sentiment_results_v2.csv",
    chunk_size = 10000,
)

INFO:__main__:Using device: cuda
100%|██████████| 700/700 [18:54:25<00:00, 97.24s/it]   


In [None]:
#Reload the predictions written by the inference cell above ("sentiment_results_v2.csv").
#The MongoDB update cells below read `sentiment_dict`, so this cell must be executed before them;
#on a fresh kernel it also lets you skip the (very long) inference step if the file already exists
sentiment = pd.read_csv("sentiment_results_v2.csv").set_index("review_id")

#Maps each review_id to a dict holding the remaining CSV columns ("sentiment" and "confidence")
sentiment_dict = sentiment.to_dict(orient = "index")

#Show only the number of loaded predictions: displaying the whole dict would print one entry per review
len(sentiment_dict)

##### ***Loading the results into MongoDB (```reviews``` collection)***

In [None]:
#Updates are sent in batches to speed up the process
batch_size = 10000

sentiment_list = list(sentiment_dict.items())
num_batches = math.ceil(len(sentiment_list) / batch_size)

with tqdm(total = num_batches) as pbar:

    for i in range(0, len(sentiment_list), batch_size):
        batch_items = sentiment_list[i : i + batch_size]

        #One UpdateOne per review: match the document by "review_id" and set its sentiment fields
        update_ops = [
            pymongo.UpdateOne({"review_id" : review_id}, {"$set" : data})
            for review_id, data in batch_items
        ]

        #To update all the documents efficiently, the whole batch goes through a single "bulk_write" call,
        #which minimizes the number of db operations
        db["reviews"].bulk_write(update_ops)

        pbar.update(1)

##### ***Loading the results into MongoDB (```businesses_merged``` collection)***

In [None]:
#Build a {review_id: business_id} lookup from the "reviews" collection,
#projecting only the two fields that are needed
review_to_business_map = (
    pd.DataFrame(db["reviews"].find({}, {"_id" : 0, "review_id" : 1, "business_id" : 1}))
    .set_index("review_id")["business_id"]
    .to_dict()
)

In [None]:
#To speed up the update process, we create a temporary index on "reviews.review_id".
#It is unique, and "sparse" because some documents might not have the "reviews" sub-collection
db["businesses_merged"].create_index(
    [("reviews.review_id", pymongo.ASCENDING)],
    unique = True,
    sparse = True,
)

In [None]:
#Number of embedded-review updates sent to MongoDB per bulk call
batch_size = 2500

sentiment_list = list(sentiment_dict.items())
num_batches = math.ceil(len(sentiment_list) / batch_size)

with tqdm(total = num_batches) as pbar:

    for i in range(0, len(sentiment_list), batch_size):
        batch_items = sentiment_list[i : i + batch_size]

        #One UpdateOne per review. The filter first finds the document using "business_id",
        #then the element of the embedded "reviews" array using "review_id";
        #the positional "$" in the update targets that matched array element
        update_ops = [
            pymongo.UpdateOne(
                {
                    "business_id" : review_to_business_map[review_id],
                    "reviews.review_id" : review_id
                },
                {
                    "$set" : {
                        "reviews.$.sentiment" : data["sentiment"],
                        "reviews.$.confidence" : data["confidence"]
                    }
                }
            )
            for review_id, data in batch_items
        ]

        #To update all the documents efficiently, the whole batch goes through a single "bulk_write" call,
        #which minimizes the number of db operations
        db["businesses_merged"].bulk_write(update_ops)

        pbar.update(1)

In [12]:
##### ***Loading the results into MongoDB (```users_merged``` collection)***

In [None]:
#Build a {review_id: user_id} lookup from the "reviews" collection,
#projecting only the two fields that are needed
review_to_user_map = (
    pd.DataFrame(db["reviews"].find({}, {"_id" : 0, "review_id" : 1, "user_id" : 1}))
    .set_index("review_id")["user_id"]
    .to_dict()
)

In [None]:
#To speed up the update process, we create a temporary index on "reviews.review_id".
#It is unique, and "sparse" because some documents might not have the "reviews" sub-collection
db["users_merged"].create_index(
    [("reviews.review_id", pymongo.ASCENDING)],
    unique = True,
    sparse = True,
)

In [None]:
#Number of embedded-review updates sent to MongoDB per bulk call
batch_size = 2500

sentiment_list = list(sentiment_dict.items())
num_batches = math.ceil(len(sentiment_list) / batch_size)

with tqdm(total = num_batches) as pbar:

    for i in range(0, len(sentiment_list), batch_size):
        batch_items = sentiment_list[i : i + batch_size]

        #One UpdateOne per review. The filter first finds the document using "user_id",
        #then the element of the embedded "reviews" array using "review_id";
        #the positional "$" in the update targets that matched array element
        update_ops = [
            pymongo.UpdateOne(
                {
                    "user_id" : review_to_user_map[review_id],
                    "reviews.review_id" : review_id
                },
                {
                    "$set" : {
                        "reviews.$.sentiment" : data["sentiment"],
                        "reviews.$.confidence" : data["confidence"]
                    }
                }
            )
            for review_id, data in batch_items
        ]

        #To update all the documents efficiently, the whole batch goes through a single "bulk_write" call,
        #which minimizes the number of db operations
        db["users_merged"].bulk_write(update_ops)

        pbar.update(1)