In [2]:
import pandas as pd

### Dataset merging into one csv

In [2]:
def merge_dfs(
    df_list: list[pd.DataFrame],
    output_path: str = "./datasets/merged_reviews.csv",
) -> pd.DataFrame:
    """Concatenate review frames, clean them on ``content`` and save the result.

    The frames are stacked with a fresh 0..n-1 index, rows whose ``content``
    repeats an earlier row are dropped (first occurrence wins), and rows with
    a missing ``content`` are dropped. The index is *not* reset afterwards, so
    the ``id`` column written to the CSV keeps the row positions from the
    concatenated frame and may contain gaps.

    Args:
        df_list: Frames to merge; each must have a ``content`` column.
        output_path: Destination CSV. Defaults to the path the rest of the
            notebook reads from.

    Returns:
        The merged, de-duplicated frame (same rows and index as written).

    Raises:
        ValueError: If ``df_list`` is empty (raised by ``pd.concat``).
        KeyError: If the merged frame has no ``content`` column.
    """
    # Chained, non-mutating calls instead of ``inplace=True``: same result,
    # but no frame is modified behind the caller's back.
    merged_df = (
        pd.concat(df_list, ignore_index=True)
        .drop_duplicates(subset=["content"])
        .dropna(subset=["content"])
    )
    # The surviving index values become the "id" column of the CSV.
    merged_df.to_csv(output_path, index_label="id")
    return merged_df

In [3]:
# Per-app filtered review exports; each one is read into its own frame so the
# list can be handed to `merge_dfs`.
source_csv_paths = (
    "./datasets/Al_Quran_filtered_reviews.csv",
    "./datasets/Muslim_Bangla_filtered_reviews.csv",
    "./datasets/Muslim_Day_filtered_reviews.csv",
)
df_list = [pd.read_csv(csv_path) for csv_path in source_csv_paths]

In [4]:
# The merge itself is left commented out: the merged CSV is read back from
# disk instead of being rebuilt on every run.
# merged_df = merge_dfs(df_list)
# Reading the file back turns the index saved under "id" into a regular column.
merged_df = pd.read_csv("./datasets/merged_reviews.csv")
merged_df.shape

(74285, 5)

### LLM prompting

#### Output schema

In [7]:
from pydantic import BaseModel
from typing import Literal
import json


# Closed set of sentiment labels, shared by the per-aspect polarity and the
# overall review sentiment.
sentiment_types = Literal["positive", "negative", "neutral"]


# One aspect found in a review plus the sentiment expressed towards it.
# NOTE: documented with `#` comments on purpose - a class docstring would be
# emitted as a "description" entry in the JSON schema generated from this model.
class Aspect(BaseModel):
    aspect: str  # the aspect term
    polarity: sentiment_types  # "positive", "negative" or "neutral"


# Top-level shape of the model's answer for a single review.
# NOTE: documented with `#` comments on purpose - a class docstring would be
# emitted as a "description" entry in the JSON schema generated from this model.
class Output(BaseModel):
    translation: str = ""  # has a default, so it is not listed under "required" in the schema
    overall_sentiment: sentiment_types  # sentiment label for the review as a whole
    aspects: list[Aspect]  # the aspects found in the review, each with its polarity


# JSON-schema dict derived from `Output`; `get_sentiment` passes it to the API
# as the structured-output schema. Printed here for inspection.
output_schema = Output.model_json_schema()
print(json.dumps(output_schema, indent=2))

{
  "$defs": {
    "Aspect": {
      "properties": {
        "aspect": {
          "title": "Aspect",
          "type": "string"
        },
        "polarity": {
          "enum": [
            "positive",
            "negative",
            "neutral"
          ],
          "title": "Polarity",
          "type": "string"
        }
      },
      "required": [
        "aspect",
        "polarity"
      ],
      "title": "Aspect",
      "type": "object"
    }
  },
  "properties": {
    "translation": {
      "default": "",
      "title": "Translation",
      "type": "string"
    },
    "overall_sentiment": {
      "enum": [
        "positive",
        "negative",
        "neutral"
      ],
      "title": "Overall Sentiment",
      "type": "string"
    },
    "aspects": {
      "items": {
        "$ref": "#/$defs/Aspect"
      },
      "title": "Aspects",
      "type": "array"
    }
  },
  "required": [
    "overall_sentiment",
    "aspects"
  ],
  "title": "Output",
  "type": "object"
}


#### Prompting

In [None]:
from cerebras.cloud.sdk import Cerebras
import os, dotenv


# Load environment variables from a .env file (python-dotenv) so the
# CEREBRAS_API_KEY_* lookups done later with os.getenv can find them.
dotenv.load_dotenv()

def get_sentiment(client: Cerebras, review: str, model: str = "llama3.3-70b") -> dict:
    """Run aspect-based sentiment analysis on one review through the chat API.

    The request asks for a JSON answer constrained by ``output_schema`` and is
    sent with ``temperature=0``. The answer is only parsed, not validated
    against the ``Output`` model.

    Args:
        client: Cerebras SDK client used to send the request.
        review: Raw review text; it is embedded in the prompt between triple
            backticks.
        model: Model identifier passed to the API. Defaults to the model the
            notebook has always used.

    Returns:
        The parsed JSON object returned by the model.

    Raises:
        ValueError: If the response has no content, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the content
            is not valid JSON.
        Exception: Any error raised by the SDK call is propagated unchanged.
    """
    system_prompt = "You are good at aspect based sentiment analysis. You can extract the aspects and their polarity from the review text. The polarity can be positive, negative, or neutral."
    user_prompt = f"Extract the aspects and their polarity from the following review text enclosed in three backticks. Also, detect the overall sentiment of the review. If the review is not in English, then translate the review in English and give me the translation. Give me a json output as the response:\n```\n{review}\n```"

    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "output_schema",
                "strict": True,
                "schema": output_schema,
            },
        },
    )

    output = chat_completion.choices[0].message.content
    # Without this guard a missing body reaches json.loads(None) and surfaces
    # as an opaque TypeError instead of a parse-style error.
    if not output:
        raise ValueError("Model returned an empty response; expected a JSON document.")
    return json.loads(output)

In [7]:
# Show the column names of the merged frame.
merged_df.columns

Index(['id', 'content', 'replyContent', 'score', 'app_name'], dtype='object')

In [8]:
from datetime import datetime
from tqdm import tqdm

In [9]:
import logging
from datetime import datetime
import os

# Create logs directory if it doesn't exist
os.makedirs("logs", exist_ok=True)

# Setup logger
logger = logging.getLogger("SentimentAnalysisLogger")
logger.setLevel(logging.INFO)  # Set to INFO to capture both INFO and ERROR

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise attach one more handler each time and
# every record would be written several times. Start from a clean slate.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create file handler with time-stamped log filename (one file per day)
log_file = f"logs/sentiment_analysis_{datetime.now().strftime('%Y-%m-%d')}.log"
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)

# Create console handler (configured, but deliberately not attached below)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

# Add handlers to logger
logger.addHandler(file_handler)
# logger.addHandler(console_handler)

In [None]:
import time

# Checkpoint file: one row per review that already has a model answer.
DATASET_PATH = "./datasets/merged_reviews_with_sentiments.csv"
DATASET_COLUMNS = ["id", "content", "replyContent", "score", "app_name", "translation", "overall_sentiment", "aspects"]
# Pause used once every available key has reported an exhausted request quota,
# so the loop does not hammer the API with requests that are bound to fail.
REQUEST_QUOTA_COOLDOWN_SECONDS = 60

# Resume from the checkpoint when it exists, otherwise start an empty frame.
if os.path.exists(DATASET_PATH):
    dataset_df = pd.read_csv(DATASET_PATH)
else:
    dataset_df = pd.DataFrame(columns=DATASET_COLUMNS)

not_processed_ids = sorted(set(merged_df["id"]) - set(dataset_df["id"]))

# Index the reviews by id once; a boolean-mask scan of the whole frame per
# review made the loop quadratic. Keeping the first row per id matches the
# previous `.iloc[0]` behaviour.
reviews_by_id = merged_df.drop_duplicates(subset="id").set_index("id")

# Keep only the keys that are actually configured: a missing variable would
# otherwise sit in the rotation as None and abort the run when reached.
api_keys = [key for key in (os.getenv(f"CEREBRAS_API_KEY_{i}") for i in range(6)) if key]
if not api_keys:
    raise RuntimeError(
        "No Cerebras API keys found. Set CEREBRAS_API_KEY_0 ... CEREBRAS_API_KEY_5 "
        "in the environment or in the .env file."
    )
current_api_key_index = 0
client = Cerebras(api_key=api_keys[current_api_key_index])
# Request-quota errors seen in a row since the last success or cooldown.
request_quota_failures = 0

for review_id in tqdm(not_processed_ids, desc="Processing reviews"):
    review_row = reviews_by_id.loc[review_id]
    content = review_row["content"]
    # Nothing to analyse for empty or missing text.
    if pd.isna(content) or content == "":
        continue

    # Retry the same review until it succeeds, fails with an unknown error,
    # or no API key is left.
    while True:
        try:
            sentiment = get_sentiment(client, content)
            dataset_df.loc[len(dataset_df)] = [
                review_id,
                content,
                review_row["replyContent"],
                review_row["score"],
                review_row["app_name"],
                sentiment.get("translation", ""),
                sentiment["overall_sentiment"],
                [
                    {
                        "aspect": aspect["aspect"],
                        "polarity": aspect["polarity"]
                    }
                    for aspect in sentiment["aspects"]
                ]
            ]
            # Checkpoint after every review so an interrupted run can resume.
            dataset_df.to_csv(DATASET_PATH, index=False)
            logger.info(f"Processed id: {review_id}")
            request_quota_failures = 0
            break
        except Exception as e:
            # No extra save here: the frame has not changed since the last
            # successful checkpoint.
            error_msg = str(e)
            if "request_quota_exceeded" in error_msg:
                logger.error(f"Request quota exceeded for key serial {current_api_key_index}.")
                # Wrap before logging so the message never names a key that
                # does not exist.
                current_api_key_index = (current_api_key_index + 1) % len(api_keys)
                request_quota_failures += 1
                logger.info(f"Switching to API key serial {current_api_key_index}.")
            elif "token_quota_exceeded" in error_msg:
                logger.error(f"Token quota exceeded for key serial {current_api_key_index}. Please check your API key limits.")
                # This key is removed from the rotation for the rest of the run.
                api_keys.pop(current_api_key_index)
                if len(api_keys) == 0:
                    break
                current_api_key_index %= len(api_keys)
            else:
                logger.error(f"An error occurred: {error_msg}")
                # Bare raise keeps the original traceback.
                raise

            if request_quota_failures >= len(api_keys):
                logger.info(f"All remaining API keys are rate limited. Waiting {REQUEST_QUOTA_COOLDOWN_SECONDS} seconds before retrying.")
                time.sleep(REQUEST_QUOTA_COOLDOWN_SECONDS)
                request_quota_failures = 0
            client = Cerebras(api_key=api_keys[current_api_key_index])
    if len(api_keys) == 0:
        logger.error("No more API keys available. Exiting.")
        logger.info(f"Total reviews processed: {len(dataset_df)}")
        break

Processing reviews:  20%|█▉        | 8003/40350 [4:19:49<4:38:59,  1.93it/s]  

In [None]:
# RateLimitError: code 'token_quota_exceeded'
# RateLimitError: code 'request_quota_exceeded'
# Error code: 429 - {'message': 'Requests per hour limit exceeded - too many requests sent.', 'type': 'too_many_requests_error', 'param': 'quota', 'code': 'request_quota_exceeded'}

#### Experimentation

In [2]:
# Small hand-picked set of English reviews used to try the prompt manually.
reviews = [
    "Very good application to learn Al Quran word by word.",
    "This app is the best and beautiful made. Thanks to those who made it. It really shows that you guys are doing your work for Allah ﷻ. And may Allah ﷻ reward you all.",
    "Best",
    "First of all, I thank Allah that we can read the Quran through apps.However, this app is different from other apps.In this app, you can read the Quran and learn about every sentence along with its meaning. I would like to thank the person who made this app.Above all, I pray that you may be engaged in all the works of Allah Almighty for the benefit of His servants, and that Allah may accept you as a responsible person.",
    "Wonderful quran app with a very clear translation and tafsir of the Quran.",
]

In [3]:
from cerebras.cloud.sdk import Cerebras
from pprint import pprint
from dotenv import load_dotenv
import os

# Load environment variables from a .env file (python-dotenv) so the
# CEREBRAS_API_KEY_0 lookup below can find the key.
load_dotenv()

True

In [8]:
def get_sentiment(client: Cerebras, review: str, model: str = "llama3.3-70b") -> dict:
    """Run aspect-based sentiment analysis on one review (strict-aspect prompt).

    This definition rebinds the name ``get_sentiment``; the notebook defines a
    function of the same name earlier with a shorter prompt. This variant
    additionally instructs the model to take aspects verbatim from the review
    text. The request asks for a JSON answer constrained by ``output_schema``
    and is sent with ``temperature=0``. The answer is only parsed, not
    validated against the ``Output`` model.

    Args:
        client: Cerebras SDK client used to send the request.
        review: Raw review text; it is embedded in the prompt between triple
            backticks.
        model: Model identifier passed to the API. Defaults to the model the
            notebook has always used.

    Returns:
        The parsed JSON object returned by the model.

    Raises:
        ValueError: If the response has no content, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the content
            is not valid JSON.
        Exception: Any error raised by the SDK call is propagated unchanged.
    """
    system_prompt = "You are good at aspect based sentiment analysis. You can extract the aspects and their polarity from the review text. The polarity can be positive, negative, or neutral."
    user_prompt = f"Extract the aspects and their polarity from the following review text enclosed in three backticks. You must extract the aspect from the given text. Don't create additional words in the aspects. Also, detect the overall sentiment of the review. If the review is not in English, then translate the review in English and give me the translation. Give me a json output as the response:\n```\n{review}\n```"

    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "output_schema",
                "strict": True,
                "schema": output_schema,
            },
        },
    )

    output = chat_completion.choices[0].message.content
    # Without this guard a missing body reaches json.loads(None) and surfaces
    # as an opaque TypeError instead of a parse-style error.
    if not output:
        raise ValueError("Model returned an empty response; expected a JSON document.")
    return json.loads(output)

In [9]:
# Client for the manual experiments, built from the CEREBRAS_API_KEY_0 environment variable.
client = Cerebras(api_key=os.getenv("CEREBRAS_API_KEY_0"))

In [None]:
# Send every sample review to the model and show the review next to the
# parsed answer, separated by a ruler line.
for review in reviews:
    pprint(f"Review: {review}")
    absa_output = get_sentiment(client, review)
    pprint(f"ABSA output: {absa_output}")
    print("-------------------------------------------------------------------------------")

'Review: Very good application to learn Al Quran word by word.'
("ABSA output: {'translation': None, 'overall_sentiment': 'positive', "
 "'aspects': [{'aspect': 'application', 'polarity': 'positive'}, {'aspect': "
 "'Al Quran', 'polarity': 'positive'}]}")
-------------------------------------------------------------------------------------------------------------------------------------------------------------
('Review: This app is the best and beautiful made. Thanks to those who made '
 'it. It really shows that you guys are doing your work for Allah ﷻ. And may '
 'Allah ﷻ reward you all.')
("ABSA output: {'translation': None, 'overall_sentiment': 'positive', "
 "'aspects': [{'aspect': 'app', 'polarity': 'positive'}, {'aspect': 'work', "
 "'polarity': 'positive'}]}")
-------------------------------------------------------------------------------------------------------------------------------------------------------------
'Review: Best'
("ABSA output: {'translation': None, 'overall_se