In [14]:
import sqlite3
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
from tqdm import tqdm

In [15]:
# Load (Review, Reply) pairs from every source table in the SQLite database.
# Each query aliases its two columns to Review / Reply so the resulting
# frames share one schema and can be stacked directly.
# NOTE(review): SQLite builds that accept double-quoted string literals will
# return the literal text instead of failing if a quoted column name is
# misspelled -- confirm that "translation" exists in each table.
queries = [
    'SELECT "translation" AS Review, "Developer Reply Text" AS Reply FROM All_Reviews',
    'SELECT "translation" AS Review, "Reply" AS Reply FROM "Auto Reply Hadith"',
    'SELECT "translation" AS Review, "Reply" AS Reply FROM Auto_Reply_Reviews',
    'SELECT "translation" AS Review, "Reply" AS Reply FROM Auto_Reply_Reviews_Al_Quran'
]
conn = sqlite3.connect("Auto_Reply_v3.db")
try:
    dfs = [pd.read_sql_query(q, conn) for q in queries]
finally:
    # Release the handle even when a query raises (e.g. a missing table);
    # otherwise the connection stays open for as long as the kernel lives.
    conn.close()

# One combined frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)



In [16]:
# Pick the compute device: use the GPU when PyTorch reports CUDA, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    print("CUDA is not available. Using CPU.")


CUDA is available. Using GPU: NVIDIA GeForce RTX 4070 Ti SUPER


In [17]:
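# DISABLED (kept for reference only): earlier 3-label sentiment run using the
# 'cardiffnlp/twitter-roberta-base-sentiment' checkpoint named below.
# NOTE(review): this variant writes to robertaXLM.csv although it does not load
# the XLM-R checkpoint -- change the output filename before re-enabling it.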
# model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# sentiment_analyzer = pipeline(
#     'sentiment-analysis',
#     model=model_name,
#     tokenizer=tokenizer,
#     device=0 if device == "cuda" else -1,
# )
# print(f"Sentiment pipeline will run on: {device}")

# # Map model labels to human-readable
# label_map = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}

# # Sentiment analysis with tokenizer-based truncation

# def analyze_sentiment(text):
#     if not isinstance(text, str) or not text.strip():
#         return None
#     result = sentiment_analyzer(
#         text,
#         truncation=True,
#         max_length=512
#     )
#     label = result[0]['label']
#     return label_map.get(label, label)

# # Apply sentiment and save to CSV
# df['sentiment'] = df['Review'].apply(analyze_sentiment)
# df.to_csv('robertaXLM.csv', index=False)
# print(f"Processed {len(df)} rows. Results saved to robertaXLM.csv.")


In [18]:
# Build the 3-label sentiment classifier.
# model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
model_name = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'

# Generic LABEL_n ids -> human-readable sentiment names.
# NOTE(review): labels that are not keys of this map are passed through
# unchanged by analyze_sentiment -- confirm which label strings this
# checkpoint actually emits.
label_map = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
}

# Load the tokenizer explicitly and hand it to the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The pipeline takes a device index: 0 for the first GPU, -1 for the CPU.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model=model_name,
    tokenizer=tokenizer,
    device=-1 if device != "cuda" else 0,
)
print(f"Sentiment pipeline will run on: {device}")

# Sentiment analysis with tokenizer-based truncation
def analyze_sentiment(text):
    """Classify a single review and return its sentiment label.

    Args:
        text: The review text. Values that are not strings, and strings that
            are empty or whitespace-only, are skipped without calling the model.

    Returns:
        None for skipped input. Otherwise the label of the first prediction,
        translated through ``label_map``; labels that are not keys of the map
        are returned unchanged.
    """
    is_usable = isinstance(text, str) and bool(text.strip())
    if not is_usable:
        return None

    # Ask the pipeline to truncate the input to at most 512 tokens.
    predictions = sentiment_analyzer(text, truncation=True, max_length=512)
    raw_label = predictions[0]['label']

    if raw_label in label_map:
        return label_map[raw_label]
    return raw_label

# Score every review one at a time; tqdm wraps the column to show progress.
df['sentiment'] = list(
    map(analyze_sentiment, tqdm(df['Review'], desc="Analyzing Sentiment"))
)

# Write the labelled frame to disk without the index column.
df.to_csv('robertaXLM.csv', index=False)
print(f"Processed {len(df)} rows. Results saved to robertaXLM.csv.")

Device set to use cuda:0


Sentiment pipeline will run on: cuda


Analyzing Sentiment: 100%|██████████| 70460/70460 [03:10<00:00, 370.49it/s]


Processed 70460 rows. Results saved to robertaXLM.csv.
