In [None]:
# Cell 1: Install dependencies
!pip install transformers torch pandas scikit-learn


In [None]:
# Cell 2: Import libraries
import inspect
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Cell 3: Load dataset
# Use the review CSV when it is present; otherwise fall back to the small
# built-in sample below so the notebook still runs from a fresh kernel.
# (Previously the CSV was always read and then unconditionally overwritten.)
csv_path = Path('hindi_marathi_reviews.csv')

# Built-in sample reviews with one sentiment label per text.
data = {
    'text': [
        'कैमरा बहुत बढ़िया है फोटो क्वालिटी शानदार',
        'बैटरी बैकअप बहुत कम है',
        'परफॉर्मेंस एकदम जबरदस्त कोई लैग नहीं',
        'कीमत बहुत ज्यादा है वैल्यू फॉर मनी नहीं',
        'डिस्प्ले बहुत अच्छा है ब्राइटनेस भी बढ़िया',
        'बिल्ड क्वालिटी खराब है प्लास्टिक जैसा लगता है',
        'कॅमेरा खूप छान आहे फोटो क्वालिटी उत्तम',
        'बॅटरी बॅकअप कमी आहे',
        'परफॉर्मन्स उत्कृष्ट आहे गेमिंग चांगली चालते'
    ],
    'label': [2, 0, 2, 0, 2, 0, 2, 0, 2]  # 0=negative, 1=neutral, 2=positive
}

if csv_path.exists():
    # Option 1: real dataset loaded from CSV
    df = pd.read_csv(csv_path)
else:
    # Option 2: built-in sample dataset
    df = pd.DataFrame(data)

# Later cells read df['text'] and df['label']; fail here with a clear message
# instead of a KeyError further down.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

In [None]:
# Cell 4: Prepare data
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

In [None]:
# Cell 5: Load tokenizer and model
# The same checkpoint name is passed to both loaders; the classifier is
# requested with three output labels.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
# Cell 6: Tokenize data
# Both splits use identical tokenizer settings: truncation and padding enabled,
# with max_length=512.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

In [None]:
# Cell 7: Create dataset class
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a ReviewDataset.
train_dataset, val_dataset = (
    ReviewDataset(train_encodings, train_labels),
    ReviewDataset(val_encodings, val_labels),
)

In [None]:
# Cell 8: Training arguments
num_train_epochs = 3
train_batch_size = 8

# Estimate the number of optimizer steps (single device, no gradient
# accumulation) so warmup can be scaled to the dataset. A fixed 500-step warmup
# is longer than the whole run on a small dataset, which would keep the
# learning rate near zero for all of training. Warm up for ~10% of the run,
# capped at the original 500 steps.
steps_per_epoch = math.ceil(len(train_dataset) / train_batch_size)
total_train_steps = steps_per_epoch * num_train_epochs
warmup_steps = min(500, int(0.1 * total_train_steps))

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword; pick whichever the installed version accepts.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=8,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    **{eval_strategy_arg: "epoch"},
)

In [None]:
# Cell 9: Train model
# Build the Trainer from the model, the training arguments and both dataset
# splits, then start fine-tuning.
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=val_dataset)
trainer.train()


In [None]:
# Cell 10: Save model
# Write the model and the tokenizer into the same directory.
save_dir = './sentiment_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Cell 11: Test predictions
def predict_sentiment(text):
    """Classify the sentiment of ``text`` with the notebook's global model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Review text to classify.

    Returns:
        A ``(label, probabilities)`` tuple: ``label`` is one of 'Negative',
        'Neutral' or 'Positive', and ``probabilities`` is the list of softmax
        scores in that same order.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # The model may have been moved off the CPU during training; the input
    # tensors must live on the same device as the weights.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run in eval mode (dropout disabled) without autograd bookkeeping, then
    # restore whatever mode the model was in so the caller sees no side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    probs = torch.softmax(outputs.logits, dim=1)
    sentiment = torch.argmax(probs, dim=1).item()

    labels = ['Negative', 'Neutral', 'Positive']
    return labels[sentiment], probs[0].tolist()

# Smoke test: classify one sample review and print the text, the predicted
# label and the per-class probabilities.
test_text = "कैमरा बहुत बढ़िया है"
sentiment, probs = predict_sentiment(test_text)
print(
    f"Text: {test_text}",
    f"Sentiment: {sentiment}",
    f"Probabilities: {probs}",
    sep="\n",
)