# In [None]:  (notebook cell prompt left over from the export)


from transformers import AdamW, get_scheduler
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback
from google.colab import files

# Dataset locations; point these at your own copies of the CSV files.
train_path = "Training.csv"
test_path = "Testing.csv"

# Parse the training split first, then the testing split, into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)


# Names of the symptom columns that preprocess_function reads from each batch
# (it looks up examples[feature] for every entry, in this order) and joins into
# the text fed to the tokenizer.
#
# NOTE(review): "fluid_overload" is listed twice (once after
# "acute_liver_failure" and again after "history_of_alcohol_consumption"), so
# that column's value is emitted twice per row. If the CSV header itself
# contains the name twice, pandas.read_csv would presumably expose the second
# column as "fluid_overload.1" -- confirm against the Training.csv header.
# NOTE(review): "spotting_ urination", "foul_smell_of urine" and
# "dischromic _patches" contain embedded spaces; presumably they mirror the CSV
# headers verbatim -- verify before "fixing" them.
features = [
    "itching", "skin_rash", "nodal_skin_eruptions", "continuous_sneezing", "shivering", "chills",
    "joint_pain", "stomach_pain", "acidity", "ulcers_on_tongue", "muscle_wasting", "vomiting",
    "burning_micturition", "spotting_ urination", "fatigue", "weight_gain", "anxiety",
    "cold_hands_and_feets", "mood_swings", "weight_loss", "restlessness", "lethargy",
    "patches_in_throat", "irregular_sugar_level", "cough", "high_fever", "sunken_eyes",
    "breathlessness", "sweating", "dehydration", "indigestion", "headache", "yellowish_skin",
    "dark_urine", "nausea", "loss_of_appetite", "pain_behind_the_eyes", "back_pain",
    "constipation", "abdominal_pain", "diarrhoea", "mild_fever", "yellow_urine",
    "yellowing_of_eyes", "acute_liver_failure", "fluid_overload", "swelling_of_stomach",
    "swelled_lymph_nodes", "malaise", "blurred_and_distorted_vision", "phlegm",
    "throat_irritation", "redness_of_eyes", "sinus_pressure", "runny_nose", "congestion",
    "chest_pain", "weakness_in_limbs", "fast_heart_rate", "pain_during_bowel_movements",
    "pain_in_anal_region", "bloody_stool", "irritation_in_anus", "neck_pain", "dizziness",
    "cramps", "bruising", "obesity", "swollen_legs", "swollen_blood_vessels", "puffy_face_and_eyes",
    "enlarged_thyroid", "brittle_nails", "swollen_extremeties", "excessive_hunger",
    "extra_marital_contacts", "drying_and_tingling_lips", "slurred_speech", "knee_pain",
    "hip_joint_pain", "muscle_weakness", "stiff_neck", "swelling_joints", "movement_stiffness",
    "spinning_movements", "loss_of_balance", "unsteadiness", "weakness_of_one_body_side",
    "loss_of_smell", "bladder_discomfort", "foul_smell_of urine", "continuous_feel_of_urine",
    "passage_of_gases", "internal_itching", "toxic_look_(typhos)", "depression", "irritability",
    "muscle_pain", "altered_sensorium", "red_spots_over_body", "belly_pain", "abnormal_menstruation",
    "dischromic _patches", "watering_from_eyes", "increased_appetite", "polyuria", "family_history",
    "mucoid_sputum", "rusty_sputum", "lack_of_concentration", "visual_disturbances",
    "receiving_blood_transfusion", "receiving_unsterile_injections", "coma", "stomach_bleeding",
    "distention_of_abdomen", "history_of_alcohol_consumption", "fluid_overload",
    "blood_in_sputum", "prominent_veins_on_calf", "palpitations", "painful_walking",
    "pus_filled_pimples", "blackheads", "scurring", "skin_peeling", "silver_like_dusting",
    "small_dents_in_nails", "inflammatory_nails", "blister", "red_sore_around_nose",
    "yellow_crust_ooze"
]




# Balanced Few-Shot Sampling
def get_few_shot_data(df, n_shots=10, label_col="prognosis", random_state=42):
    """Return at most ``n_shots`` randomly sampled rows per class.

    Classes are emitted in order of first appearance in ``df``; a class with
    fewer than ``n_shots`` rows contributes all of its rows. Rows whose label
    is missing (NaN) are not sampled.

    Args:
        df: Source DataFrame containing a ``label_col`` column.
        n_shots: Maximum number of rows to keep for each class.
        label_col: Name of the column holding the class label.
        random_state: Seed passed to ``DataFrame.sample`` for every class so
            repeated calls return the same subset.

    Returns:
        A new DataFrame with the sampled rows and a fresh 0..n-1 index. An
        input without any labelled rows yields an empty frame with the same
        columns instead of raising.
    """
    # sort=False keeps first-appearance order; a single groupby pass replaces
    # one full boolean-mask scan of the frame per class.
    class_samples = [
        group.sample(n=min(len(group), n_shots), random_state=random_state)
        for _, group in df.groupby(label_col, sort=False)
    ]
    if not class_samples:
        # pd.concat([]) raises ValueError; hand back an empty, same-schema frame.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(class_samples).reset_index(drop=True)

# Draw the per-class few-shot subsets from both splits.
few_shot_train_df = get_few_shot_data(train_df)
few_shot_test_df = get_few_shot_data(test_df)

# Load pre-trained tokenizer and model; the classification head gets one
# output per distinct prognosis in the few-shot training subset.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(few_shot_train_df['prognosis'].unique()),
)


# Compute Class Weights for Imbalanced Dataset.
# `classes` uses the same unique() ordering that preprocess_function uses to
# turn a prognosis into a label id, so weight i belongs to label id i.
class_weights = compute_class_weight(
    'balanced',
    classes=few_shot_train_df['prognosis'].unique(),
    y=few_shot_train_df['prognosis']
)
# Fall back to the CPU when no GPU is present instead of crashing on "cuda".
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Preprocessing Function
def preprocess_function(examples):
    """Tokenize a batch of rows and attach integer class labels.

    For every row, the values of the columns named in ``features`` are
    converted to strings and joined with single spaces; the resulting texts
    are tokenized with the module-level ``tokenizer`` (padded/truncated to 512
    tokens). Each ``prognosis`` value is mapped to its position in
    ``few_shot_train_df['prognosis'].unique()``.

    Args:
        examples: Batch mapping column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output with an added ``"labels"`` list.

    Raises:
        ValueError: If a prognosis in the batch does not occur in the few-shot
            training subset and therefore has no label id.
    """
    # Build the label -> id table once per batch rather than recomputing
    # unique().tolist() and doing a linear .index() search for every row.
    label_to_id = {
        label: idx
        for idx, label in enumerate(few_shot_train_df['prognosis'].unique().tolist())
    }

    # Transpose the per-column lists into per-row tuples.
    rows = zip(*(examples[feature] for feature in features))
    concatenated_features = [" ".join(map(str, row)) for row in rows]

    tokenized = tokenizer(concatenated_features, padding="max_length", truncation=True, max_length=512)
    try:
        tokenized["labels"] = [label_to_id[label] for label in examples["prognosis"]]
    except KeyError as err:
        # Keep ValueError (what list.index raised before) but say which label.
        raise ValueError(
            f"prognosis {err.args[0]!r} is not present in the few-shot training labels"
        ) from err
    return tokenized

# Wrap each few-shot frame in a Dataset and run the batched preprocessing
# (training split first, then the test split).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame).map(preprocess_function, batched=True)
    for frame in (few_shot_train_df, few_shot_test_df)
)

# Custom Loss Function
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights``.

    The weights come from the module-level ``class_weights`` tensor and are
    captured once when the trainer is constructed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the weighted loss.

        Args:
            model: Model to call with the remaining ``inputs`` as keyword
                arguments.
            inputs: Batch mapping; its ``"labels"`` entry is removed before the
                forward pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because newer ``transformers``
                releases pass it to ``compute_loss``; unused here.

        Returns:
            The loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Keep the weight tensor on the logits' device so the loss works no
        # matter where class_weights was originally created.
        loss = self.loss_fn.to(logits.device)(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be restored when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,  # Lower batch size for few-shot training
    num_train_epochs=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Mixed precision only when CUDA is present
    weight_decay=0.01,  # Added for regularization
    save_total_limit=3,  # Limit the number of saved checkpoints
)

# Early Stopping Callback to monitor evaluation loss and stop early if no improvement
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations with no improvement before stopping
    early_stopping_threshold=0.01  # Threshold for considering loss improvement
)

# Define the trainer with the class-weighted loss and the early stopping
# callback. WeightedLossTrainer (not the stock Trainer) is used so the custom
# compute_loss and class_weights actually take effect.
# NOTE(review): test_dataset doubles as the early-stopping / best-checkpoint
# set here; a separate validation split would avoid selecting on test data.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # Add the early stopping callback here
)

# Train the model (fine-tunes `model` on train_dataset via the trainer above).
trainer.train()

# Evaluate the model; called without arguments, so it scores the eval_dataset
# given to the trainer (test_dataset). The metrics mapping is printed and then
# wrapped in a one-element list to form a single-row DataFrame.
results = trainer.evaluate()
print("Evaluation Results:", results)
results_df = pd.DataFrame([results])

# Export the results to an Excel file.
# NOTE(review): the /content/ path presumably assumes a Colab runtime.
results_df.to_excel('/content/evaluation_results.xlsx', index=False)

# Download the Excel file to your local machine (uses google.colab.files).
files.download('/content/evaluation_results.xlsx')


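# Notebook output from the last run: ModuleNotFoundError: No module named 'pandas'
# (the cell failed at import time; install the dependencies, e.g. `pip install pandas`, before running).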