# Model Training Using PyTorch

In [None]:
# !pip install wandb
# import pandas as pd
# from datasets import Dataset
# from transformers import AutoTokenizer
# from sklearn.preprocessing import LabelEncoder
# import torch
# from torch.utils.data import DataLoader
# from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler
# from tqdm.auto import tqdm
# from transformers import TrainingArguments, Trainer
# import wandb
# import numpy as np

# # Initialize WandB with your API key
# wandb.login()  # API key redacted: supply it via the WANDB_API_KEY environment variable, never in source

# # Load your dataset from CSV
# data = pd.read_csv('/kaggle/input/dianostic-final-dataset/final_dataset.csv')

# # Print data types of each column in the dataset
# print("Data types of each column in the dataset:")
# print(data.dtypes)
# print()

# # Encode labels as integers
# label_encoder = LabelEncoder()
# data['subreddit'] = label_encoder.fit_transform(data['subreddit'])

# # Load BERT tokenizer
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

# # Tokenize function
# def tokenize_function(examples):
#     return tokenizer(examples['post'], padding='max_length', truncation=True)

# # Convert Pandas DataFrame to Dataset
# dataset = Dataset.from_pandas(data)

# # Tokenize dataset
# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # Add labels to the tokenized dataset
# tokenized_datasets = tokenized_datasets.add_column("labels", data['subreddit'].tolist())

# # Split into train and test datasets
# train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
# train_dataset = train_test_split['train']
# eval_dataset = train_test_split['test']

# # Define DataLoader
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
# eval_dataloader = DataLoader(eval_dataset, batch_size=8)

# # Load model for sequence classification
# model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-multilingual-cased", num_labels=len(label_encoder.classes_))

# # Define optimizer
# optimizer = AdamW(model.parameters(), lr=5e-5)

# # Define learning rate scheduler
# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )

# # Install and import the evaluate module
# !pip install evaluate
# import evaluate

# # Define metric
# metric = evaluate.load("accuracy")

# # Compute metrics function
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="/kaggle/working/",
#     evaluation_strategy="steps",
#     eval_steps=1000,
#     logging_steps=1000,
#     num_train_epochs=num_epochs,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='/kaggle/working/logs',
# )

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
# )

# # Train the model with WandB integration
# trainer.train()

# # Finish WandB run
# wandb.finish()


In [None]:
# # Save the model and tokenizer
# model.save_pretrained("/kaggle/working/saved_model")
# tokenizer.save_pretrained("/kaggle/working/saved_model")

In [None]:
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # Load the model and tokenizer
# model_path = "/kaggle/working/saved_model"
# print(f"Loading model from: {model_path}")
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# print("Model loaded successfully.")

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# print("Tokenizer loaded successfully.")

# # Example usage:
# input_text = "Losing it. Lately I haven’t felt like a real person. And I don’t know if that’s the right way to describe it. I’m always in my head, always thinking and overthinking but never ever about anything that actually matters. Nothing excites me anymore, I can’t focus on anything for too long. All I do is sleep, I am exhausted 99% of the time. When I’m not at work I’ll sometimes sleep for literally my entire off day. I can’t keep up with chores. I have a hard time showering/brushing my teeth which makes me feel so disgusting. I hate my job. I’m getting very bored in my relationship even though I love him to death. I feel very alone because he doesn’t get it and we go through the same motions and conversations every day. Basically I don’t have any plans or directions for my life, everything scares me, and I have no idea what to do. Nothing seems to help and no one seems to understand. I just downloaded this app and decided to post here because I have no one to talk to and I am so scared."
# print(f"Input text: {input_text}")

# inputs = tokenizer(input_text, return_tensors="pt")
# print("Tokenized input:", inputs)

# outputs = model(**inputs)
# print("Model outputs:", outputs)

# import torch
# import numpy as np

# # Assuming 'outputs' is the SequenceClassifierOutput object as shown
# logits = outputs.logits
# predictions = torch.argmax(logits, dim=-1)
# predicted_class_index = predictions.item()

# # Decode the predicted class using label_encoder
# predicted_class = label_encoder.classes_[predicted_class_index]

# print("Predicted Class:", predicted_class)


In [None]:
# !zip -r file.zip /kaggle/working/saved_model

In [None]:
# from IPython.display import FileLink
# FileLink(r'file.zip')

In [None]:
# !pip list

# Model Training Using TensorFlow

In [None]:
import tensorflow as tf
import pandas as pd
from transformers import TFBertForSequenceClassification, BertTokenizerFast

# Locations of the training CSV and the pretrained checkpoint shared by the
# tokenizer and the classifier below
DATASET_CSV = '/kaggle/input/dianostic-final-dataset/final_dataset.csv'
CHECKPOINT = 'google-bert/bert-base-multilingual-cased'

# Class name -> integer id; its size also fixes the classifier's output width
label_map = {"addiction": 0, "anxiety": 1, "depression": 2, "PTSD": 3}

# Read the raw data
df = pd.read_csv(DATASET_CSV)

# Tokenizer and sequence-classification model from the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=len(label_map),
)

# Define a simple function to tokenize the texts
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Every sequence is padded or truncated to 512 tokens and the result is
    returned as TensorFlow tensors.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **tokenizer_options)

# Prepare the data
def prepare_dataset(df, label_map):
    """Build a ``tf.data.Dataset`` of (encodings, label id) pairs from ``df``.

    Args:
        df: DataFrame with a ``post`` column (text) and a ``subreddit``
            column (class name).
        label_map: Mapping from class name to integer label id.

    Returns:
        An unbatched ``tf.data.Dataset`` yielding ``(encodings dict, label)``.

    Raises:
        ValueError: If ``subreddit`` contains a value that is not a key of
            ``label_map``.
    """
    # Series.map yields NaN for values missing from the mapping; fail loudly
    # here instead of letting NaN labels reach the loss function.
    label_ids = df['subreddit'].map(label_map)
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, 'subreddit'].unique().tolist()
        raise ValueError(
            f"'subreddit' contains labels missing from label_map: {unknown}"
        )

    texts = df['post'].tolist()
    labels = label_ids.astype(int).tolist()

    encodings = tokenize_function(texts)
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))
    return dataset

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# reproducible between notebook runs.
train_df = df.sample(frac=0.8, random_state=42)
eval_df = df.drop(train_df.index)

train_dataset = prepare_dataset(train_df, label_map)
eval_dataset = prepare_dataset(eval_df, label_map)

# Shuffle (training only) and batch the datasets
train_dataset = train_dataset.shuffle(len(train_df)).batch(8)
eval_dataset = eval_dataset.batch(8)

# Compile the model; the loss is computed from raw logits
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train the model, evaluating on the held-out split after each epoch
num_epochs = 3
model.fit(
    train_dataset,
    validation_data=eval_dataset,
    epochs=num_epochs
)


In [None]:
# Export the trained model to the Kaggle working directory
model_export_dir = '/kaggle/working/diagnosis_model_bert-base-multilingual-cased'
model.save(model_export_dir)

In [None]:
# Store the tokenizer files in the same directory as the exported model
tokenizer_export_dir = '/kaggle/working/diagnosis_model_bert-base-multilingual-cased'
tokenizer.save_pretrained(tokenizer_export_dir)

In [None]:
# Reload the exported model from disk and print its summary
exported_model_dir = '/kaggle/working/diagnosis_model_bert-base-multilingual-cased'
new_model = tf.keras.models.load_model(exported_model_dir)
new_model.summary()

In [None]:
import tensorflow as tf
from transformers import BertTokenizerFast
import numpy as np

# The exported model and its tokenizer live in the same directory
ARTIFACT_DIR = '/kaggle/working/diagnosis_model_bert-base-multilingual-cased'

# Fine-tuned model, restored through Keras
new_model = tf.keras.models.load_model(ARTIFACT_DIR)

# Tokenizer saved next to the model
tokenizer = BertTokenizerFast.from_pretrained(ARTIFACT_DIR)

# Define a function to tokenize new input data
def tokenize_function(texts):
    """Encode ``texts`` as TensorFlow tensors, padded/truncated to 512 tokens."""
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='tf',
    )
    return encoded

# Sample posts to classify
new_texts = [
    "i am feeling sad and lonely",
    "i was sexually assualted when i was 8 years old, i can not cope up with that",
]

# Class name -> id, plus the inverse used to decode predicted ids
label_map = {'Addiction': 0, 'Anxiety': 1, 'Depression': 2, 'PTSD': 3}
reverse_label_map = {class_id: class_name for class_name, class_id in label_map.items()}

# Tokenize the samples and run them through the reloaded model
new_encodings = tokenize_function(new_texts)
predictions = new_model.predict(dict(new_encodings))

# The prediction output is a dict; the class scores are under 'logits'
logits = predictions['logits']

# Convert logits to probabilities and take the most likely class per sample
probs = tf.nn.softmax(logits, axis=-1)
predicted_labels = np.argmax(probs, axis=1)

# Decode class ids to their names
predicted_class_names = [reverse_label_map[class_id] for class_id in predicted_labels]

# Show the raw ids followed by the decoded names
print(predicted_labels)
print(predicted_class_names)


In [None]:
# Shell command: recursively archive the saved model directory into file.zip
!zip -r file.zip /kaggle/working/diagnosis_model_bert-base-multilingual-cased

In [None]:
from IPython.display import FileLink

# Leave the link as the cell's final expression so the notebook displays it
archive_path = r'file.zip'
FileLink(archive_path)

In [None]:
import os

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable instead of
# hard-coding it, so no secret ends up saved in the notebook.
# When the variable is unset or empty, token=None is passed and login()
# handles obtaining a token itself (see the huggingface_hub login docs).
login(
    token=os.environ.get("HF_TOKEN") or None,
    add_to_git_credential=True,
    new_session=False,
)



In [None]:
# Hub repository that receives both the model and its tokenizer
hub_repo_id = "SiddharthShukla48/MindAid_Diagnosis_bert-base-multilingual-cased"

model.push_to_hub(hub_repo_id)

tokenizer.push_to_hub(hub_repo_id)


In [None]:
# Load the tokenizer and model back from the Hub to verify the upload
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("SiddharthShukla48/MindAid_Diagnosis_bert-base-multilingual-cased")
# The repository was pushed from the TensorFlow model in this notebook, so it
# holds TensorFlow weights; from_tf=True lets the PyTorch class convert them
# on load instead of failing on a missing PyTorch checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "SiddharthShukla48/MindAid_Diagnosis_bert-base-multilingual-cased",
    from_tf=True,
)

In [None]:
# Shell command: list the packages installed in this environment
!pip list