### LLAMA model 
- Classify the seriousness of the subreddits
- Classify the reddit post  


In [None]:
# Install the deep-learning stack into the kernel's environment.
# NOTE(review): later cells also import `datasets`, `dotenv`, `pymongo` and `pandas`,
# which are not installed here — confirm they are provided some other way.
%pip install torch torchvision torchaudio
%pip install transformers
# A Hugging Face account and access token are needed to download the LLaMA models
%pip install huggingface_hub


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Download the model locally 

In [6]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
# load_dotenv was imported but never called, so a token stored only in .env
# was never visible to os.getenv below.
load_dotenv()

# Authenticate with Hugging Face.
# The variable name is kept exactly as the project spells it so existing .env files keep working.
hf_token = os.getenv("HugginFace_Token")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Fasfous\.cache\huggingface\token
Login successful


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from tqdm import tqdm



# Specify the model repository
model_name = "meta-llama/Llama-3.2-1B"

# Download the tokenizer. `progress_bar` is not a from_pretrained option (download
# progress is shown automatically), so the stray keyword was dropped; the tokenizer
# now authenticates the same way as the model.
print("Downloading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# Download the model weights
print("Downloading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    local_files_only=False,  # Allow fetching from the Hugging Face Hub
    token=True,  # Use the token stored by huggingface_hub.login()
)


Downloading tokenizer...


In [17]:
# Persist the weights and tokenizer files next to the notebook so later cells can
# load them from disk.
save_dir = "./Llama-3.2-1B"

print("Saving model locally...")
model.save_pretrained(save_dir)

print("Saving tokenizer locally...")
tokenizer.save_pretrained(save_dir)

Saving model locally...
Saving tokenizer locally...


('./Llama-3.2-1B\\tokenizer_config.json',
 './Llama-3.2-1B\\special_tokens_map.json',
 './Llama-3.2-1B\\tokenizer.json')

### Fine-Tuning the Model


In [1]:
from datasets import load_dataset

# Load the "jsfactory/mental_health_reddit_posts" dataset via `datasets.load_dataset`.
# Later cells show a single "train" split with 'body' and 'subreddit' columns.
dataset = load_dataset("jsfactory/mental_health_reddit_posts")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the number of training rows, then the number of splits in the dataset.
for size in (len(dataset["train"]), len(dataset)):
    print(size)

24000
1


In [2]:

# Keep a seeded random 10% sample of the training split.
# Note: this rebinds `dataset` from the full dataset to the sampled split.
full_train = dataset["train"]
subset_size = int(0.1 * len(full_train))
dataset = full_train.shuffle(seed=42).select(range(subset_size))



In [3]:
# Split into train and eval (80% for training, 20% for evaluation).
# A stray `dataset` token fused to the front of this comment line was removed.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [5]:
# Show the training row count followed by the evaluation split summary.
print(len(train_dataset), eval_dataset, sep="\n")

1920
Dataset({
    features: ['body', 'subreddit'],
    num_rows: 480
})


In [15]:
from transformers import AutoTokenizer

# Label value ignored by the causal-LM loss; used to mask padding positions.
IGNORE_INDEX = -100


def _tokenize_with_labels(examples, max_length):
    """Tokenize the "body" field and attach causal-LM labels.

    Labels mirror ``input_ids`` except at padded positions (attention mask 0),
    which are set to ``IGNORE_INDEX`` so padding does not contribute to the loss.
    Works for a single example (string body) and for a batch (list of bodies).
    """
    encoded = tokenizer(
        examples["body"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    ids, mask = encoded["input_ids"], encoded["attention_mask"]
    if ids and isinstance(ids[0], list):
        # Batched call: one list of token ids per post.
        encoded["labels"] = [
            [tok if keep else IGNORE_INDEX for tok, keep in zip(row_ids, row_mask)]
            for row_ids, row_mask in zip(ids, mask)
        ]
    else:
        # Single-example call: a flat list of token ids.
        encoded["labels"] = [tok if keep else IGNORE_INDEX for tok, keep in zip(ids, mask)]
    return encoded


def preprocess_function(examples):
    """Tokenize posts to 128 tokens and add padding-masked labels."""
    return _tokenize_with_labels(examples, max_length=128)


def preprocess_data(example):
    """Tokenize posts to 512 tokens and add padding-masked labels.

    The previous version returned no ``labels`` column, so a Trainer fed with
    these datasets had nothing to compute a causal-LM loss from.
    """
    return _tokenize_with_labels(example, max_length=512)


model_path = "./Llama-3.2-1B"
# The tokenizer must exist before the map calls below; the helpers read it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token if not defined

# 128-token versions. `tokenized_test_datasets` actually holds the TRAINING split;
# the name is kept because a later cell refers to it.
tokenized_test_datasets = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_datasets = eval_dataset.map(preprocess_function, batched=True)

# 512-token versions; these rebind train_dataset / eval_dataset.
train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)




[A
[A
Map: 100%|██████████| 1920/1920 [00:00<00:00, 6075.32 examples/s]

Map: 100%|██████████| 480/480 [00:00<00:00, 5922.89 examples/s]

[A
[A
Map: 100%|██████████| 1920/1920 [00:00<00:00, 3522.90 examples/s]

[A
Map: 100%|██████████| 480/480 [00:00<00:00, 3220.79 examples/s]


#### use CPU

In [6]:
from transformers import AutoModelForCausalLM

# Read the locally saved LLaMA checkpoint and size the embedding matrix to the
# tokenizer's vocabulary; the resulting embedding layer is the cell's output.
model = AutoModelForCausalLM.from_pretrained(model_path)
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


Embedding(128256, 2048)

In [7]:
# Training configuration for the CPU run
from transformers import TrainingArguments

cpu_training_config = {
    "output_dir": "./fine_tuned_llama",  # where checkpoints are written
    "eval_strategy": "epoch",            # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "num_train_epochs": 3,
    "save_strategy": "epoch",            # checkpoint once per epoch
    "save_total_limit": 2,               # keep at most two checkpoints
}
training_args = TrainingArguments(**cpu_training_config)


In [None]:

# Initialize the Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

# Create a DataCollator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# `args` was missing, so the Trainer ignored `training_args` and fell back to its
# own default arguments (learning rate, epochs, output directory).
# The datasets must provide a `labels` column (or the collator must create one)
# for the causal-LM loss to be computed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Pass the eval dataset here
    data_collator=data_collator,
)

# Start training
trainer.train()


#### Use GPU

In [5]:
import torch
import os


# Gather the CUDA facts first, then report them.
cuda_ok = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_ok else "No CUDA device found"

print("Torch CUDA available:", cuda_ok)
print("Torch CUDA version:", torch.version.cuda)
print("Device name:", device_label)


Torch CUDA available: True
Torch CUDA version: 12.1
Device name: NVIDIA GeForce GTX 1050


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import torch

# Abort early when no CUDA device is usable; the GPU cells below require one.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    raise EnvironmentError("GPU is not available. Ensure CUDA is installed and PyTorch is using it.")


In [7]:

# Read the checkpoint and tokenizer from disk, then place the model on the GPU
model_path = "./Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to("cuda")


In [8]:

# Create a DataCollator
# NOTE(review): presumably this collator only pads/batches the tokenized features and
# does not create a `labels` field, so the datasets must already carry labels — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [18]:

# Define Training Arguments with GPU support.
# Uses `eval_strategy`, the same keyword as the CPU configuration earlier in this
# notebook; `evaluation_strategy` is the older spelling of the same option.
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",  # Output directory for model checkpoints
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,   # Batch size per GPU
    num_train_epochs=3,
    save_strategy="epoch",           # Save model after each epoch
    save_total_limit=2,              # Limit to two model checkpoints
    logging_dir="./logs",            # Directory for logs
    logging_steps=10,
    report_to="none",                # Disable reporting to external services like WandB
    fp16=True,                       # Enable mixed precision for faster training on GPUs
)




In [19]:
# Ensure the tokenizer has a padding token.
# Reuse the EOS token, as the preprocessing cell does, instead of adding a new
# '[PAD]' token: a new token grows the vocabulary past the model's embedding
# table unless resize_token_embeddings is called, which this cell never does.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initialize the Trainer on the 128-token datasets.
# `tokenized_test_datasets` holds the training split despite its name.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_test_datasets,
    eval_dataset=tokenized_eval_datasets,
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Start training
trainer.train()




OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 9.23 GiB is allocated by PyTorch, and 929.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

another test for GPU

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import torch

# Prefer the GPU when one is available, otherwise fall back to the CPU
device_kind = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_kind)
print(f"Using {device} device")

Using cuda device


In [22]:
import torch

# Clear cache
# NOTE(review): presumably this only returns cached-but-unused blocks to the driver;
# memory held by objects still referenced in the kernel (e.g. `model`) would stay
# allocated — confirm against the PyTorch memory-management notes.
torch.cuda.empty_cache()

# Reset memory
# NOTE(review): these look like peak-memory statistic resets rather than something
# that frees memory; newer PyTorch versions may prefer a single
# reset_peak_memory_stats() call — confirm.
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

print("CUDA memory cleaned.")

CUDA memory cleaned.


In [23]:

# Read the checkpoint and tokenizer from disk, then place the model on the selected device
model_path = "./Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)


: 

In [21]:

# Create a DataCollator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments.
# Uses `eval_strategy`, the same keyword as the CPU configuration earlier in this
# notebook; `evaluation_strategy` is the older spelling of the same option.
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # Enable mixed precision for faster GPU training
)




In [None]:

# Ensure the tokenizer has a padding token.
# Reuse the EOS token, as the preprocessing cell does, instead of adding a new
# '[PAD]' token: a new token grows the vocabulary past the model's embedding
# table unless resize_token_embeddings is called, which this cell never does.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initialize the Trainer.
# The datasets must provide a `labels` column (or the collator must create one)
# for the causal-LM loss to be computed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Start training
trainer.train()


In [1]:
import os

# Both variables are set BEFORE torch is imported so the OpenMP and CUDA allocator
# runtimes that torch loads can see them; setting them afterwards may be too late.
# Set the environment variable to avoid OpenMP runtime error
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Set the environment variable to avoid fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Clear cache
torch.cuda.empty_cache()

# Load the model and tokenizer
model_path = "./Llama-3.2-1B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)  # Move model to the selected device
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ensure the tokenizer has a padding token.
# Reuse the EOS token instead of adding '[PAD]': a new token would grow the
# vocabulary past the model's embedding table, and no resize_token_embeddings
# call follows here.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Reduce the batch size in your training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Adjust this value based on your GPU memory
    per_device_eval_batch_size=4,   # Adjust this value based on your GPU memory
    num_train_epochs=1,
    logging_dir="./logs",
)


: 

In [None]:

# Assemble the Trainer inputs, build the Trainer, and launch fine-tuning
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": data_collator,
    "processing_class": tokenizer,
}
trainer = Trainer(**trainer_inputs)

trainer.train()

In [None]:

# Same Trainer construction as the previous cell: build it from the current
# model, arguments, datasets and collator, then launch fine-tuning
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)
trainer = Trainer(**trainer_inputs)

trainer.train()

In [None]:
import torch

# Report which accelerator (if any) PyTorch will use
if not torch.cuda.is_available():
    print("CUDA is not available. Using CPU.")
else:
    # Index of the currently selected CUDA device and its name
    current_device = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Using GPU: {gpu_name} (Device ID: {current_device})")

Using GPU: NVIDIA GeForce GTX 1050 (Device ID: 0)


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and model (already saved locally)
tokenizer = AutoTokenizer.from_pretrained("./Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("./Llama-3.2-1B")

# Set pad token ID to eos token ID if pad token ID is not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Test the model
input_text = (
    "Answer the following question with a single word: yes or no.\n"
    "Question: Is Paris the Capital of Spain?"
)

# Let the tokenizer build the attention mask. Deriving it by comparing ids with
# pad_token_id would also mask any genuine EOS token, because pad and EOS share an id.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

# Generate a response. `max_new_tokens` bounds only the generated continuation;
# `max_length` also counted the prompt tokens, leaving little room for an answer.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

Answer the following question with a single word: yes or no.
Question: Is Paris the Capital of Spain?


#### Connect to MongoDB

In [1]:
import os

from dotenv import load_dotenv
from pymongo import MongoClient

# Load environment variables from .env file
load_dotenv()

# Connection settings come from the environment when present; the fallbacks are
# the values previously hard-coded here, so existing setups keep working.
mongo = os.getenv("MONGO_HOST", '127.0.0.1')
mongo_username = os.getenv("MONGO_USERNAME", 'admin')
mongo_password = os.getenv("MONGO_PASSWORD", 'admin')

try:
    # Connect to MongoDB
    myclient = MongoClient(
        "mongodb://" + mongo + ":27017/",  # Mongo URI format
        username=mongo_username,
        password=mongo_password,
        serverSelectionTimeoutMS=5000,  # fail quickly when the server is unreachable
    )
    # MongoClient connects lazily, so building it proves nothing. A ping forces a
    # round trip, making the success message (and the except branch) meaningful.
    myclient.admin.command("ping")
    db = myclient['reddit']
    print("Connected to MongoDB successfully!")
except Exception as e:
    print("An error occurred while connecting to MongoDB:", e)

Connected to MongoDB successfully!


In [2]:
#import
import pandas as pd

# Fetch every stored post (without Mongo's _id field) and tabulate the result
query = db.reddit_posts.find({}, {'_id': 0})
posts_registered = list(query)
posts_registered_df = pd.DataFrame(posts_registered)
posts_registered_df.head()

Unnamed: 0,id,title,author,score,num_comments,upvote_ratio,url,subreddit,created_at,self_text,searchQuery
0,1et3kj0,Diagnosed with Inattentive ADHD at 31. Explain...,amadnomad,392,107,0.98,https://www.reddit.com/r/ADHD/comments/1et3kj0...,ADHD,1723749000.0,Please go out and get tested if you are still ...,adhd
1,1gk5ftv,Children with higher IQ scores were diagnosed ...,Pretend_Voice_3140,4499,466,0.99,https://www.reddit.com/r/ADHD/comments/1gk5ftv...,ADHD,1730809000.0,A study was published in the [British Journal ...,adhd
2,1f0k9en,Reminder: If you made it to adulthood with lat...,Hipster-Deuxbag,7372,535,0.99,https://www.reddit.com/r/ADHD/comments/1f0k9en...,ADHD,1724547000.0,"We all know the statistics: 20,000 behavioral ...",adhd
3,1g7tbb2,What is the most adhd thing you have ever adhd’ed,FamiliarRadio9275,2102,1022,0.99,https://www.reddit.com/r/ADHD/comments/1g7tbb2...,ADHD,1729409000.0,I laugh so hard when I look back at this memor...,adhd
4,1dy8uqs,I’m angry that no one recognized that I had AD...,thecynicalone26,3345,598,0.99,https://www.reddit.com/r/ADHD/comments/1dy8uqs...,ADHD,1720445000.0,"I just got diagnosed, and I’m 39. My entire l...",adhd


In [3]:
# Use the first stored post as the sample input for the model test below
first_post = posts_registered_df.iloc[0]
title_to_test = first_post['title']
text_to_test = first_post['self_text']
print(title_to_test)
print(text_to_test)

Diagnosed with Inattentive ADHD at 31. Explains so many things from my childhood.
Please go out and get tested if you are still on the fence. I always assumed ADHD was only hyperactive. A lot of concerns about day dreaming, zoning out and inattentiveness came into play during my consult. I didn't even consider my lack of sleep being tied to ADHD. But now that I have a diagnoses, it explains quite a bit from my past. I wasn't just lazy and disorganized. 

  
Again, please go get tested if you suspect anything.


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and model (already saved locally)
tokenizer = AutoTokenizer.from_pretrained("./Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("./Llama-3.2-1B")

# Set pad token ID to eos token ID if pad token ID is not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Test the model
input_text = f"""Can you help me with this question? Please analyze the following and provide insights:
Title: {title_to_test}
Text: {text_to_test}

Provide an insightful and concise analysis below:
"""

# Let the tokenizer build the attention mask. Deriving it by comparing ids with
# pad_token_id would also mask any genuine EOS token, because pad and EOS share an id.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

# Generate a response
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=200,  # Length of the response; max_length also counted the long prompt
    do_sample=True,      # Required for temperature / top_p to have any effect
    temperature=0.7,     # Balance between random and deterministic
    top_p=0.9,           # Focus on top 90% of likely tokens
    pad_token_id=tokenizer.eos_token_id
)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)



Can you help me with this question? Please analyze the following and provide insights:
Title: Diagnosed with Inattentive ADHD at 31. Explains so many things from my childhood.
Text: Please go out and get tested if you are still on the fence. I always assumed ADHD was only hyperactive. A lot of concerns about day dreaming, zoning out and inattentiveness came into play during my consult. I didn't even consider my lack of sleep being tied to ADHD. But now that I have a diagnoses, it explains quite a bit from my past. I wasn't just lazy and disorganized. 

  
Again, please go get tested if you suspect anything.

Provide an insightful and concise analysis below:

