In [None]:
%pip install datasets
%pip install trl
%pip install accelerate
%pip install transformers[torch]
%pip install bitsandbytes

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# The credentials obtained here are what the push_to_hub calls at the end of
# the notebook rely on; no token is written into the notebook source.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Single name reused later in the notebook for two things: the local path that
# trainer.save_model() writes to, and the Hub repository that the model and
# tokenizer are pushed to.
my_model = "LLM"

```python
configuration = MistralConfig(
    vocab_size=32000,  # Vocabulary size, yeh batata hai ki model kitne unique shabdon ko jaanta hai. Is case mein, 32000 shabdon ka vocab hai.
    hidden_size=2048,  # Hidden layers ka size, yeh batata hai ki model ke hidden layers mein kitne neurons hain. Yahan, 2048 neurons hain.
    intermediate_size=7168,  # Intermediate layer ka size, yeh batata hai ki model ke intermediate layer mein kitne neurons hain. Yahan, 7168 neurons hain.
    num_hidden_layers=24,  # Model ke andar kitne layers hain, yeh batata hai. Is case mein, 24 layers hain.
    num_attention_heads=32,  # Attention heads ki sankhya, yeh batata hai ki model kitne parts mein apna focus divide karta hai. Yahan, 32 attention heads hain.
    num_key_value_heads=8,  # Yeh specialized attention heads hain jo context samajhne mein madad karte hain. Is case mein, 8 heads hain.
    hidden_act="silu",  # Activation function ka type jo model use karta hai. Yahan "silu" use ho raha hai.
    max_position_embeddings=4096,  # Ek single sequence mein maximum kitne tokens (positions) ho sakte hain, yeh batata hai. Yahan, 4096 tokens tak ka sequence ho sakta hai.
    pad_token_id=2,  # Padding characters ka ID, jo text ko equal length banane ke liye use hota hai.
    bos_token_id=1,  # Sentence ke start ka ID, jo batata hai ki sentence kahan se shuru hota hai.
    eos_token_id=2  # Sentence ke end ka ID, jo batata hai ki sentence kahan khatam hota hai.
)
```


In [None]:
# Importing the necessary libraries
import torch
from transformers import TrainingArguments, MistralForCausalLM, MistralConfig, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer

# Architecture of the model to be trained from scratch: a scaled-down
# Mistral-style decoder. Keyword arguments are grouped by concern below;
# their order has no effect on the resulting configuration.
configuration = MistralConfig(
    # --- Token ids ---
    vocab_size=32000,  # number of distinct token ids the model can represent
    bos_token_id=1,  # id marking the beginning of a sequence
    eos_token_id=2,  # id marking the end of a sequence
    pad_token_id=2,  # padding id; deliberately the same value as eos_token_id
    # --- Width and depth ---
    hidden_size=2048,  # dimension of the hidden representations
    intermediate_size=7168,  # dimension of the feed-forward (MLP) layer
    num_hidden_layers=24,  # number of stacked decoder layers
    hidden_act="silu",  # activation function used inside the MLP
    # --- Attention ---
    num_attention_heads=32,  # query heads per attention layer
    num_key_value_heads=8,  # key/value heads; fewer than query heads, so each is shared by a group
    max_position_embeddings=4096,  # longest sequence supported, counted in tokens (not characters)
)


In [None]:

# Seed PyTorch before building the model. The weights are randomly initialised
# here (nothing is loaded from a checkpoint), so without a fixed seed every
# fresh run would start training from a different model. The value matches the
# seed used for the dataset shuffle.
torch.manual_seed(42)

# Build an untrained model from the configuration defined above
model = MistralForCausalLM(configuration)

# Reuse the tokenizer published alongside Mistral-7B-Instruct-v0.2 instead of
# training a new one
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Pad with the end-of-sequence token; the model configuration likewise sets
# pad_token_id equal to eos_token_id
tokenizer.pad_token = tokenizer.eos_token


In [None]:

# Fetch the training split and shuffle it in one expression; the fixed seed
# keeps the shuffled order identical across runs.
dataset = load_dataset('HuggingFaceTB/cosmopedia-20k', split="train").shuffle(seed=42)

# Quick sanity check of what was loaded: how many rows, and which columns
prompt_count = len(dataset)
print(f'Number of prompts: {prompt_count}')
columns = dataset.column_names
print(f'Column names are: {columns}')


In [None]:

# Function to prepare each piece of text in the dataset for the model
def create_prompt_formats(sample):
    """Return the training text(s) held in ``sample['text']`` as a list of strings.

    No prompt template is applied: each text is passed through unchanged.

    Args:
        sample: Mapping with a ``'text'`` entry. For a batch this is a
            sequence of strings (one per example); for a single example it
            is one string.

    Returns:
        A new list of strings -- one per example in the batch, or a
        one-element list when ``sample['text']`` is a single string.
    """
    texts = sample['text']
    # A bare string is one example, not a batch: wrap it so it is not
    # iterated character by character.
    if isinstance(texts, str):
        return [texts]
    return list(texts)


In [None]:

# Optimisation and bookkeeping settings, kept in their own object so they can
# be read (and tweaked) separately from the trainer wiring below.
training_args = TrainingArguments(
    # Where results go
    output_dir="M_outputs",  # directory for checkpoints and trainer state
    overwrite_output_dir=True,  # reuse the directory even if it already has content
    save_steps=1000,  # write a checkpoint every 1000 optimizer steps
    # Logging
    logging_steps=1,  # log metrics at every step
    report_to="none",  # do not send metrics to any external tracker
    # Schedule
    max_steps=2000,  # total number of optimizer updates
    warmup_steps=2,  # learning-rate warmup length, in steps
    learning_rate=1e-4,  # learning rate passed to the optimizer/scheduler
    # Batching
    per_device_train_batch_size=2,  # sequences per device per step
    gradient_accumulation_steps=1,  # no accumulation: one update per batch
    # Optimizer
    optim="paged_adamw_32bit",  # paged AdamW; presumably backed by bitsandbytes -- TODO confirm
)

# Wire the model, data and tokenizer into the supervised fine-tuning trainer.
# NOTE(review): `tokenizer=` and `max_seq_length=` as direct SFTTrainer keywords
# depend on the installed trl release -- confirm against the version in use.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    formatting_func=create_prompt_formats,  # supplies the text for each batch
    max_seq_length=2048,  # cap on tokenized sequence length
)


In [None]:
# Run the training loop configured on `trainer`, then write the resulting
# model to the local path named by `my_model`.
trainer.train()
trainer.save_model(my_model)


In [None]:
# Upload the model (the same object that was handed to the trainer) and the
# tokenizer to the Hub repository named by `my_model`.
# NOTE(review): with use_temp_dir=False the upload presumably stages files in a
# local folder of the same name, i.e. the one save_model wrote to -- confirm.
model.push_to_hub(my_model, use_temp_dir=False)
tokenizer.push_to_hub(my_model, use_temp_dir=False)