In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the raw lesson-plan table from the CSV in the working directory
df = pd.read_csv('lesson_plans_dataset.csv')

# Build one prompt per row out of the curriculum metadata columns
df['input_text'] = df.apply(
    lambda row: f"Generate lesson plan for {row['board']}, Grade {row['grade']}, {row['subject']}, {row['unit']}, {row['chapter']}, Topics: {row['topics']}, {row['sessionType']} session, {row['noOfSessions']} sessions, {row['duration']} mins",
    axis=1,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both pandas splits as Hugging Face datasets under 'train' / 'test'
dataset = DatasetDict(
    dict(
        train=Dataset.from_pandas(train_df),
        test=Dataset.from_pandas(test_df),
    )
)

# Pretrained t5-small tokenizer and seq2seq model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Tokenize prompts and target lesson plans for the model
def preprocess_function(examples):
    """Tokenize a batch of prompts and attach the tokenized targets as labels.

    Args:
        examples: Mapping with 'input_text' (prompts) and 'lesson_plan'
            (target texts), as passed in by ``dataset.map``.

    Returns:
        The tokenizer output for the prompts, extended with a 'labels' entry
        holding the token ids of the lesson plans.
    """
    def _encode(texts):
        # Both sides are truncated at 512 tokens; no padding is applied here.
        return tokenizer(texts, truncation=True, max_length=512)

    model_inputs = _encode(examples['input_text'])
    target_encoding = _encode(examples['lesson_plan'])
    model_inputs['labels'] = target_encoding['input_ids']
    return model_inputs

# Run the tokenization over every split, one batch of rows at a time
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset, DatasetDict
import pandas as pd
import os

# ✅ Keep Weights & Biases logging switched off for this run
os.environ["WANDB_DISABLED"] = "true"

# ✅ Read the raw lesson-plan table
df = pd.read_csv('lesson_plans_dataset.csv')

# ✅ Build one prompt string per row from the curriculum metadata columns
df['input_text'] = df.apply(
    lambda row: f"Generate lesson plan for {row['board']}, Grade {row['grade']}, {row['subject']}, {row['unit']}, {row['chapter']}, Topics: {row['topics']}, {row['sessionType']} session, {row['noOfSessions']} sessions, {row['duration']} mins",
    axis=1,
)

# ✅ Hold out 20% of the rows for evaluation (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# ✅ Wrap each pandas split as a Hugging Face Dataset keyed by split name
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (('train', train_df), ('test', test_df))
})

# ✅ Start from the pretrained t5-small tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# ✅ Preprocessing the Data with Padding and Truncation
def preprocess_function(examples):
    """Tokenize prompts and targets to fixed length and build loss-ready labels.

    Prompts ('input_text') and targets ('lesson_plan') are truncated/padded to
    512 tokens. Padding positions in the labels are replaced by -100 so the
    cross-entropy loss ignores them; without this the loss is averaged over
    the padding as well, which makes the reported loss look far better than
    the generated text actually is.

    Args:
        examples: Mapping with 'input_text' and 'lesson_plan'. Each value is
            either a list of strings (``dataset.map(..., batched=True)``) or a
            single string (non-batched mapping).

    Returns:
        The tokenizer output for the prompts with an added 'labels' entry
        containing the target token ids, pad positions set to -100.
    """
    inputs = tokenizer(
        examples['input_text'],
        max_length=512,          # Max token length
        padding='max_length',    # Pad to max_length
        truncation=True          # Truncate sequences longer than max_length
    )
    labels = tokenizer(
        examples['lesson_plan'],
        max_length=512,
        padding='max_length',    # Pad labels to max_length
        truncation=True          # Truncate long labels
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss function skips.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs['labels'] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Non-batched call: a single flat id sequence.
        inputs['labels'] = _mask_padding(label_ids)
    return inputs

# ✅ Tokenize every split batch-wise with preprocess_function
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

# ✅ Collator that assembles the tokenized rows into batches
#    (preprocess_function already pads every row to max_length)
data_collator = DataCollatorWithPadding(tokenizer)

# ✅ Training Arguments with Evaluation, Save and Logging Strategy Aligned
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",            # Directory to save model checkpoints
    evaluation_strategy="epoch",       # Evaluate after every epoch
    save_strategy="epoch",             # Save model after every epoch (aligned with eval)
    logging_strategy="epoch",          # Log training loss every epoch (the step-based default left it as "No log")
    learning_rate=2e-5,                # Learning rate for the optimizer
    per_device_train_batch_size=4,     # Batch size for training
    per_device_eval_batch_size=4,      # Batch size for evaluation
    num_train_epochs=5,                # Number of epochs to train
    weight_decay=0.01,                 # Regularization to prevent overfitting
    save_total_limit=2,                # Keep only the latest 2 saved models
    load_best_model_at_end=True,       # Automatically load the best model at the end
    report_to="none"                   # Supported way to switch off W&B and other logging integrations
)

# ✅ Assemble the Trainer from the model, arguments, tokenized splits and collator
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

# ✅ Run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()



Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.402198
2,No log,0.137093
3,No log,0.054002
4,No log,0.031161
5,0.765000,0.026062


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=550, training_loss=0.7039375409212979, metrics={'train_runtime': 13813.2268, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.04, 'total_flos': 297751963238400.0, 'train_loss': 0.7039375409212979, 'epoch': 5.0})

In [None]:
# %pip installs into the running kernel's environment; the pin matches the version this notebook was run with.
%pip install -q evaluate==0.4.3


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import os

# Show the entries of the current working directory to check which folders (e.g. model output) exist
print(os.listdir(os.curdir))


['.config', 'results', 'lesson_plans_dataset.csv', 'sample_data']


In [None]:
# ✅ Write the trained weights and the tokenizer files into the same folder
model.save_pretrained(save_directory='lesson_plan_model')
tokenizer.save_pretrained(save_directory='lesson_plan_model')


('lesson_plan_model/tokenizer_config.json',
 'lesson_plan_model/special_tokens_map.json',
 'lesson_plan_model/spiece.model',
 'lesson_plan_model/added_tokens.json')

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# ✅ Load tokenizer and model back from the saved 'lesson_plan_model' folder
tokenizer = T5Tokenizer.from_pretrained('lesson_plan_model')
model = T5ForConditionalGeneration.from_pretrained('lesson_plan_model')


In [None]:
def generate_lesson_plan(input_text, max_length=512, num_beams=4):
    """Generate a lesson plan for one prompt with the loaded T5 model.

    Args:
        input_text: Prompt string, e.g. the "Generate lesson plan for ..."
            text built for the training rows.
        max_length: Token limit used both to truncate the prompt and to cap
            the generated sequence (default 512, the previous fixed value).
        num_beams: Beam-search width (default 4, the previous fixed value).

    Returns:
        The decoded text of the first returned sequence, special tokens removed.
    """
    # padding=True pads only up to the longest prompt, so a single prompt is
    # no longer inflated to `max_length` filler tokens.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Keep the tensors on the same device as the model (CPU or GPU).
    inputs = inputs.to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
import evaluate

# ✅ Load BLEU Metric
bleu_metric = evaluate.load("bleu")

# ✅ Draw a reproducible evaluation sample
# A fixed random_state makes the reported score repeatable between runs, and
# the min() guard keeps the cell working when the test split has fewer than 10 rows.
eval_sample = test_df.sample(n=min(10, len(test_df)), random_state=42)

# ✅ Generate Predictions and collect the matching References
predictions = [generate_lesson_plan(prompt) for prompt in eval_sample['input_text']]
references = eval_sample['lesson_plan'].tolist()

# ✅ Compute BLEU Score (each prediction is scored against a list of references)
bleu_score = bleu_metric.compute(
    predictions=predictions,
    references=[[ref] for ref in references]   # Wrapping each reference in a list
)

# ✅ Display BLEU Score
print(f"BLEU Score: {bleu_score['bleu']}")


BLEU Score: 0.10129698533106664


In [None]:
# %pip installs into the running kernel's environment; pins match the versions this notebook was run with.
%pip install -q fastapi==0.115.8 uvicorn==0.34.0


Collecting fastapi
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Downloading fastapi-0.115.8-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.45.3-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.8 starlette-0.45.3 uvicorn-0.34.0


In [None]:
# %pip installs into the running kernel's environment; pins match the versions this notebook was run with.
%pip install -q fastapi==0.115.8 uvicorn==0.34.0 pyngrok==7.2.3


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
%%writefile app.py
# app.py
# FastAPI service around the fine-tuned lesson-plan model.
# The %%writefile magic on the first line of this notebook cell saves the cell
# body to app.py, so that `uvicorn app:app` has a module to import.

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import T5ForConditionalGeneration, T5Tokenizer

# ✅ Load the trained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('lesson_plan_model')
tokenizer = T5Tokenizer.from_pretrained('lesson_plan_model')

# ✅ Initialize FastAPI
app = FastAPI()

# ✅ Define the request format using Pydantic
# JSON body accepted by the POST /generate-lesson-plan endpoint.
class LessonPlanRequest(BaseModel):
    # Prompt text that the endpoint hands to generate_lesson_plan.
    input_text: str

# ✅ Lesson plan generation function
def generate_lesson_plan(input_text, max_length=300, num_beams=4):
    """Generate a lesson plan for one prompt with the loaded T5 model.

    Args:
        input_text: Prompt string received from the API request.
        max_length: Token limit used both to truncate the prompt and to cap
            the generated sequence (default 300, the previous fixed value).
        num_beams: Beam-search width (default 4, the previous fixed value).

    Returns:
        The decoded text of the first returned sequence, special tokens removed.
    """
    # padding=True pads only up to the longest prompt, so a single prompt is
    # no longer inflated to `max_length` filler tokens on every request.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Keep the tensors on the same device as the model (CPU or GPU).
    inputs = inputs.to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly instead of relying on generate() to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ✅ API Endpoint to Generate Lesson Plan
@app.post("/generate-lesson-plan")
async def generate_plan(request: LessonPlanRequest):
    """Handle POST /generate-lesson-plan.

    Args:
        request: Parsed request body carrying the prompt in ``input_text``.

    Returns:
        A dict with the generated text under the 'lesson_plan' key.
    """
    # Imported here so this handler brings its own dependency into scope.
    from fastapi.concurrency import run_in_threadpool

    # Generation is blocking work; running it in the thread pool keeps the
    # event loop free to serve other requests while the model is busy.
    output = await run_in_threadpool(generate_lesson_plan, request.input_text)
    return {"lesson_plan": output}


In [None]:
import sys

# Start uvicorn through the kernel's own interpreter: `-m uvicorn` does not
# depend on the `uvicorn` console script being on the shell PATH, which is
# what produced "uvicorn: command not found".
!{sys.executable} -m uvicorn app:app --reload



/bin/bash: line 1: uvicorn: command not found
