In [2]:
!pip install -q transformers datasets evaluate sacrebleu

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np

In [4]:
df = pd.read_csv("/content/email_formalizer_5000.csv")[["input_text", "target_text"]].dropna()

df.head()

Unnamed: 0,input_text,target_text
0,what's the update?,Could you please provide an update?
1,can't make it today,I will be unable to attend today.
2,lemme know,Please let me know.
3,don't know,I am not certain.
4,don't know,I am not certain.


In [5]:
df = df.rename(columns={"input_text": "informal_text", "target_text": "formal_text"})

df = df.sample(n=5000, random_state=42) if len(df) > 5000 else df

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
test_ds = dataset["test"]

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = ["formalize: " + text for text in examples["informal_text"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["formal_text"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return {"bleu": result["score"]}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./t5-email-formalizer",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    save_steps=500
)


In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [11]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcgamers0987[0m ([33mcgamers0987-vit-bhopal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,7.4154
20,3.0615
30,1.4411
40,1.1996
50,0.7483
60,0.4691
70,0.3483
80,0.2974
90,0.2577
100,0.2237


TrainOutput(global_step=1689, training_loss=0.11608159959717676, metrics={'train_runtime': 16077.2595, 'train_samples_per_second': 0.84, 'train_steps_per_second': 0.105, 'total_flos': 456778579968000.0, 'train_loss': 0.11608159959717676, 'epoch': 3.0})

In [12]:
def formalize_email(text):
    input_ids = tokenizer("formalize: " + text, return_tensors="pt", padding=True, truncation=True).input_ids
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Sample Inputs
examples = [
    "sorry I can't make it to the meeting",
    "got ur msg. will reply soon",
    "hey, send me the file asap!",
    "can't attend class today, not feeling well"
]

for text in examples:
    print(f"Informal: {text}")
    print(f"Formal  : {formalize_email(text)}\n")


Informal: sorry I can't make it to the meeting
Formal  : I am unable to attend the meeting.

Informal: got ur msg. will reply soon
Formal  : I will reply soon.

Informal: hey, send me the file asap!
Formal  : Please send me the file asap!

Informal: can't attend class today, not feeling well
Formal  : I am unable to attend, I am unable to attend today.



In [13]:
model.save_pretrained("/content/t5-email-formalizer")
tokenizer.save_pretrained("/content/t5-email-formalizer")

('/content/t5-email-formalizer/tokenizer_config.json',
 '/content/t5-email-formalizer/special_tokens_map.json',
 '/content/t5-email-formalizer/spiece.model',
 '/content/t5-email-formalizer/added_tokens.json')

In [14]:
from google.colab import files
!zip -r t5-email-formalizer.zip t5-email-formalizer
files.download("t5-email-formalizer.zip")

  adding: t5-email-formalizer/ (stored 0%)
  adding: t5-email-formalizer/model.safetensors (deflated 11%)
  adding: t5-email-formalizer/config.json (deflated 63%)
  adding: t5-email-formalizer/generation_config.json (deflated 29%)
  adding: t5-email-formalizer/added_tokens.json (deflated 83%)
  adding: t5-email-formalizer/spiece.model (deflated 48%)
  adding: t5-email-formalizer/special_tokens_map.json (deflated 85%)
  adding: t5-email-formalizer/checkpoint-1689/ (stored 0%)
  adding: t5-email-formalizer/checkpoint-1689/model.safetensors (deflated 11%)
  adding: t5-email-formalizer/checkpoint-1689/scheduler.pt (deflated 56%)
  adding: t5-email-formalizer/checkpoint-1689/config.json (deflated 63%)
  adding: t5-email-formalizer/checkpoint-1689/generation_config.json (deflated 29%)
  adding: t5-email-formalizer/checkpoint-1689/added_tokens.json (deflated 83%)
  adding: t5-email-formalizer/checkpoint-1689/rng_state.pth (deflated 24%)
  adding: t5-email-formalizer/checkpoint-1689/training_a

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
df.to_csv("email_formalizer_used.csv", index=False)
files.download("email_formalizer_used.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
!unzip t5-email-formalizer.zip


Archive:  t5-email-formalizer.zip
   creating: t5-email-formalizer/
  inflating: t5-email-formalizer/model.safetensors  
  inflating: t5-email-formalizer/config.json  
  inflating: t5-email-formalizer/generation_config.json  
  inflating: t5-email-formalizer/added_tokens.json  
  inflating: t5-email-formalizer/spiece.model  
  inflating: t5-email-formalizer/special_tokens_map.json  
   creating: t5-email-formalizer/checkpoint-1689/
  inflating: t5-email-formalizer/checkpoint-1689/model.safetensors  
  inflating: t5-email-formalizer/checkpoint-1689/scheduler.pt  
  inflating: t5-email-formalizer/checkpoint-1689/config.json  
  inflating: t5-email-formalizer/checkpoint-1689/generation_config.json  
  inflating: t5-email-formalizer/checkpoint-1689/added_tokens.json  
  inflating: t5-email-formalizer/checkpoint-1689/rng_state.pth  
  inflating: t5-email-formalizer/checkpoint-1689/training_args.bin  
  inflating: t5-email-formalizer/checkpoint-1689/spiece.model  
  inflating: t5-email-forma

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("/content/t5-email-formalizer")
tokenizer = T5Tokenizer.from_pretrained("/content/t5-email-formalizer")

In [4]:
def formalize_email(informal_text):
    input_text = "formalize: " + informal_text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    formal_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return formal_text

In [5]:
# Example informal text
informal = "sorry i cant come to the meeting"
formal = formalize_email(informal)
print("Formal Email:\n", formal)

Formal Email:
 I apologize for the delay.


In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "./t5-email-formalizer"  # or the path where you saved the model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

In [7]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.7-py3-none-any.whl (23 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79

In [8]:
!ngrok config add-authtoken 2woKgI9d2dNLwtAsKbqQWeqPA7b_2P9RL9hZe6Pb2Gs5QroF

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [9]:
%%writefile app.py
import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer

@st.cache_resource
def load_model():
    model = T5ForConditionalGeneration.from_pretrained("./t5-email-formalizer")
    tokenizer = T5Tokenizer.from_pretrained("./t5-email-formalizer")
    return model, tokenizer

model, tokenizer = load_model()

st.title("📧 Email Formalizer")
input_text = st.text_area("Enter informal email/text:", height=150)

if st.button("Formalize"):
    input_ids = tokenizer.encode("formalize: " + input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    formal_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    st.success("Formalized Output:")
    st.write(formal_output)


Writing app.py


In [10]:
# Run Streamlit
import os
os.system('streamlit run app.py &')

# Connect ngrok
from pyngrok import ngrok
public_url = ngrok.connect(addr=8501, proto="http")
print("🔗 Streamlit app is live at:", public_url)


🔗 Streamlit app is live at: NgrokTunnel: "https://3368-34-32-185-226.ngrok-free.app" -> "http://localhost:8501"


In [11]:
!git config --global user.name "Venkatreddy111"
!git config --global user.email "venkatreddypasam4@gmail.com"


In [12]:
!git clone https://github.com/Venkatreddy111/Email-formalizer.git


Cloning into 'Email-formalizer'...


In [13]:
!cp /content/app.py  # Adjust paths as needed


cp: missing destination file operand after '/content/app.py'
Try 'cp --help' for more information.
