In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Step 1: Install dependencies
!pip install transformers accelerate --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**1.SYNTHETIC DATASET CREATION**


In [1]:
import pandas as pd
import random

# Diverse business types with varied industries
business_types = [
    "Organic Coffee Shop",
    "Fitness Coaching Platform",
    "AI-Powered Resume Builder",
    "Pet Grooming Studio",
    "Virtual Language Tutor",
    "Handmade Jewelry Store",
    "Sustainable Fashion Brand",
    "Remote Work Tools SaaS",
    "Luxury Skincare Line",
    "Eco-Friendly Home Cleaning Service"
]

complexity_levels = ["Low", "Medium", "High"]

def generate_description(btype, complexity):
    base = f"{btype} that "
    if complexity == "Low":
        detail = "caters to a small, local customer base with simple offerings."
    elif complexity == "Medium":
        detail = "is expanding its reach with moderate digital marketing and a growing online presence."
    else:
        detail = "serves international clients with a fully scalable infrastructure and AI-driven analytics."
    return base + detail

def generate_domain(btype):
    clean = btype.lower().replace(" ", "").replace("-", "")
    return f"{clean}{random.randint(100,999)}.com"

# Generate 200 rows
rows = []
for _ in range(200):
    btype = random.choice(business_types)
    complexity = random.choice(complexity_levels)
    description = generate_description(btype, complexity)
    domain = generate_domain(btype)
    rows.append([btype, complexity, description, domain])

df = pd.DataFrame(rows, columns=["Business Type", "Complexity", "Description", "Domain"])
df.to_csv("Expanded_Domain_Dataset.csv", index=False)
df.head()


Unnamed: 0,Business Type,Complexity,Description,Domain
0,Fitness Coaching Platform,Medium,Fitness Coaching Platform that is expanding it...,fitnesscoachingplatform401.com
1,Eco-Friendly Home Cleaning Service,Low,Eco-Friendly Home Cleaning Service that caters...,ecofriendlyhomecleaningservice955.com
2,Sustainable Fashion Brand,Medium,Sustainable Fashion Brand that is expanding it...,sustainablefashionbrand320.com
3,Organic Coffee Shop,High,Organic Coffee Shop that serves international ...,organiccoffeeshop120.com
4,Pet Grooming Studio,Medium,Pet Grooming Studio that is expanding its reac...,petgroomingstudio866.com


**2. MODEL DEPLOYMENT AND ITERATION**

In [2]:
# Load Lightweight LLM (FLAN-T5) and Create Generator

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


model_id = "google/flan-t5-base"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Create text2text generation pipeline
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Test the generator with a business description
prompt = ("You are a domain name expert. Generate 3 short, brandable, and available domain names "
          "for this business: A plant-based protein bar startup focused on fitness enthusiasts.")

response = generator(prompt, max_new_tokens=60, temperature=0.7)
print(response[0]['generated_text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


proteinbar.com


2.2


In [3]:
# Step 2.2 - Load Dataset and Generate Domain Suggestions

import pandas as pd
from tqdm import tqdm

# Load dataset
df = pd.read_csv("/content/Expanded_Domain_Dataset.csv")

# Display first few rows
print("Dataset Loaded:")
display(df.head())

# Function to create prompt and generate domains
def generate_domains(description):
    prompt = (f"You are a domain name expert. Generate 3 creative, brandable, and short domain names "
              f"for the business described below:\n\n{description}")

    try:
        output = generator(prompt, max_new_tokens=60)[0]['generated_text']

        domains = [d.strip() for d in output.split('\n') if d.strip()]
        domains = [d for d in domains if "." in d]
        suggestions = [{"domain": d, "confidence": 0.85} for d in domains]
        return {
            "suggestions": suggestions,
            "status": "success"
        } if suggestions else {
            "suggestions": [],
            "status": "blocked",
            "message": "No valid domain names found in output"
        }
    except Exception as e:
        return {
            "suggestions": [],
            "status": "error",
            "message": str(e)
        }

# Apply generation to each row
tqdm.pandas()
df["Structured_Suggestions"] = df["Description"].progress_apply(generate_domains)

# Preview results
df.head()


Dataset Loaded:


Unnamed: 0,Business Type,Complexity,Description,Domain
0,Fitness Coaching Platform,Medium,Fitness Coaching Platform that is expanding it...,fitnesscoachingplatform401.com
1,Eco-Friendly Home Cleaning Service,Low,Eco-Friendly Home Cleaning Service that caters...,ecofriendlyhomecleaningservice955.com
2,Sustainable Fashion Brand,Medium,Sustainable Fashion Brand that is expanding it...,sustainablefashionbrand320.com
3,Organic Coffee Shop,High,Organic Coffee Shop that serves international ...,organiccoffeeshop120.com
4,Pet Grooming Studio,Medium,Pet Grooming Studio that is expanding its reac...,petgroomingstudio866.com


  5%|▌         | 10/200 [00:02<00:39,  4.83it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [00:44<00:00,  4.48it/s]


Unnamed: 0,Business Type,Complexity,Description,Domain,Structured_Suggestions
0,Fitness Coaching Platform,Medium,Fitness Coaching Platform that is expanding it...,fitnesscoachingplatform401.com,{'suggestions': [{'domain': 'fitnesscoachingpl...
1,Eco-Friendly Home Cleaning Service,Low,Eco-Friendly Home Cleaning Service that caters...,ecofriendlyhomecleaningservice955.com,{'suggestions': [{'domain': 'Eco-Friendly-Home...
2,Sustainable Fashion Brand,Medium,Sustainable Fashion Brand that is expanding it...,sustainablefashionbrand320.com,{'suggestions': [{'domain': 'sustainablefashio...
3,Organic Coffee Shop,High,Organic Coffee Shop that serves international ...,organiccoffeeshop120.com,{'suggestions': [{'domain': 'organiccoffeeshop...
4,Pet Grooming Studio,Medium,Pet Grooming Studio that is expanding its reac...,petgroomingstudio866.com,"{'suggestions': [], 'status': 'blocked', 'mess..."


2.3


In [4]:
# Step 2.3 - Save the output as versioned dataset

output_path = "/content/Generated_Domain_Suggestions_v1.csv"
df.to_csv(output_path, index=False)

print(f"Domain suggestions saved to: {output_path}")


✅ Domain suggestions saved to: /content/Generated_Domain_Suggestions_v1.csv


3.1

In [5]:
# Step 3.1 - Define heuristic evaluation function for domain names

def evaluate_domain_quality(description, domain):
    """
    Simple heuristic:
    - Reward if domain contains keywords from description
    - Penalize if domain is too long (>25 chars)
    - Basic score between 0 and 1
    """
    if not domain or not isinstance(domain, str):
        return 0.0

    score = 0.5  # Start neutral

    # Check keyword match
    desc_words = set(description.lower().split())
    domain_lower = domain.lower()
    keyword_hits = sum(1 for word in desc_words if word in domain_lower)

    score += 0.05 * keyword_hits

    # Penalize for long domains
    if len(domain) > 25:
        score -= 0.1

    # Cap the score
    return round(min(max(score, 0.0), 1.0), 2)


3.2

In [6]:
# Step 3.2 - Apply scoring function to all domain suggestions

def score_suggestions(row):
    if isinstance(row['Structured_Suggestions'], dict) and row['Structured_Suggestions'].get("suggestions"):
        suggestions = row['Structured_Suggestions']['suggestions']
        return [
            {
                "domain": s["domain"],
                "confidence": evaluate_domain_quality(row['Description'], s["domain"])
            }
            for s in suggestions
        ]
    else:
        return []

# Apply scoring and create a new column
df["Scored_Suggestions"] = df.apply(score_suggestions, axis=1)

# Preview
df[["Description", "Scored_Suggestions"]].head()


Unnamed: 0,Description,Scored_Suggestions
0,Fitness Coaching Platform that is expanding it...,"[{'domain': 'fitnesscoachingplatform.com', 'co..."
1,Eco-Friendly Home Cleaning Service that caters...,[{'domain': 'Eco-Friendly-Home-Cleaning-Servic...
2,Sustainable Fashion Brand that is expanding it...,"[{'domain': 'sustainablefashionbrand.com', 'co..."
3,Organic Coffee Shop that serves international ...,"[{'domain': 'organiccoffeeshop.com', 'confiden..."
4,Pet Grooming Studio that is expanding its reac...,[]


3.3


In [7]:
# Step 3.3 - Save evaluated version

df.to_csv("Domain_Suggestions_with_Scores.csv", index=False)
print("Evaluation scores saved.")


✅ Evaluation scores saved.


4.1


In [8]:
# Step 4.1 - Analyze scored outputs to detect issues

def detect_edge_case(score_list):
    if not score_list or len(score_list) == 0:
        return "No Suggestions"

    issues = []
    for entry in score_list:
        domain = entry['domain']
        conf = entry['confidence']
        if conf < 0.4:
            issues.append("Low Score")
        if len(domain) > 30:
            issues.append("Too Long")
        if domain.count('-') > 2:
            issues.append("Too Many Dashes")

    return ", ".join(set(issues)) if issues else "Pass"

# Apply to dataset
df["Edge_Case_Tag"] = df["Scored_Suggestions"].apply(detect_edge_case)

# Check examples
df[["Description", "Scored_Suggestions", "Edge_Case_Tag"]].head(10)


Unnamed: 0,Description,Scored_Suggestions,Edge_Case_Tag
0,Fitness Coaching Platform that is expanding it...,"[{'domain': 'fitnesscoachingplatform.com', 'co...",Pass
1,Eco-Friendly Home Cleaning Service that caters...,[{'domain': 'Eco-Friendly-Home-Cleaning-Servic...,"Too Many Dashes, Too Long"
2,Sustainable Fashion Brand that is expanding it...,"[{'domain': 'sustainablefashionbrand.com', 'co...",Pass
3,Organic Coffee Shop that serves international ...,"[{'domain': 'organiccoffeeshop.com', 'confiden...",Pass
4,Pet Grooming Studio that is expanding its reac...,[],No Suggestions
5,"Pet Grooming Studio that caters to a small, lo...","[{'domain': 'petgroomingstudio.com', 'confiden...",Pass
6,Fitness Coaching Platform that is expanding it...,"[{'domain': 'fitnesscoachingplatform.com', 'co...",Pass
7,Fitness Coaching Platform that serves internat...,"[{'domain': 'fitnesscoachingplatform.com', 'co...",Pass
8,Eco-Friendly Home Cleaning Service that serves...,[{'domain': 'eco-friendlyhomecleaningservice.c...,Too Long
9,Fitness Coaching Platform that serves internat...,"[{'domain': 'fitnesscoachingplatform.com', 'co...",Pass


4.2


In [9]:
# Step 4.2 - Summarize types of failures

edge_case_summary = df["Edge_Case_Tag"].value_counts()
print("Edge Case Frequency Summary:")
print(edge_case_summary)


🔍 Edge Case Frequency Summary:
Edge_Case_Tag
Pass                         166
Too Long                      19
No Suggestions                 9
Too Many Dashes, Too Long      6
Name: count, dtype: int64


4.3

In [10]:
# Step 4.3 - Save version with edge case tags

df.to_csv("Domain_Suggestions_with_Scores_and_Edges.csv", index=False)
print("Edge case tagging saved.")


✅ Edge case tagging saved.


5.1

In [11]:
# Step 5.1 - Define inappropriate content checker

import re

INAPPROPRIATE_KEYWORDS = [
    "adult", "nude", "explicit", "gambling", "casino", "violence",
    "drugs", "weapon", "terror", "hate", "porn", "escort"
]

def is_safe(description):
    text = description.lower()
    for word in INAPPROPRIATE_KEYWORDS:
        if re.search(rf"\b{word}\b", text):
            return False
    return True

# Apply safety check
df["is_safe"] = df["Description"].apply(is_safe)
df["safety_status"] = df["is_safe"].apply(lambda x: "blocked" if not x else "safe")

# Preview
df[["Description", "safety_status"]].head(10)


Unnamed: 0,Description,safety_status
0,Fitness Coaching Platform that is expanding it...,safe
1,Eco-Friendly Home Cleaning Service that caters...,safe
2,Sustainable Fashion Brand that is expanding it...,safe
3,Organic Coffee Shop that serves international ...,safe
4,Pet Grooming Studio that is expanding its reac...,safe
5,"Pet Grooming Studio that caters to a small, lo...",safe
6,Fitness Coaching Platform that is expanding it...,safe
7,Fitness Coaching Platform that serves internat...,safe
8,Eco-Friendly Home Cleaning Service that serves...,safe
9,Fitness Coaching Platform that serves internat...,safe


5.2

In [12]:
# Step 5.2 - Cleanly block unsafe generations

def enforce_safety(row):
    if not row["is_safe"]:
        return {
            "suggestions": [],
            "status": "blocked",
            "message": "Request contains inappropriate content"
        }
    return row["Structured_Suggestions"]

df["Final_Suggestions"] = df.apply(enforce_safety, axis=1)

# Check example of blocked + allowed
df[["Description", "safety_status", "Final_Suggestions"]].head(10)


Unnamed: 0,Description,safety_status,Final_Suggestions
0,Fitness Coaching Platform that is expanding it...,safe,{'suggestions': [{'domain': 'fitnesscoachingpl...
1,Eco-Friendly Home Cleaning Service that caters...,safe,{'suggestions': [{'domain': 'Eco-Friendly-Home...
2,Sustainable Fashion Brand that is expanding it...,safe,{'suggestions': [{'domain': 'sustainablefashio...
3,Organic Coffee Shop that serves international ...,safe,{'suggestions': [{'domain': 'organiccoffeeshop...
4,Pet Grooming Studio that is expanding its reac...,safe,"{'suggestions': [], 'status': 'blocked', 'mess..."
5,"Pet Grooming Studio that caters to a small, lo...",safe,{'suggestions': [{'domain': 'petgroomingstudio...
6,Fitness Coaching Platform that is expanding it...,safe,{'suggestions': [{'domain': 'fitnesscoachingpl...
7,Fitness Coaching Platform that serves internat...,safe,{'suggestions': [{'domain': 'fitnesscoachingpl...
8,Eco-Friendly Home Cleaning Service that serves...,safe,{'suggestions': [{'domain': 'eco-friendlyhomec...
9,Fitness Coaching Platform that serves internat...,safe,{'suggestions': [{'domain': 'fitnesscoachingpl...


5.3

In [13]:
# Step 5.3 - Final save with safety

df.to_csv("Final_Domain_Suggestions_With_Safety.csv", index=False)
print("Final suggestions file saved with safety filtering applied.")


✅ Final suggestions file saved with safety filtering applied.


In [9]:
import pandas as pd

# Load Flan-T5 final output
flan_df = pd.read_csv("Final_Domain_Suggestions_With_Safety.csv")

# Count total rows
total = len(flan_df)

# Count how many suggestions are present
flan_df["Num_Domains"] = flan_df["Final_Suggestions"].apply(
    lambda x: len(eval(x)["suggestions"]) if pd.notnull(x) and "suggestions" in str(x) else 0
)

# Count how many are .com
flan_df["Num_Com_Domains"] = flan_df["Final_Suggestions"].apply(
    lambda x: sum(1 for d in eval(x)["suggestions"] if ".com" in d["domain"]) if pd.notnull(x) else 0
)

# Average domain length
flan_df["Avg_Length"] = flan_df["Final_Suggestions"].apply(
    lambda x: sum(len(d["domain"]) for d in eval(x)["suggestions"]) / len(eval(x)["suggestions"])
    if pd.notnull(x) and len(eval(x)["suggestions"]) > 0 else 0
)

# Uniqueness: how many rows have completely unique domain suggestions
unique_rows = flan_df["Final_Suggestions"].apply(lambda x: str(x)).nunique()

# Display metrics
print(" Flan-T5 Evaluation Summary:")
print(f"Avg # Domains Generated: {flan_df['Num_Domains'].mean():.2f}")
print(f"Avg .com Domains: {flan_df['Num_Com_Domains'].mean():.2f}")
print(f"Avg Domain Length: {flan_df['Avg_Length'].mean():.2f}")
print(f"% Unique Domain Rows: {(unique_rows / total) * 100:.2f}")

# Optional: Save
flan_df.to_csv("flan_t5_domain_generation_eval.csv", index=False)
print("Flan-T5 Evaluation saved to flan_t5_domain_generation_eval.csv")


📊 Flan-T5 Evaluation Summary:
Avg # Domains Generated: 0.95
Avg .com Domains: 0.95
Avg Domain Length: 24.44
% Unique Domain Rows: 7.50
✅ Flan-T5 Evaluation saved to flan_t5_domain_generation_eval.csv


gptq model

1. Dataset creation

In [14]:
import pandas as pd
import random

business_types = [
    "Fitness Coaching Platform",
    "Eco-Friendly Home Cleaning Service",
    "Sustainable Fashion Brand",
    "Organic Coffee Shop",
    "Pet Grooming Studio",
    "Luxury Skincare Line",
    "AI Resume Builder",
    "Online Coding Bootcamp",
    "Virtual Language Tutor",
    "Green Energy Consultancy"
]

complexity_levels = ["Low", "Medium", "High"]

def generate_description(business_type, complexity):
    if complexity == "Low":
        return f"{business_type} that caters to a small, local customer base with simple offerings."
    elif complexity == "Medium":
        return f"{business_type} that is expanding its reach with moderate digital marketing and a growing online presence."
    else:
        return f"{business_type} that serves international clients with a fully scalable infrastructure and AI-driven analytics."

data = []
for i in range(200):
    business_type = random.choice(business_types)
    complexity = random.choice(complexity_levels)
    description = generate_description(business_type, complexity)
    data.append({"Business Type": business_type, "Complexity": complexity, "Description": description})

df = pd.DataFrame(data)
df.to_csv("synthetic_business_dataset.csv", index=False)
df.head()


Unnamed: 0,Business Type,Complexity,Description
0,AI Resume Builder,High,AI Resume Builder that serves international cl...
1,Fitness Coaching Platform,Medium,Fitness Coaching Platform that is expanding it...
2,Virtual Language Tutor,Low,"Virtual Language Tutor that caters to a small,..."
3,Luxury Skincare Line,Medium,Luxury Skincare Line that is expanding its rea...
4,Virtual Language Tutor,Low,"Virtual Language Tutor that caters to a small,..."


In [16]:
!pip install -q optimum


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/425.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m276.5/425.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m425.8/425.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q auto-gptq


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m123.8 MB/s[0m eta [36m0:00:00[0m
[?25h

**2. MODEL**

In [1]:


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_id = "TheBloke/Nous-Capybara-7B-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print("Model loaded and ready.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Nous-Capybara-7B-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 

generation_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Model loaded and ready.


In [2]:
!pip install -q peft datasets bitsandbytes accelerate trl


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/504.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/504.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m501.8/504.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

2.1


In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

#  Load model
model_id = "TheBloke/Nous-Capybara-7B-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Load synthetic dataset
df_synthetic = pd.read_csv("/content/synthetic_business_dataset.csv")

# Sample 50 rows for faster testing
df_sample = df_synthetic.sample(n=50, random_state=42).reset_index(drop=True)

# Define prompt template
def build_prompt(description):
    return f"""You are a domain name generator expert.

Task: Generate 5 creative, catchy, and brandable domain names that reflect the business idea.

Guidelines:
- Use short, memorable, .com domain names
- Avoid repetition
- Focus on uniqueness and creativity

Business Description: {description}

Domain Names:"""

# Generate domain names
generated_domains = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    prompt = build_prompt(row['Description'])
    outputs = generator(prompt, max_new_tokens=60, do_sample=True, temperature=0.7)
    text_output = outputs[0]["generated_text"].split("Domain Names:")[-1].strip()
    generated_domains.append(text_output)

# Save back to DataFrame
df_sample['Domain'] = generated_domains

#  View sample
df_sample.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Nous-Capybara-7B-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.dow

Unnamed: 0,Business Type,Complexity,Description,Domain
0,Luxury Skincare Line,Medium,Luxury Skincare Line that is expanding its rea...,1. FlawlessCleanse.com\n2. GlowingAura.com\n3....
1,Fitness Coaching Platform,Low,Fitness Coaching Platform that caters to a sma...,1. **FitClaveLocalCoach.com**\n2. **TrainSprin...
2,Virtual Language Tutor,Medium,Virtual Language Tutor that is expanding its r...,1. **LinguaSphereOnline**\n2. **TutorMandarinC...
3,Luxury Skincare Line,Low,"Luxury Skincare Line that caters to a small, l...",1. ArcticSpaCare.com\n2. GlowingLuxe.com\n3. S...
4,Fitness Coaching Platform,Medium,Fitness Coaching Platform that is expanding it...,1. FitnessBuddyCoach.com\n2. TrainWithTony.com...


In [5]:
import re

def parse_domains(raw_output):
    if pd.isna(raw_output):
        return []
    lines = raw_output.strip().split("\n")
    domains = []
    for line in lines:
        line = re.sub(r"^\s*[\-•\d\.\)]*\s*", "", line).strip()  # Strip list markers
        if line and "." in line and len(line) < 40:
            domains.append(line)
    return domains[:5]  # Limit to 5 max

df_sample['Structured_Suggestions'] = df_sample['Domain'].apply(parse_domains)


In [6]:
df_sample.to_csv("gptq_step2.3_structured_output.csv", index=False)
print("Step 2.3 complete. Parsed and structured domain suggestions saved.")


✅ Step 2.3 complete. Parsed and structured domain suggestions saved.


AUGMENTATION



In [7]:
from random import choice

def augment_description(desc):
    templates = [
        f"{desc} with a modern twist.",
        f"A business idea: {desc}",
        f"{desc} aiming to disrupt the market.",
        f"{desc} tailored for Gen Z audiences.",
        f"{desc}, looking to scale globally."
    ]
    return choice(templates)

# Augment 3x for each row
augmented_rows = []

for _, row in df_sample.iterrows():
    for _ in range(3):  # generate 3 variations
        new_row = row.copy()
        new_row["Description"] = augment_description(row["Description"])
        augmented_rows.append(new_row)

df_augmented = pd.DataFrame(augmented_rows)

# Combine original + augmented
df_combined = pd.concat([df_sample, df_augmented], ignore_index=True)
print("Data augmentation complete. Total rows after augmentation:", len(df_combined))


✅ Data augmentation complete. Total rows after augmentation: 200


In [8]:
df_combined.to_csv("augmented_dataset_step2.4.csv", index=False)
print(" Saved: augmented_dataset_step2.4.csv")


💾 Saved: augmented_dataset_step2.4.csv


In [9]:
!pip install peft bitsandbytes accelerate trl




In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load tokenizer
model_id = "TheBloke/Nous-Capybara-7B-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Load augmented data
df = pd.read_csv("augmented_dataset_step2.4.csv")

# Format prompt
def format_prompt(description):
    return f"""You are a domain name expert.

Task: Generate 5 creative, short, brandable .com domain names for the following business:

Business Description: {description}

Domain Names:"""

df['prompt'] = df['Description'].apply(format_prompt)
df['output'] = ""

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df[["prompt", "output"]])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
def tokenize_function(example):
    tokenized = tokenizer(
        example["prompt"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "output"])
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import torch


# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
)

# Apply LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Nous-Capybara-7B-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers

In [16]:
!pip install -U transformers


Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.54.0
    Uninstalling transformers-4.54.0:
      Successfully uninstalled transformers-4.54.0
Successfully installed transformers-4.54.1


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora-domain-model",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    learning_rate=2e-5,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

trainer.train()


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 36.12 MiB is free. Process 344498 has 14.70 GiB memory in use. Of the allocated memory 14.12 GiB is allocated by PyTorch, and 470.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

**3. EVALUATION**

In [8]:
import pandas as pd

# Load your output CSV
df = pd.read_csv("/content/gptq_step2.3_structured_output.csv")

# Basic metrics
def evaluate_domains(row):
    domains = row['Domain'].split('\n')
    domains = [d.strip().lower() for d in domains if d.strip()]
    metrics = {
        "num_domains": len(domains),
        "num_valid_com": sum(".com" in d for d in domains),
        "avg_length": sum(len(d) for d in domains) / len(domains) if domains else 0,
        "unique_domains": len(set(domains)) == len(domains)
    }
    return pd.Series(metrics)

metrics_df = df.apply(evaluate_domains, axis=1)

# Merge metrics
df_eval = pd.concat([df, metrics_df], axis=1)

# Aggregate results
summary = {
    "Avg # Domains Generated": metrics_df["num_domains"].mean(),
    "Avg .com Domains": metrics_df["num_valid_com"].mean(),
    "Avg Domain Length": metrics_df["avg_length"].mean(),
    "% Unique Domains Rows": (metrics_df["unique_domains"].sum() / len(metrics_df)) * 100
}

print("📊 Evaluation Summary:")
for k, v in summary.items():
    print(f"{k}: {v:.2f}")

# Save evaluation result
df_eval.to_csv("gptq_domain_generation_eval.csv", index=False)
print("Evaluation saved to gptq_domain_generation_eval.csv")


📊 Evaluation Summary:
Avg # Domains Generated: 4.90
Avg .com Domains: 1.92
Avg Domain Length: 19.55
% Unique Domains Rows: 100.00
✅ Evaluation saved to gptq_domain_generation_eval.csv


**API ENDPOINT**

In [2]:
pip install flask flask-cors transformers torch


Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

In [3]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

#  Load lightweight model
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

#  Basic safety keywords
INAPPROPRIATE_KEYWORDS = [
    "adult", "nude", "explicit", "porn", "escort", "violence", "weapon", "drugs", "casino", "hate"
]

# Initialize API
app = Flask(__name__)
CORS(app)

@app.route("/generate", methods=["POST"])
def generate_domains():
    data = request.get_json()
    description = data.get("business_description", "")

    # Safety Check
    if any(word in description.lower() for word in INAPPROPRIATE_KEYWORDS):
        return jsonify({
            "suggestions": [],
            "status": "blocked",
            "message": "Request contains inappropriate content"
        })

    # Prompt
    prompt = f"You are a domain name expert. Generate 3 brandable, .com domain names for this business:\n{description}"

    try:
        output = generator(prompt, max_new_tokens=60)[0]["generated_text"]
        domains = [d.strip().lstrip("-•1234567890. ").strip() for d in output.split("\n") if "." in d]
        domains = [d for d in domains if ".com" in d]
        suggestions = [{"domain": d, "confidence": 0.85} for d in domains[:3]]

        return jsonify({
            "suggestions": suggestions,
            "status": "success"
        })

    except Exception as e:
        return jsonify({
            "suggestions": [],
            "status": "error",
            "message": str(e)
        })

#  Run the server
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:7860
 * Running on http://172.28.0.12:7860
INFO:werkzeug:[33mPress CTRL+C to quit[0m
