In [1]:
import random
import pandas as pd
from collections import defaultdict

In [2]:
# Load and prepare data
segmented_url_path = '../Generation/segmented_url.csv'
segmented_url_data = pd.read_csv(segmented_url_path)

# Fallback pool of real "subdomain.sld" strings, used when Markov generation
# fails. Only rows that have BOTH components are eligible. Building the list
# with zip() instead of a row-wise DataFrame.apply avoids the per-row Series
# overhead and also yields a plain empty list when no row qualifies (apply on
# an empty frame hands back a DataFrame, which has no .tolist()).
complete_rows = segmented_url_data.dropna(subset=["subdomain", "sld"])
domain_data = [
    f"{subdomain}.{sld}"
    for subdomain, sld in zip(complete_rows["subdomain"], complete_rows["sld"])
]

# Training strings for the two Markov models (missing values removed, then
# coerced to str so the chain builder can concatenate its padding markers)
subdomains = segmented_url_data['subdomain'].dropna().astype(str).tolist()
slds = segmented_url_data['sld'].dropna().astype(str).tolist()

# Markov chain configuration
# MARKOV_ORDER is the number of preceding characters used as the state.
# NOTE(review): with a 10-character state almost every state has a single
# continuation, so generated components will largely replay training strings
# - confirm that this is the intended trade-off.
MARKOV_ORDER = 10
MIN_LENGTH = 3    # Generated components shorter than this are rejected


In [3]:

def build_markov_chain(strings, order=3):
    """Count character transitions over a collection of strings.

    Every string is framed with ``order`` leading '^' markers and a single
    trailing '$' marker, so the model also learns how strings begin and end.

    Args:
        strings: iterable of str used as training data.
        order: number of preceding characters that form a state.

    Returns:
        Nested defaultdict mapping state -> next character -> occurrence count.
    """
    transitions = defaultdict(lambda: defaultdict(int))
    start_marker = '^' * order
    for text in strings:
        sequence = start_marker + text + '$'
        # `end` indexes the character being predicted; the state is the
        # `order` characters immediately before it.
        for end in range(order, len(sequence)):
            context = sequence[end - order:end]
            transitions[context][sequence[end]] += 1
    return transitions


In [4]:

def generate_component(model, order=3, max_length=25, min_length=None):
    """Sample one string from a character-level Markov chain.

    Generation starts from the all-'^' start state and repeatedly draws the
    next character in proportion to its training count. It stops when the end
    marker '$' is drawn, when the current state has no known continuation, or
    when ``max_length`` characters have been produced.

    Args:
        model: mapping state -> {next_char: count}, as produced by
            build_markov_chain. A plain dict works as well as a defaultdict.
        order: state length; must match the order the model was built with.
        max_length: upper bound on the length of the generated string.
        min_length: shortest acceptable result. ``None`` (default) uses the
            notebook-level MIN_LENGTH setting.

    Returns:
        The generated string, or None when it is shorter than ``min_length``.
    """
    if min_length is None:
        min_length = MIN_LENGTH

    state = '^' * order
    result = []
    while True:
        # .get() instead of model[state]: indexing a defaultdict would insert
        # an empty entry for every unseen state (growing the model as a side
        # effect of sampling) and indexing a plain dict would raise KeyError.
        options = model.get(state)
        if not options:
            break

        next_char = random.choices(
            list(options),
            weights=list(options.values())
        )[0]

        if next_char == '$' or len(result) >= max_length:
            break

        result.append(next_char)
        # Slide the window by one character; written this way the state stays
        # at exactly `order` characters even when order is 0.
        state = (state + next_char)[1:]

    generated = ''.join(result)
    return generated if len(generated) >= min_length else None

In [5]:
# One character-level chain per domain component, both trained at MARKOV_ORDER
subdomain_chain = build_markov_chain(subdomains, MARKOV_ORDER)
sld_chain = build_markov_chain(slds, MARKOV_ORDER)

def generate_domain():
    """Return a "subdomain.sld" string.

    Both components are sampled from their Markov chains (subdomain first,
    then SLD). If either sample is rejected, a real "subdomain.sld" pair is
    drawn from the dataset fallback pool instead.
    """
    components = [
        generate_component(chain, MARKOV_ORDER)
        for chain in (subdomain_chain, sld_chain)
    ]

    # Happy path: both chains produced a usable component
    if all(components):
        return '.'.join(components)

    # Otherwise fall back to an entry taken verbatim from the dataset
    pieces = random.choice(domain_data).split('.', 1)
    return pieces[0] + '.' + pieces[1]

In [6]:

def generate_url():
    """Assemble one synthetic URL around a generated domain.

    The domain comes from generate_domain(); scheme, TLD, port, path, query
    string and fragment are drawn from small fixed vocabularies with fixed
    probabilities (port 20%, query 40%, fragment 30%).
    """
    scheme = random.choice(["http", "https"])
    left_label, remainder = generate_domain().split(".", 1)

    # Host suffix and optional explicit port
    suffix = random.choice(['com', 'org', 'net', 'io', 'gov', 'edu', 'xyz', 'info'])
    if random.random() > 0.8:
        port_part = f":{random.choice([80, 443, 8080])}"
    else:
        port_part = ""

    # Path of zero to three segments; no segments leaves the bare root "/"
    vocabulary = ['home', 'about', 'products', 'services', 'contact',
                  'api', 'data', 'user', 'profile', 'search']
    depth = random.randint(0, 3)
    chosen_segments = [random.choice(vocabulary) for _ in range(depth)]
    path_part = '/' + '/'.join(chosen_segments)

    # Query string: one to three distinct keys, each with a 4-digit value
    query_part = ""
    if random.random() > 0.6:
        keys = random.sample(['id', 'ref', 'session', 'page'], random.randint(1, 3))
        pairs = []
        for key in keys:
            pairs.append(f"{key}={random.randint(1000,9999)}")
        query_part = "?" + "&".join(pairs)

    # Fragment
    if random.random() > 0.7:
        fragment_part = "#" + random.choice(['section', 'top', 'content', 'main'])
    else:
        fragment_part = ""

    return (
        f"{scheme}://{left_label}.{remainder}.{suffix}"
        f"{port_part}{path_part}{query_part}{fragment_part}"
    )

In [7]:
# Sample a batch of URLs from the function-based generator and persist them
# as a single-column CSV
num_urls_to_generate = 2000
synthetic_urls = [
    generate_url()
    for _ in range(num_urls_to_generate)
]

output_path = '../Generation/synthetic_urls_markov.csv'
markov_url_frame = pd.DataFrame({'url': synthetic_urls})
markov_url_frame.to_csv(output_path, index=False)

print(f"Successfully generated {num_urls_to_generate} URLs: {output_path}")

Successfully generated 2000 URLs: ../Generation/synthetic_urls_markov.csv


In [8]:
import random
import pandas as pd
from collections import defaultdict
from urllib.parse import urlparse, parse_qs
import tldextract

class DataDrivenURLGenerator:
    """Generate synthetic URLs whose components are modelled on a CSV dataset.

    Character-level Markov chains are learned for subdomains, second-level
    domains, path segments and query values; protocols, TLDs, ports and
    fragments are sampled from their observed frequencies.

    Args:
        data_path: CSV file with either a 'url' column (parsed on load) or
            pre-segmented columns: protocol, subdomain, sld, tld, port, path,
            query, fragment.
        markov_order: number of preceding characters used as the chain state.
    """

    # Label stored in the port frequency table for rows that carry no port.
    _NO_PORT = 'none'
    # Columns produced when a raw 'url' column is parsed.
    _COMPONENT_COLUMNS = ('protocol', 'subdomain', 'sld', 'tld',
                          'port', 'path', 'query', 'fragment')

    def __init__(self, data_path, markov_order=3):
        self.markov_order = markov_order
        self.data = self._load_and_parse_data(data_path)
        self._build_models()

    def _load_and_parse_data(self, path):
        """Read the CSV and return a frame with one column per URL component.

        Rows without an SLD are dropped; remaining missing values become ''.
        """
        df = pd.read_csv(path)

        if 'url' in df.columns:
            def parse_url(url):
                parsed = urlparse(url)
                ext = tldextract.extract(url)
                try:
                    port = parsed.port
                except ValueError:
                    # urlparse raises on non-numeric / out-of-range ports;
                    # treat such rows as having no port instead of aborting.
                    port = None
                return {
                    'protocol': parsed.scheme or 'https',
                    'subdomain': ext.subdomain,
                    'sld': ext.domain,
                    'tld': ext.suffix,
                    'port': port,
                    'path': parsed.path,
                    'query': parsed.query,
                    'fragment': parsed.fragment
                }

            # One DataFrame construction instead of a per-row pd.Series.
            url_components = pd.DataFrame(
                [parse_url(url) for url in df['url']],
                index=df.index,
                columns=list(self._COMPONENT_COLUMNS),
            )
            # Only add components the CSV does not already provide:
            # concatenating a column that exists would create duplicate
            # labels, and self.data[<col>] would then be a DataFrame.
            new_columns = [c for c in url_components.columns if c not in df.columns]
            df = pd.concat([df, url_components[new_columns]], axis=1)

        return df.dropna(subset=['sld']).fillna('')

    @staticmethod
    def _normalize_port(value):
        """Map a raw port cell to a digit string, or to the no-port label.

        Blank / None / NaN cells become 'none'; numeric cells such as 443.0
        (ports read from CSV become floats once any row is missing) become
        '443'. Anything else is returned as its string form.
        """
        text = str(value).strip()
        if not text or text.lower() in ('none', 'nan'):
            return DataDrivenURLGenerator._NO_PORT
        try:
            return str(int(float(text)))
        except (ValueError, OverflowError):
            return text

    def _build_models(self):
        """Fit every per-component model from self.data."""
        data = self.data

        # Protocol model
        self.protocols = self._calculate_frequencies(data['protocol'])

        # Domain models
        self.subdomain_chain = self._build_markov_chain(
            data['subdomain'].astype(str),
            self.markov_order
        )
        self.sld_chain = self._build_markov_chain(
            data['sld'].astype(str),
            self.markov_order
        )

        # TLD model
        self.tlds = self._calculate_frequencies(data['tld'])

        # Port model: frequencies over normalized ports, including the
        # 'none' label so the share of port-less URLs is learned too.
        self.ports = self._calculate_frequencies(
            data['port'].map(self._normalize_port)
        )

        # Path model. Empty segments are filtered explicitly rather than with
        # Series.replace('', None), whose meaning differs between pandas
        # versions (older releases forward-fill instead of inserting None).
        split_paths = data['path'].astype(str).str.split('/')
        path_segments = [seg for parts in split_paths for seg in parts if seg]
        self.path_chain = self._build_markov_chain(path_segments, self.markov_order)
        self.path_depth = self._calculate_depth_distribution(
            pd.Series(
                [sum(1 for seg in parts if seg) for parts in split_paths],
                dtype='int64',
            )
        )

        # Query model
        self.query_params = self._build_query_model()

        # Fragment model (non-empty fragments only)
        fragments = data['fragment'].astype(str)
        self.fragments = self._calculate_frequencies(fragments[fragments != ''])

    def _build_markov_chain(self, strings, order):
        """Return state -> next_char -> count for the given strings.

        Each string is framed by ``order`` '^' start markers and one '$' end
        marker.
        """
        chain = defaultdict(lambda: defaultdict(int))
        for s in strings:
            padded = '^' * order + str(s) + '$'
            for i in range(len(padded) - order):
                current = padded[i:i+order]
                next_char = padded[i+order]
                chain[current][next_char] += 1
        return chain

    def _calculate_frequencies(self, series):
        """Return the distinct values of ``series`` with their counts."""
        counts = series.value_counts()
        return {
            'items': counts.index.tolist(),
            'weights': counts.values.tolist(),
            'most_common': counts.idxmax() if not counts.empty else ''
        }

    def _calculate_depth_distribution(self, series):
        """Return observed path depths and how often each occurs."""
        depth_counts = series.value_counts().sort_index()
        return {
            'depths': depth_counts.index.tolist(),
            'weights': depth_counts.values.tolist()
        }

    def _build_query_model(self):
        """Learn which query parameters occur and what their values look like.

        Returns:
            {'params': {name: count}, 'values': {name: markov chain}}
        """
        param_counts = defaultdict(int)
        values_by_param = defaultdict(list)

        for query in self.data['query']:
            if not query:
                continue
            for param, values in parse_qs(str(query)).items():
                param_counts[param] += 1
                values_by_param[param].extend(values)

        # Value chains must use the same order as _generate_component, which
        # starts from '^' * markov_order; a chain built at a different order
        # has no such start state and could never produce a value.
        return {
            'params': dict(param_counts),
            'values': {
                param: self._build_markov_chain(values, self.markov_order)
                for param, values in values_by_param.items()
            }
        }

    def _generate_component(self, chain, max_length=25, min_length=2):
        """Sample a string from ``chain``.

        Returns '' when the chain is empty or the sample is shorter than
        ``min_length``.
        """
        if not chain:
            return ''

        state = '^' * self.markov_order
        result = []
        while True:
            options = chain.get(state)
            if not options:
                break

            next_char = random.choices(
                list(options),
                weights=list(options.values())
            )[0]

            if next_char == '$' or len(result) >= max_length:
                break

            result.append(next_char)
            # Slide the window; keeps the state at markov_order characters
            # even when markov_order is 0.
            state = (state + next_char)[1:]

        return ''.join(result) if len(result) >= min_length else ''

    def _sample(self, frequencies, default):
        """Draw one item from a frequency table, or ``default`` if it is empty."""
        if not frequencies['items']:
            return default
        return random.choices(
            frequencies['items'],
            weights=frequencies['weights'],
            k=1
        )[0]

    def _generate_domain(self):
        """Return 'subdomain.sld', or just the SLD when no subdomain is drawn.

        Falls back to a real SLD from the dataset if the SLD chain fails.
        """
        subdomain = self._generate_component(self.subdomain_chain)
        sld = self._generate_component(self.sld_chain)

        if not sld:
            return random.choice(self.data['sld'].dropna().tolist())

        return f"{subdomain}.{sld}" if subdomain else sld

    def _generate_path(self):
        """Return a path with a data-driven number of generated segments."""
        depth = random.choices(
            self.path_depth['depths'],
            weights=self.path_depth['weights'],
            k=1
        )[0] if self.path_depth['depths'] else 0

        segments = []
        for _ in range(depth):
            segment = self._generate_component(self.path_chain)
            if segment:
                segments.append(segment)

        return '/' + '/'.join(segments) if segments else ''

    def _generate_query(self):
        """Return a query string of one to three parameters, or ''."""
        if not self.query_params['params'] or random.random() > 0.6:
            return ''

        params = random.choices(
            list(self.query_params['params'].keys()),
            weights=list(self.query_params['params'].values()),
            k=random.randint(1, 3)
        )

        param_strings = []
        for param in params:
            value_chain = self.query_params['values'].get(param, {})
            # min_length=1: single-character values (e.g. page=1) are valid.
            value = self._generate_component(value_chain, max_length=12, min_length=1)
            param_strings.append(f"{param}={value}" if value else param)

        return '?' + '&'.join(param_strings) if param_strings else ''

    def generate_url(self):
        """Generate one complete synthetic URL."""
        # A blank protocol cell (missing in the CSV) falls back to https.
        protocol = self._sample(self.protocols, 'https') or 'https'

        domain = self._generate_domain()
        tld = self._sample(self.tlds, 'com')
        # Rows without a TLD are stored as ''; do not emit a trailing dot.
        host = f"{domain}.{tld}" if tld else domain

        # Sample from the full port distribution; drawing the no-port label
        # means the URL gets no explicit port, so ports appear as often as
        # they do in the data.
        port = ''
        sampled_port = self._sample(self.ports, self._NO_PORT)
        if sampled_port != self._NO_PORT:
            port = f":{sampled_port}"

        path = self._generate_path()
        query = self._generate_query()

        fragment = ''
        if self.fragments['items'] and random.random() < 0.3:
            fragment = "#" + self._sample(self.fragments, '')

        return f"{protocol}://{host}{port}{path}{query}{fragment}"
# Usage: fit the generator on the segmented dataset, then draw a batch of URLs
generator = DataDrivenURLGenerator('../Generation/segmented_url.csv')
synthetic_urls = [
    generator.generate_url()
    for _ in range(2000)
]

# Save results as a single-column CSV next to the notebook
data_driven_frame = pd.DataFrame({'url': synthetic_urls})
data_driven_frame.to_csv('./data_driven_urls.csv', index=False)

print(f"Generated {len(synthetic_urls)} fully data-driven URLs")

Generated 2000 fully data-driven URLs
