In [1]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Number of samples
n = 10000

# Define genotype combinations for parents
flora_genotypes = {
    'AA': {'gametes': ['A'], 'probabilities': [1.0], 'phenotype': 'Purple Flowers'},
    'Aa': {'gametes': ['A', 'a'], 'probabilities': [0.5, 0.5], 'phenotype': 'Purple Flowers'},
    'aa': {'gametes': ['a'], 'probabilities': [1.0], 'phenotype': 'White Flowers'},
    'AABB': {'gametes': ['AB', 'Ab', 'aB', 'ab'], 'probabilities': [0.25, 0.25, 0.25, 0.25], 'phenotype': 'Purple Round'},
    'AaBb': {'gametes': ['AB', 'Ab', 'aB', 'ab'], 'probabilities': [0.25, 0.25, 0.25, 0.25], 'phenotype': 'Purple Round'},
    'AAbb': {'gametes': ['Ab'], 'probabilities': [1.0], 'phenotype': 'Purple Wrinkled'},
    'aaBB': {'gametes': ['aB'], 'probabilities': [1.0], 'phenotype': 'White Round'},
    'aabb': {'gametes': ['ab'], 'probabilities': [1.0], 'phenotype': 'White Wrinkled'}
}

fauna_genotypes = {
    'EETT': {'gametes': ['ET'], 'probabilities': [1.0], 'phenotype': 'TailShort,Brown'},
    'EETt': {'gametes': ['ET', 'Et'], 'probabilities': [0.5, 0.5], 'phenotype': 'TailShort,Brown'},
    'EeTT': {'gametes': ['ET', 'eT'], 'probabilities': [0.5, 0.5], 'phenotype': 'TailShort,Brown'},
    'EeTt': {'gametes': ['ET', 'Et', 'eT', 'et'], 'probabilities': [0.25, 0.25, 0.25, 0.25], 'phenotype': 'TailShort,Brown'},
    'EEtt': {'gametes': ['Et'], 'probabilities': [1.0], 'phenotype': 'TailLong,Brown'},
    'eeTT': {'gametes': ['eT'], 'probabilities': [1.0], 'phenotype': 'TailShort,Black'},
    'eett': {'gametes': ['et'], 'probabilities': [1.0], 'phenotype': 'TailLong,Black'}
}

# Initialize lists for data collection
organism_type = []  # Flora or Fauna
parent1_genotype = []
parent1_gamete = []
parent2_genotype = []
parent2_gamete = []
offspring_genotype = []
offspring_phenotype = []

# Generate dataset
for i in range(n):
    # Randomly select if this is Flora or Fauna
    org_type = np.random.choice(['Flora', 'Fauna'])
    organism_type.append(org_type)
    
    if org_type == 'Flora':
        # Get random genotype for parent 1
        geno1 = np.random.choice(list(flora_genotypes.keys()))
        parent1_genotype.append(geno1)
        
        # Generate gamete based on parent 1 genotype
        gamete1 = np.random.choice(
            flora_genotypes[geno1]['gametes'],
            p=flora_genotypes[geno1]['probabilities']
        )
        parent1_gamete.append(gamete1)
        
        # Get random genotype for parent 2 (same organism type)
        geno2 = np.random.choice(list(flora_genotypes.keys()))
        parent2_genotype.append(geno2)
        
        # Generate gamete based on parent 2 genotype
        gamete2 = np.random.choice(
            flora_genotypes[geno2]['gametes'],
            p=flora_genotypes[geno2]['probabilities']
        )
        parent2_gamete.append(gamete2)
        
        # Determine offspring genotype based on gametes
        if len(gamete1) == 1 and len(gamete2) == 1:
            # Monohybrid cross (A x a)
            off_geno = ''.join(sorted(gamete1 + gamete2))
            # Phenotype follows dominance (A is dominant)
            off_phen = "Purple Flowers" if "A" in off_geno else "White Flowers"
        else:
            # Process gametes for dihybrid cross
            try:
                # For gametes like 'AB' and 'ab'
                first_allele = []
                second_allele = []
                
                # Process first gamete
                if len(gamete1) >= 2:
                    first_allele.append(gamete1[0])
                    second_allele.append(gamete1[1])
                else:
                    first_allele.append(gamete1)
                
                # Process second gamete
                if len(gamete2) >= 2:
                    first_allele.append(gamete2[0])
                    second_allele.append(gamete2[1])
                else:
                    first_allele.append(gamete2)
                
                # Sort alleles for consistent representation
                allele1 = ''.join(sorted(''.join(first_allele)))
                allele2 = ''.join(sorted(''.join(second_allele) if second_allele else ""))
                
                off_geno = allele1 + allele2
                
                # Determine phenotype based on dominance
                has_A = "A" in off_geno
                has_B = "B" in off_geno
                
                if has_A and has_B:
                    off_phen = "Purple Round"
                elif has_A and not has_B:
                    off_phen = "Purple Wrinkled"
                elif not has_A and has_B:
                    off_phen = "White Round"
                else:
                    off_phen = "White Wrinkled"
            except:
                # Fallback for any unexpected issues
                off_geno = gamete1 + gamete2
                off_phen = "Unknown"
            
    else:  # Fauna
        # Get random genotype for parent 1
        geno1 = np.random.choice(list(fauna_genotypes.keys()))
        parent1_genotype.append(geno1)
        
        # Generate gamete based on parent 1 genotype
        gamete1 = np.random.choice(
            fauna_genotypes[geno1]['gametes'],
            p=fauna_genotypes[geno1]['probabilities']
        )
        parent1_gamete.append(gamete1)
        
        # Get random genotype for parent 2 (same organism type)
        geno2 = np.random.choice(list(fauna_genotypes.keys()))
        parent2_genotype.append(geno2)
        
        # Generate gamete based on parent 2 genotype
        gamete2 = np.random.choice(
            fauna_genotypes[geno2]['gametes'],
            p=fauna_genotypes[geno2]['probabilities']
        )
        parent2_gamete.append(gamete2)
        
        # Determine offspring genotype and phenotype
        if len(gamete1) == 2 and len(gamete2) == 2:
            e1, t1 = gamete1[0], gamete1[1]
            e2, t2 = gamete2[0], gamete2[1]
            
            # Sort to ensure consistent representation (e.g. Ee not eE)
            e_alleles = ''.join(sorted(e1 + e2))
            t_alleles = ''.join(sorted(t1 + t2))
            
            off_geno = e_alleles + t_alleles
            
            # Determine phenotype based on dominance (E and T are dominant)
            has_E = 'E' in off_geno
            has_T = 'T' in off_geno
            
            if has_E and has_T:
                off_phen = "TailShort,Brown"
            elif has_E and not has_T:
                off_phen = "TailLong,Brown"
            elif not has_E and has_T:
                off_phen = "TailShort,Black"
            else:
                off_phen = "TailLong,Black"
        else:
            # Handle case where we have unexpected gamete format
            off_geno = gamete1 + gamete2
            
            # Just use simple dominance rules
            has_E = 'E' in off_geno
            has_T = 'T' in off_geno
            
            if has_E and has_T:
                off_phen = "TailShort,Brown"
            elif has_E and not has_T:
                off_phen = "TailLong,Brown"
            elif not has_E and has_T:
                off_phen = "TailShort,Black"
            else:
                off_phen = "TailLong,Black"
    
    # Store offspring data
    offspring_genotype.append(off_geno)
    offspring_phenotype.append(off_phen)

# Create DataFrame
df = pd.DataFrame({
    'OrganismType': organism_type,
    'Parent1Genotype': parent1_genotype,
    'Parent1Gamete': parent1_gamete,
    'Parent2Genotype': parent2_genotype,
    'Parent2Gamete': parent2_gamete,
    'OffspringGenotype': offspring_genotype,
    'OffspringPhenotype': offspring_phenotype
})

# Save to CSV
df.to_csv('combined_genetics_dataset.csv', index=False)

# Display a preview
print("Dataset Preview (10 samples):")
df.head(10)

Dataset Preview (10 samples):


Unnamed: 0,OrganismType,Parent1Genotype,Parent1Gamete,Parent2Genotype,Parent2Gamete,OffspringGenotype,OffspringPhenotype
0,Flora,AABB,ab,aa,a,aab,White Wrinkled
1,Flora,aaBB,aB,aaBB,aB,aaBB,White Round
2,Fauna,EEtt,Et,EeTT,ET,EETt,"TailShort,Brown"
3,Fauna,EeTt,et,EETt,ET,EeTt,"TailShort,Brown"
4,Flora,AA,A,AAbb,Ab,AAb,Purple Wrinkled
5,Flora,AA,A,aaBB,aB,AaB,Purple Round
6,Fauna,EeTt,Et,eeTT,eT,EeTt,"TailShort,Brown"
7,Fauna,EeTT,ET,EeTt,eT,EeTT,"TailShort,Brown"
8,Flora,AaBb,aB,AaBb,Ab,AaBb,Purple Round
9,Fauna,EeTt,et,eeTT,eT,eeTt,"TailShort,Black"
