# **Process for Wheat Yield Genotype Data**

This step requires the file F_MAF0.01_Miss50_Het10-Merged.all.discover.lines.and.selection.candidates.vcf.imputed.CIMMYT.vcf.gz to be located in the source_data folder. First, run the script 1_geno_processing.sh, and then execute the notebook 2_geno_processing.ipynb.

In [None]:
import csv

def transpose_and_rename_csv(input_file, output_file):
    """Transpose a CSV file and write the result to a new CSV file.

    The whole input is read into memory, its top-left cell is overwritten
    with ``"Gid"``, and rows and columns are swapped before writing.

    Blank lines in the input are ignored. Rows of unequal length are padded
    with empty strings, so no cell is dropped and a short row no longer
    aborts the run.

    Failures to read, parse or write are reported with a printed message
    rather than raised, so the function always returns ``None``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create or overwrite.
    """
    # Imported here so the function does not rely on a new file-level import.
    from itertools import zip_longest

    try:
        # newline='' hands line-ending handling to the csv module, which is
        # required for quoted fields that contain line breaks.
        with open(input_file, 'r', newline='') as csvfile:
            data = [row for row in csv.reader(csvfile) if row]

        if not data:
            raise ValueError(f'{input_file} contains no rows to transpose')

        # The top-left cell becomes the name of the first output column.
        data[0][0] = "Gid"

        # zip_longest pads ragged rows instead of truncating or failing.
        transposed_data = zip_longest(*data, fillvalue='')

        with open(output_file, 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows(transposed_data)
    except (OSError, csv.Error, ValueError) as e:
        # Only I/O, CSV parsing and decoding/validation problems are reported
        # here; anything else is a programming error and should propagate.
        print(f'Transposition failed: {e}')
    else:
        print(f'Transposition successful. Result saved to {output_file}')

# Transpose the genotype matrix produced by the shell preprocessing step.
input_file, output_file = 'output/genotype.csv', 'output/genotype_T.csv'
transpose_and_rename_csv(input_file=input_file, output_file=output_file)


In [None]:
import pandas as pd

# Load the transposed genotype table and the table listing the Gids to keep
genotype_data = pd.read_csv("output/genotype_T.csv")
unique_gid_data = pd.read_csv("../1_Pheno/output/UniqueGid.csv")

# Gather the wanted Gids into a set for the membership test below
unique_gid_set = set(unique_gid_data["Gid"])

# Boolean mask: True for genotype rows whose Gid is in the wanted set
gid_is_wanted = genotype_data["Gid"].isin(unique_gid_set)
filtered_genotype_data = genotype_data.loc[gid_is_wanted]

# Write the trimmed table without the DataFrame index column
filtered_genotype_data.to_csv("output/genotype_T_trimed.csv", index=False)

In [None]:
import pandas as pd

def merge_genotype_data(allpheno_filename, genotype_filename, output_filename):
    """Append each phenotype row's genotype columns and save the result.

    Both inputs are CSV files with a ``Gid`` column. For every row of the
    phenotype file, the genotype row with the same ``Gid`` is looked up and
    its columns (all genotype columns except ``Gid``) are placed after the
    phenotype columns. Row order and row count follow the phenotype file.

    The lookup is done for all rows at once, which keeps the column dtypes
    read from the files (a per-row loop over ``iterrows`` would upcast them)
    and avoids building one Python list per row.

    Args:
        allpheno_filename: Path of the phenotype CSV file.
        genotype_filename: Path of the genotype CSV file.
        output_filename: Path of the merged CSV file to write.

    Raises:
        KeyError: If a phenotype ``Gid`` has no row in the genotype file.
        ValueError: If a ``Gid`` needed by the phenotype file occurs more
            than once in the genotype file, so the lookup is ambiguous.
    """
    allpheno_df = pd.read_csv(allpheno_filename)
    genotype_df = pd.read_csv(genotype_filename).set_index("Gid")

    gids = allpheno_df["Gid"]

    # Fail before writing anything when a phenotype row cannot be matched.
    missing = gids[~gids.isin(genotype_df.index)].unique().tolist()
    if missing:
        raise KeyError(f"Gid values missing from {genotype_filename}: {missing}")

    # Only genotype rows that are actually referenced need to be unique.
    relevant = genotype_df[genotype_df.index.isin(gids)]
    if not relevant.index.is_unique:
        duplicated = relevant.index[relevant.index.duplicated()].unique().tolist()
        raise ValueError(
            f"Gid values duplicated in {genotype_filename}: {duplicated}"
        )

    # One genotype row per phenotype row, in phenotype order; resetting the
    # index lines the rows up positionally with allpheno_df for the concat.
    genotype_rows = relevant.reindex(gids).reset_index(drop=True)
    merged_df = pd.concat([allpheno_df, genotype_rows], axis=1)

    merged_df.to_csv(output_filename, index=False)

if __name__ == "__main__":
    # Phenotype table, trimmed genotype table, and destination of the merge.
    allpheno_filename = "../1_Pheno/output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv"
    genotype_filename = "output/genotype_T_trimed.csv"
    output_filename = "output/YieldGeno.csv"

    merge_genotype_data(
        allpheno_filename=allpheno_filename,
        genotype_filename=genotype_filename,
        output_filename=output_filename,
    )