# **Process for Plant Height Phenotype Files**

The first step is to concatenate all downloaded plant height phenotype files (e.g., the “PLNTHT” sheet in 46 IBWSN.xls) into a single file, All.csv, in the following format:

```csv
Trial name,Occ,Loc_no,Cycle,Cid,Sid,Value,Gid
24 ESWYT,1.0,14002.0,2003,61665,1,97,304660
24 ESWYT,1.0,14002.0,2003,8195,5,78,16004
24 ESWYT,1.0,14002.0,2003,160278,68,74,3617481
24 ESWYT,1.0,14002.0,2003,384555,1,72,2668073
24 ESWYT,1.0,14002.0,2003,113388,5,79,1491661
24 ESWYT,1.0,14002.0,2003,162515,1,74,217743
24 ESWYT,1.0,14002.0,2003,67613,97,79,4057141
```

Additionally, concatenate all corresponding CID, SID, and GID values into the following format (CidSidGid.csv):

```csv
CID,SID,GID
61665,1,304660
60115,215,3820651
73574,1908,3829327
74933,481,3833626
```

From the phenotype files whose names include “EnvData”, collect the rows containing the keywords “SOWING_DATE” and “HARVEST_FINISHING_DATE” into two separate files, named SOWING_DATE.csv and HARVEST_FINISHING_DATE.csv, respectively, in the following formats:

```csv
Trial name,Occ,Loc_no,Country,Loc_desc,Cycle,Trait No,Trait name,Value,Unit,Year,Month,Day
24 ESWYT,1,14002,ANGOLA,HUMPATA,2003,3,SOWING_DATE,Mar 1 2003,date,2003,3,1
24 ESWYT,6,11109,ZIMBABWE,CHISIPITE,2003,3,SOWING_DATE,May 8 2003,date,2003,5,8
24 ESWYT,7,22620,PAKISTAN,SAKRAND,2003,3,SOWING_DATE,Dec 6 2003,date,2003,12,6
```

```csv
Trial name,Occ,Loc_no,Country,Loc_desc,Cycle,Trait No,Trait name,Value,Unit,Year,Month,Day
24 ESWYT,1,14002,ANGOLA,HUMPATA,2003,9,HARVEST_FINISHING_DATE,Jul 23 2003,date,2003,7,23
24 ESWYT,6,11109,ZIMBABWE,CHISIPITE,2003,9,HARVEST_FINISHING_DATE,Oct 27 2003,date,2003,10,27
24 ESWYT,8,22607,PAKISTAN,NARC ISLAMABAD,2003,9,HARVEST_FINISHING_DATE,Apr 28 2004,date,2004,4,28
```

In [None]:
import pandas as pd

# Load the concatenated plant-height records and discard every row that has
# at least one missing field.
all_with_gid = pd.read_csv('source_data/All.csv').dropna()

# Occ and Loc_no are converted to plain integers (the sample rows store them
# as 1.0 / 14002.0).
for int_col in ('Occ', 'Loc_no'):
    all_with_gid[int_col] = all_with_gid[int_col].astype(int)

# Rewrite the numeric Gid as the text label 'GID<integer>'.
# NOTE(review): presumably this matches the ID format of the genotype list
# used for filtering in a later step -- confirm.
gid_as_text = all_with_gid['Gid'].astype(int).astype(str)
all_with_gid['Gid'] = 'GID' + gid_as_text

all_with_gid.to_csv('output/AllWithGidDropMissingValues.csv', index=False)

In [3]:
import pandas as pd

allpheno_df = pd.read_csv("output/AllWithGidDropMissingValues.csv")

# Sowing-date table: Year/Month/Day are read as integers, and only the first
# record of each (Occ, Loc_no, Cycle) combination is kept so the key is unique.
sowing_date_df = pd.read_csv(
    "source_data/SOWING_DATE.csv",
    dtype={"Year": int, "Month": int, "Day": int},
)
sowing_date_df = sowing_date_df.drop_duplicates(
    subset=["Occ", "Loc_no", "Cycle"], keep="first"
)

# Lookup table: (Occ, Loc_no, Cycle) -> {"Year": ..., "Month": ..., "Day": ...}
sowing_date_dict = (
    sowing_date_df
    .set_index(["Occ", "Loc_no", "Cycle"])[["Year", "Month", "Day"]]
    .to_dict(orient="index")
)


def get_date(row):
    """Return the (year, month, day) sowing date for one phenotype row.

    The row is matched on its Occ, Loc_no and Cycle values. When no sowing
    record exists for that combination, (None, None, None) is returned.
    """
    date_data = sowing_date_dict.get((row["Occ"], row["Loc_no"], row["Cycle"]))
    if not date_data:
        return None, None, None
    return int(date_data["Year"]), int(date_data["Month"]), int(date_data["Day"])


# One lookup per phenotype row; the three returned values become new columns.
sow_parts = allpheno_df.apply(get_date, axis=1, result_type="expand")
allpheno_df[["SowYear", "SowMonth", "SowDay"]] = sow_parts

allpheno_df.to_csv("output/AllWithGidDropMissingValuesSow.csv", index=False)

In [4]:
import pandas as pd

allpheno_df = pd.read_csv("output/AllWithGidDropMissingValuesSow.csv")

# Harvest-date table: Year/Month/Day are read as integers, and only the first
# record of each (Occ, Loc_no, Cycle) combination is kept so the key is unique.
harvest_date_df = pd.read_csv(
    "source_data/HARVEST_FINISHING_DATE.csv",
    dtype={"Year": int, "Month": int, "Day": int},
)
harvest_date_df = harvest_date_df.drop_duplicates(
    subset=["Occ", "Loc_no", "Cycle"], keep="first"
)

# Lookup table: (Occ, Loc_no, Cycle) -> {"Year": ..., "Month": ..., "Day": ...}
harvest_date_dict = (
    harvest_date_df
    .set_index(["Occ", "Loc_no", "Cycle"])[["Year", "Month", "Day"]]
    .to_dict(orient="index")
)


def get_date(row):
    """Return the (year, month, day) harvest date for one phenotype row.

    The row is matched on its Occ, Loc_no and Cycle values. When no harvest
    record exists for that combination, (None, None, None) is returned.
    """
    key = (row["Occ"], row["Loc_no"], row["Cycle"])
    date_data = harvest_date_dict.get(key)
    if date_data:
        return int(date_data["Year"]), int(date_data["Month"]), int(date_data["Day"])
    return None, None, None


allpheno_df[["HarYear", "HarMonth", "HarDay"]] = allpheno_df.apply(
    get_date, axis=1, result_type="expand"
)

# Drop every row with a missing value in any column; this includes rows for
# which no sowing or harvest date was found. The explicit copy makes the
# column assignments below operate on an independent frame.
allpheno_df = allpheno_df.dropna().copy()

sow_columns = ["SowYear", "SowMonth", "SowDay"]
har_columns = ["HarYear", "HarMonth", "HarDay"]
date_columns = sow_columns + har_columns
allpheno_df[date_columns] = allpheno_df[date_columns].astype(int)

# pd.to_datetime assembles dates directly from year/month/day columns, so the
# components do not need to be joined into strings and parsed again.
sow_dates = pd.to_datetime(
    allpheno_df[sow_columns].rename(
        columns={"SowYear": "year", "SowMonth": "month", "SowDay": "day"}
    )
)
har_dates = pd.to_datetime(
    allpheno_df[har_columns].rename(
        columns={"HarYear": "year", "HarMonth": "month", "HarDay": "day"}
    )
)

# Number of days from sowing to the end of harvest.
allpheno_df["Days"] = (har_dates - sow_dates).dt.days

allpheno_df.to_csv("output/AllWithGidDropMissingValuesSowHar.csv", index=False)

In [5]:
import pandas as pd

def read_txt_file(file_path):
    """Return the set of whitespace-separated tokens found in a text file.

    Tokens may be separated by spaces, tabs or newlines; repeated tokens
    collapse into a single set entry.
    """
    with open(file_path, 'r') as handle:
        return {token for line in handle for token in line.split()}

def filter_csv_file(input_file, txt_file, output_file):
    """Write the rows of a CSV whose 'Gid' appears in a text file of IDs.

    Parameters:
        input_file: path of the CSV to filter; it must have a 'Gid' column.
        txt_file: path of a text file of whitespace-separated IDs to keep.
        output_file: path the retained rows are written to (no index column).
    """
    allowed_ids = read_txt_file(txt_file)
    table = pd.read_csv(input_file)
    keep_mask = table['Gid'].isin(allowed_ids)
    table[keep_mask].to_csv(output_file, index=False)

if __name__ == "__main__":
    # Keep only the phenotype rows whose Gid is listed in the genotype ID file.
    input_csv_file = "output/AllWithGidDropMissingValuesSowHar.csv"
    txt_file = "../2_Geno/output/genotype_ID.txt"
    output_csv_file = "output/AllWithGidDropMissingValuesSowHarFilteredGid.csv"

    filter_csv_file(
        input_file=input_csv_file,
        txt_file=txt_file,
        output_file=output_csv_file,
    )

In [None]:
import pandas as pd

# Split the phenotype rows by whether their Loc_no appears in the 'location'
# column of the weather table: matching rows are kept, the rest are set aside.
file_before2020 = 'output/AllWithGidDropMissingValuesSowHarFilteredGid.csv'
file_weather_data = '../3_Env/output/IWIN_Weather_AgERA5_20210211.csv'
output_file = 'output/AllWithGidDropMissingValuesSowHarFilteredGidLoc.csv'

data_before2020 = pd.read_csv(file_before2020)
data_weather = pd.read_csv(file_weather_data)

# Distinct location IDs on each side. Only the weather set is used by the
# filter below; the phenotype set is kept for interactive inspection.
locations_before2020 = set(data_before2020['Loc_no'])
locations_weather_data = set(data_weather['location'])

# Compute the membership mask once and derive both partitions from it.
has_weather = data_before2020['Loc_no'].isin(locations_weather_data)
cleaned_data = data_before2020[has_weather]
deleted_data = data_before2020[~has_weather]

if not deleted_data.empty:
    print(f"Deleted {len(deleted_data)} rows where 'Loc_no' is not in 'location'.")
    deleted_file_name = 'output/deleted_data.csv'
    deleted_data.to_csv(deleted_file_name, index=False)
    print(f"Deleted rows saved to file: {deleted_file_name}")
else:
    print("All 'Loc_no' values exist in 'location'. No rows were deleted.")

if not cleaned_data.empty:
    print(f"Cleaned {len(cleaned_data)} rows where 'Loc_no' exists in 'location'.")
    cleaned_data.to_csv(output_file, index=False)
    print(f"Cleaned data saved to file: {output_file}")
else:
    # Nothing matched, so nothing is written; an output file left over from
    # an earlier run would be stale.
    print(
        "No rows kept: none of the 'Loc_no' values exist in 'location'. "
        f"{output_file} was not written."
    )

In [7]:
import pandas as pd

df = pd.read_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLoc.csv')

# A record is set aside when any of these hard-coded rules matches:
# more than 270 days between sowing and harvest, a harvest year of 2020 or
# later, or a Value recorded as "-".
too_many_days = df['Days'] > 270
harvest_2020_or_later = df['HarYear'] >= 2020
value_is_dash = df['Value'] == "-"
condition = too_many_days | harvest_2020_or_later | value_is_dash

# Save the flagged records separately.
filtered_df = df.loc[condition]
filtered_df.to_csv('output/Unormal.csv', index=False)

# Everything that was not flagged continues through the pipeline.
df = df.loc[~condition]
df.to_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormal.csv', index=False)

In [None]:
import pandas as pd

df = pd.read_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormal.csv')

# Flag rows that exactly repeat an earlier row (all columns equal); the first
# occurrence of each row is not flagged.
is_repeat = df.duplicated(keep='first')
df_duplicates = df[is_repeat]
df = df[~is_repeat]

# Show a preview of what was removed, if anything.
if not df_duplicates.empty:
    print("The first duplicate row:")
    print(df_duplicates.head())

# The duplicates file is written even when no duplicate was found.
df_duplicates.to_csv('output/Duplicate.csv', index=False)
print("The removed duplicate rows have been saved to Duplicate.csv.")

df.to_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv', index=False)
print("The updated data has been saved to AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv.")

In [9]:
"提取去除重复后的环境型"
import pandas as pd

# List the distinct environments, i.e. the unique Occ/Loc_no/Cycle
# combinations that remain after duplicate removal.
df = pd.read_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv')

# Build the environment key "<Occ>_<Loc_no>_<Cycle>" from the text form of
# the three columns.
key_parts = [df[name].astype(str) for name in ('Occ', 'Loc_no', 'Cycle')]
df['Occ_Loc_no_Cycle'] = key_parts[0].str.cat(key_parts[1:], sep='_')

# Keep the first row seen for each key.
df = df[~df.duplicated(subset='Occ_Loc_no_Cycle', keep='first')]

# Only the key column is exported.
df[['Occ_Loc_no_Cycle']].to_csv('output/UniqueOccLocCycle.csv', index=False)

In [10]:
"提取去除重复后的基因型"
import pandas as pd

# List the distinct genotypes, i.e. the unique Gid values that remain after
# duplicate removal.
df = pd.read_csv('output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv')

# First occurrence of every Gid, in file order.
first_seen = ~df['Gid'].duplicated(keep='first')
unique_gid_series = df.loc[first_seen, 'Gid']

new_df = unique_gid_series.to_frame(name='Gid')
new_df.to_csv('output/UniqueGid.csv', index=False)