In [1]:
import sys
functions_at = '/proj/modeling/abhi'
sys.path = [f'{functions_at}'] + sys.path
from model_CNN_source import *
tmp_at = '/proj/tmp_data'
dump_at = '/proj/dump'
source_data = '/proj/source_data/Training_Data'
source_data_2 = '/proj/source_data/Testing_Data'
processed_data = '/proj/processed_data'

In [2]:
## trait_data
trait_data = pd.read_csv(f"{source_data}/1_Training_Trait_Data_2014_2021.csv")
trait_data = trait_data[trait_data.Yield_Mg_ha.notnull()]
## meta data
meta_data = pd.read_csv(f"{source_data}/2_Training_Meta_Data_2014_2021_utf_encoded.csv").add_prefix("mt_dta_")
## geno data
geno_data = pd.read_csv(f"{processed_data}/geno_processed.miss.1.mac.1.biallelic.txt").add_prefix("ge_dta_")
geno_data = geno_data.iloc[:, 0:10] # it haas no missing data
## soil data
soil_data = pd.read_csv(f"{source_data}/3_Training_Soil_Data_2015_2021.csv").add_prefix("sl_dta_")
## weather data
weather_data = pd.read_csv(f"{source_data}/4_Training_Weather_Data_2014_2021.csv").add_prefix("wt_dta_")
## EC data
ec_data = pd.read_csv(f"{source_data}/6_Training_EC_Data_2014_2021.csv").add_prefix("ec_dta_")
# testing data to be provided
submission_data = pd.read_csv(f"{source_data_2}/1_Submission_Template_2022.csv")

In [3]:
# remove columns with excess missing data

# pheno data 
pheno_data = purge_excess_missing(trait_data, id_cols = ["Env", "Year", "Field_Location", "Experiment", "Replicate", "Block", "Plot"], plot = False) # the plot shows missing values in columns not in the id_cols list

# meta data
meta_data_purged = purge_excess_missing(meta_data, id_cols = ["mt_dta_Year", "mt_dta_Env", "mt_dta_Experiment_Code"], plot = False)

# soil_data overview
soil_data_purged = purge_excess_missing(soil_data, id_cols = ["sl_dta_Year", "sl_dta_Env"], plot = False)

# weather data reshape
weather_data.groupby(["wt_dta_Env"])["wt_dta_Env"].count()
weather_data["wt_dta_moth_dy"] = weather_data["wt_dta_Date"].astype(str).str[4:8]
weather_data_wide = weather_data.pivot(index="wt_dta_Env", columns="wt_dta_moth_dy", values=weather_data.columns.tolist()[2:-1])
weather_data_wide.columns = ['_'.join(map(str, x)) for x in weather_data_wide.columns] #212 * (16*366)
weather_data_wide_df = weather_data_wide.reset_index()
weather_data_purged = purge_excess_missing(weather_data_wide_df, id_cols = ["wt_dta_Env"], plot = False)

# ec data
ec_data_purged = purge_excess_missing(ec_data, id_cols = ["ec_dta_Env"], plot = False)

# remove objects to free memory
del trait_data
del meta_data
del soil_data
del weather_data
del weather_data_wide
del ec_data

In [4]:
# remove those environments which are completele missing soil, weather, or ec data
purge_1 = pheno_data[pheno_data.Env.isin(weather_data_purged.wt_dta_Env.unique())]
purge_2 = purge_1[purge_1.Env.isin(ec_data_purged.ec_dta_Env.unique())]
purge_3 = purge_2[purge_2.Env.isin(soil_data_purged.sl_dta_Env.unique())]
purge_4 = purge_3[purge_3.Hybrid.isin(geno_data.ge_dta_Hybrid.unique())]

print(f'pheno_data has {pheno_data.Env.nunique()} environments, purge 1 removes {pheno_data.Env.nunique() - purge_1.Env.nunique()}, purge 2 removes {purge_1.Env.nunique() - purge_2.Env.nunique()}, purge 3 removes {purge_2.Env.nunique() - purge_3.Env.nunique()}, thus the final data has {purge_3.Env.nunique()} environments')
print(f'purge 4 removed {purge_3.shape[0] - purge_4.shape[0]} rows due to non availability of genotypic data. Data now has {purge_4.Env.nunique()} environments and {purge_4.Hybrid.nunique()} unique genotypes in {purge_3.shape[0]} rows')

# we can pull env and soil data from public data bases to put them back in but lets start with a conservative set.

# join data together for the environments with all data available
merged_data = purge_4.merge(soil_data_purged, how="left", left_on="Env", right_on="sl_dta_Env").merge(weather_data_purged, how="left", left_on="Env", right_on="wt_dta_Env").merge(ec_data_purged, how="left", left_on="Env", right_on="ec_dta_Env")

pheno_data has 217 environments, purge 1 removes 5, purge 2 removes 52, purge 3 removes 51, thus the final data has 109 environments
purge 4 removed 1553 rows due to non availability of genotypic data. Data now has 109 environments and 3827 unique genotypes in 69360 rows


In [5]:
# Check missing data freuency for all explanotory variables in each env
id_cols = ["Env"]
data_missing = merged_data[merged_data.columns.difference(id_cols)].isna()
data_missing = pd.concat([merged_data.loc[:, id_cols], data_missing], axis = 1)
data_missing_overview = data_missing.groupby(id_cols[0]).sum()/merged_data.groupby("Env").count() # gives propotions of missing values per env for a given variable
#for i in range(data_missing_overview.shape[0]):
#    print(pd.cut(x=data_missing_overview.iloc[i, :].values.tolist(), bins=[-0.1, 0, 0.20, 0.40, 0.60, 0.80, 1]).unique()) # mostly less than 20 percent missing epr env. so i impute these
data_missing_overview.to_csv(f"{processed_data}/missign_overview_per_env.csv", index = False)

In [6]:
# define coltypes to impute
coltypes = merged_data[merged_data.columns.difference(pheno_data.columns)].dtypes.values.astype("str")
data_select = []
for Type in coltypes:
    if "float64" in Type:
        data_select.append(True)
    elif "int64" in Type :
        data_select.append(True)
    else:
        data_select.append(False)
data_nonselect = [not x for x in data_select]
col_select = merged_data[merged_data.columns.difference(pheno_data.columns)].columns[data_select].tolist()
col_select_with_env = ["Env"]+col_select 
col_nonselect = merged_data[merged_data.columns.difference(pheno_data.columns)].columns[data_nonselect].tolist()
col_nonselect = pheno_data.columns.tolist() + col_nonselect # will not be imputed

merged_data_raw = merged_data.loc[:, col_nonselect]
merged_data_to_impute = merged_data.loc[:, col_select_with_env]

In [7]:
# impute 
store_exceptions = []
merged_data_imputed = merged_data_to_impute.iloc[:, 0:1]
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    for i, col in enumerate(col_select):
        if i%1000 == 0:
            print(f'finished for {i}')
        val = None
        val = merged_data_to_impute.groupby(['Env'])[col].apply(lambda x: x.fillna(x.median()))
        if val.isnull().sum() == 0:
            merged_data_imputed = pd.concat([merged_data_imputed, val], axis = 1) # store only those columns which can be imputed
        else:
            store_exceptions.append(col)
            print(f'{col} has issues with median value calculation. check it manually')
# sanity check
data_missing = merged_data_imputed.isna().sum()/merged_data_imputed.shape[0]
data_missing[data_missing != 0]
final_imputed_data = pd.concat([merged_data_raw, merged_data_imputed[merged_data_imputed.columns.difference(["Env"])]], axis = 1)

finished for 0
sl_dta_% Clay has issues with median value calculation. check it manually
sl_dta_% Sand has issues with median value calculation. check it manually
sl_dta_% Silt has issues with median value calculation. check it manually
sl_dta_%Ca Sat has issues with median value calculation. check it manually
sl_dta_%H Sat has issues with median value calculation. check it manually
sl_dta_%K Sat has issues with median value calculation. check it manually
sl_dta_%Mg Sat has issues with median value calculation. check it manually
sl_dta_%Na Sat has issues with median value calculation. check it manually
sl_dta_1:1 S Salts mmho/cm has issues with median value calculation. check it manually
sl_dta_1:1 Soil pH has issues with median value calculation. check it manually
sl_dta_CEC/Sum of Cations me/100g has issues with median value calculation. check it manually
sl_dta_Calcium ppm Ca has issues with median value calculation. check it manually
sl_dta_E Depth has issues with median value calc

In [8]:
final_imputed_data.to_csv(f"{processed_data}/combined_mat.csv", index = False)

In [9]:
# assuming that we arrive on object called final_data after filtereing and if needed some imputation

final_data = final_imputed_data.loc[:, ['Env', 'Year', "Hybrid"]]

In [10]:
# create train test splits and save them V_1: true cv without a val set. you can sample 10 percent from the trian set to be val set

fold = 10
runs = 100
run = 0
out_dict_1 = {}
while run < runs:
    out_dict_2 = {}
    if run > 0:
        del kf
    kf = KFold(n_splits=fold, random_state=40+(20*run), shuffle=True)
    fold = 0
    for train_index, test_index in kf.split(final_data.index):
        in_dict = {}
        in_dict["train"] = train_index.tolist() # does not work since the train set looses all info 
        in_dict["test"] = test_index.tolist()
        out_dict_2[fold] = in_dict
        fold += 1
    out_dict_1[run] = out_dict_2
    run += 1
# write json file
# write_json(data = out_dict_1, path = f"{processed_data}/train_test_split.json")

In [12]:
# create train test splits and save them V_2. Custom cv with val set

sets={}
hld_year_geno=[]

total_years = final_data.Year.unique()
for year in total_years:
    for rep in range(0,10):
        # test data
        test_data = final_data[final_data.Year.isin([year])]
        test_geno_total = test_data.Hybrid.unique().tolist()
        test_geno_sub = random.sample(test_geno_total, int(len(test_geno_total)*0.2))
        test_set = test_data[test_data.Hybrid.isin(test_geno_sub)]
        
        # val data
        train_data = final_data[~final_data.Year.isin([year])]
        train_data_sub = train_data[~train_data.Hybrid.isin(test_geno_sub)]
        
        val_idx = sorted(random.sample(train_data_sub.index.tolist(), int(0.1*len(train_data_sub.index.tolist()))))
        val_set = train_data_sub.loc[val_idx].copy()
        
        # train_data
        train_set = train_data_sub[~train_data_sub.index.isin(val_idx)]
        
        # sanity checks
        if len(train_set[train_set["Year"].isin(test_set["Year"].unique().tolist())]) !=0:
            print("CONTAMINATED SETS: Year")
        if len(train_set[train_set["Hybrid"].isin(test_set["Hybrid"].unique().tolist())]) !=0:
            print("CONTAMINATED SETS: Genotype")
            
        # produce output
        sets[str(year)+"@"+str(rep)]={"train":train_set.index.tolist(),
                 "val":val_set.index.tolist(),
                 "test":test_set.index.tolist()}
                #record data for diagnostic purposes
        hld_year_geno.append([year, rep, len(test_set["Hybrid"].unique())/len(test_geno_total),
                              len(train_set["Hybrid"].unique())/len(final_data["Hybrid"].unique()),
                              len(train_set), len(val_set), len(test_set)])
print(len(sets))
# write json file
write_json(data = sets, path = f"{processed_data}/train_test_split_v2.json") # can think of saving the hld_year_geno also if needed

70
Done


In [2]:
check = read_json(f"{processed_data}/train_test_split_v2.json")

In [3]:
check

{'2015@0': {'train': [4784,
   4787,
   4788,
   4789,
   4790,
   4791,
   4792,
   4793,
   4794,
   4797,
   4798,
   4799,
   4800,
   4801,
   4802,
   4803,
   4804,
   4805,
   4807,
   4808,
   4810,
   4811,
   4812,
   4813,
   4814,
   4815,
   4816,
   4817,
   4818,
   4819,
   4821,
   4822,
   4823,
   4824,
   4825,
   4826,
   4827,
   4828,
   4829,
   4830,
   4831,
   4832,
   4833,
   4834,
   4835,
   4836,
   4837,
   4838,
   4839,
   4840,
   4841,
   4842,
   4843,
   4845,
   4846,
   4847,
   4848,
   4849,
   4850,
   4851,
   4853,
   4854,
   4855,
   4856,
   4857,
   4858,
   4859,
   4860,
   4861,
   4862,
   4864,
   4865,
   4866,
   4868,
   4870,
   4871,
   4873,
   4874,
   4875,
   4878,
   4880,
   4881,
   4882,
   4883,
   4884,
   4887,
   4888,
   4889,
   4891,
   4892,
   4893,
   4894,
   4895,
   4897,
   4898,
   4899,
   4900,
   4903,
   4904,
   4905,
   4906,
   4907,
   4908,
   4909,
   4910,
   4911,
   4912,
   4913,
   4915,


In [9]:
#todo: create train test splits and save them V_3

#geno_per_env_cutoff=0
genos_per_Env = final_data.pivot_table(index="Env", values="Hybrid", 
                                               aggfunc=lambda x: np.count_nonzero(np.unique(x))).sort_values("Hybrid")
genos_per_Env.reset_index(inplace=True)
print(len(genos_per_Env))

217
