# In [107]:
import pandas as pd
import os
import glob

# Resource types to generate for; one seed file is written per entry.
resource = ['Project','Investigation','Study']
# Number embedded in the generated file name; incremented after each file is written.
index = 102
# Schema version with '_' as separator; used for the 'No <version>' CSV column,
# the output file name and (with '_' replaced by '.') the generated titles.
version="3_3"

def sanitize_str(string):
    """Lower-case ``string`` and turn its dots into underscores.

    Everything up to and including the first '.' is discarded before the
    conversion. A string without any '.' is simply lower-cased.
    """
    _, first_dot, remainder = string.partition('.')
    kept = remainder if first_dot else string
    return kept.lower().replace('.', '_')

def header(s):
    """Return the opening part of a generated Ruby seed file.

    The text starts with a ``puts`` line that names the module-level
    ``version`` (with '_' replaced by '.') and ``s``. It then sets up the
    SampleAttributeType records, the controlled-vocabulary helper and the
    language/country arrays, and ends by opening the
    ``disable_authorization_checks do`` block.

    s -- text inserted after "to" in the ``puts`` line (the script passes a
         resource type name).
    """
    # Doubled braces are literal braces in the Ruby output; the two bare `{}`
    # fields receive the dotted version and `s`, in that order.
    seed_prelude = """puts 'Seeding NFDI4Health MDS {} to {} Extended Metadata ...'

@bool_type = SampleAttributeType.find_or_initialize_by(title: 'Boolean')
@bool_type.update(base_type: Seek::Samples::BaseType::BOOLEAN)

@int_type = SampleAttributeType.find_or_initialize_by(title: 'Integer')
@int_type.update(base_type: Seek::Samples::BaseType::INTEGER, placeholder: '1')

@float_type = SampleAttributeType.find_or_initialize_by(title: 'Real number')
@float_type.update(base_type: Seek::Samples::BaseType::FLOAT, placeholder: '0.5')

@date_type = SampleAttributeType.find_or_initialize_by(title: 'Date')
@date_type.update(base_type: Seek::Samples::BaseType::DATE, placeholder: 'January 1, 2015')

@string_type = SampleAttributeType.find_or_initialize_by(title: 'String')
@string_type.update(base_type: Seek::Samples::BaseType::STRING)

@cv_type = SampleAttributeType.find_or_initialize_by(title: 'Controlled Vocabulary')
@cv_type.update(base_type: Seek::Samples::BaseType::CV)

@text_type = SampleAttributeType.find_or_initialize_by(title: 'Text')
@text_type.update(base_type: Seek::Samples::BaseType::TEXT)

@link_type = SampleAttributeType.find_or_initialize_by(title: 'Web link')

@link_type.update(base_type: Seek::Samples::BaseType::STRING, regexp: URI.regexp(%w(http https)).to_s, placeholder: 'http://www.example.com', resolution: '\\0')

@cv_list_type = SampleAttributeType.find_or_initialize_by(title:'Controlled Vocabulary List')
@cv_list_type.update(base_type: Seek::Samples::BaseType::CV_LIST)

def create_sample_controlled_vocab_terms_attributes(array)
  attributes = []
  array.each do |type|
    attributes << {{ label: type }}
  end
  attributes
end
lang_array =["English","French","German",]
country_code_array = {{"FR"=>"France","DE"=>"Germany","ES"=>"Spain","GB"=>"United Kingdom","US"=>"United States",}}.values

disable_authorization_checks do"""
    dotted_version = version.replace("_", ".")
    return seed_prelude.format(dotted_version, s)
# Closing lines appended to every generated seed file: a completion message and
# the `end` matching the `disable_authorization_checks do` opened by header().
footer="""puts '... Done!'
end"""

# Load every semicolon-separated CSV in the working directory into a
# module-level DataFrame named after the file: extension dropped and '-'
# replaced by '_'. The statements below expect `AD_Design` and `AD_MDS_V3_3`
# to have been created this way.
data = glob.glob('*.csv')

for item in data:
    frame_name = os.path.splitext(item)[0].replace('-', '_')
    globals()[frame_name] = pd.read_csv(item, sep=';', quotechar='"', encoding='utf8')

# Replace the first '1' of every Design number with '1.17'.
# NOTE(review): presumably this nests the Design rows under item 1.17 of the
# main table - confirm against the CSV contents.
# `regex=False` and `n=1` are spelled out so the call does not depend on the
# pandas version's default for `regex`.
number_col = 'No ' + version
AD_Design[number_col] = AD_Design[number_col].str.replace('1', '1.17', n=1, regex=False)
df = pd.concat([AD_MDS_V3_3, AD_Design])

# Maps a value for the 'CM' column to the 'Item' names that receive it.
# Later in the script, 'ms' selects @cv_list_type (instead of @cv_type) for
# CodeableConcept items, and 'text' overrides the item's data type with 'text'.
# Note: 'Design.hypotheses' is also listed in CM_nones below, so its row is
# removed before the 'ms' marker could be applied.
CustomDatatypes = {'ms': ['Resource.languages','Resource.contributors.organisational.fundingIds','Design.studyType.interventional','Design.studyType.nonInterventional',
                          'Design.groupsOfDiseases.generally','Design.groupsOfDiseases.conditions','Design.centers','Design.dataSource.general','Design.dataSource.biosamples',
                          'Design.dataSource.imaging','Design.dataSource.omics','Design.eligibilityCriteria.genders','Design.eligibilityCriteria.exclusionCriteria','Design.population.countries',
                          'Design.hypotheses','Design.interventions.armsLabel','Design.exposures.groupsLabel','Design.assessments','Design.dataSharingPlan.supportingInformation',
                          'Design.nonInterventional','Design.nonInterventional.timePerspectives','Design.nonInterventional.targetFollowUpDuration.frequency',
                          'Design.nonInterventional.biospecimenRetention','Design.interventional','Design.interventional.masking.general','Design.interventional.masking.roles'
                          ],
                  'text': ['Resource.descriptions.text']}

# 'Item' names whose rows are dropped from the combined table, so no
# attribute is generated for them.
CM_nones = ['Resource.classification.typeGeneral','Resource.keywords.code','Resource.nonStudyDetails','Resource.nonStudyDetails.version','Resource.nonStudyDetails.format',
            'Resource.nonStudyDetails.useRights','Resource.nonStudyDetails.useRights.label','Resource.nonStudyDetails.useRights.link','Resource.nonStudyDetails.useRights.confirmations',
            'Resource.nonStudyDetails.useRights.confirmations.authority','Resource.nonStudyDetails.useRights.confirmations.terms','Resource.nonStudyDetails.useRights.confirmations.irrevocability',
            'Resource.nonStudyDetails.useRights.confirmations.supportByLicensing','Resource.nonStudyDetails.useRights.description','Resource.ids',
            'Resource.ids.identifier','Resource.ids.scheme','Resource.ids.relationType','Resource.ids.typeGeneral','Resource.idsNfdi4health','Resource.idsNfdi4health.identifier',
            'Resource.idsNfdi4health.date','Resource.idsNfdi4health.relationType','Resource.provenance','Resource.provenance.dataSource','Resource.provenance.verificationDate','Resource.provenance.verificationUser',
            'Resource.provenance.firstSubmittedDate','Resource.provenance.firstSubmittedUser','Resource.provenance.firstPostedDate','Resource.provenance.firstPostedUser','Resource.provenance.lastUpdateSubmittedDate',
            'Resource.provenance.lastUpdateSubmittedUser','Resource.provenance.lastUpdatePostedDate','Resource.provenance.lastUpdatePostedUser','Resource.provenance.resourceVersion',
            'Design.population.description','Design.population.targetSampleSize','Design.population.obtainedSampleSize','Design.hypotheses','Design.arms','Design.arms.label',
            'Design.arms.type','Design.arms.description','Design.groups','Design.groups.label','Design.groups.description']

# Drop the rows listed in CM_nones. The explicit copy makes the result an
# independent frame, so the `.loc` assignment below does not raise
# SettingWithCopyWarning or risk writing to a temporary.
df = df[~df['Item'].isin(CM_nones)].copy()

# Write each CustomDatatypes key into the 'CM' column of the rows whose 'Item'
# is listed under that key. Keys are applied in dict order, so a later key wins
# if an item were listed twice.
for cm_value, item_names in CustomDatatypes.items():
    df.loc[df['Item'].isin(item_names), 'CM'] = cm_value

# Rows without a marker get the placeholder '...'.
df['CM'] = df['CM'].fillna('...')

# Everything is handled as text from here on (missing cells become 'nan').
df = df.astype(str)

# Map every parent number to the list of its direct children, in table order:
# the parent of 'a.b.c' is 'a.b'.
par_ch = {}
for number in df['No ' + version]:
    parent, dot, _ = number.rpartition('.')
    # A number without a '.' has no parent. The previous guard only looked at
    # the length, so a dot-less value longer than one character (e.g. '10' or
    # 'nan') raised IndexError; such values are now skipped.
    if len(number) <= 1 or not dot:
        continue
    par_ch.setdefault(parent, []).append(number)

# Longest keys first. A parent number is a strict prefix of its children's
# numbers, so every nested group comes before the group that contains it.
dic = dict(sorted(par_ch.items(), key=lambda item: len(item[0]), reverse=True))
keys_set = set(dic.keys())


# Write one Ruby seed file per entry of `resource`.
#
# Each file consists of header(<resource>), one section per entry of `dic`,
# and `footer`. A section contains the SampleControlledVocab statements for
# its CodeableConcept children, followed by an
# `unless ExtendedMetadataType ... end` block that adds one
# ExtendedMetadataAttribute per child.
#
# `dic` is ordered longest key first, so its last entry is treated as the
# top-level type: it gets the title 'Nfdi4Health MDS <version>' and the
# resource itself as supported_type. All other entries become nested
# 'ExtendedMetadata' types.
#
# NOTE(review): CSV text (item names, labels, descriptions, allowed values) is
# interpolated into single-quoted Ruby literals without escaping. A value
# containing an apostrophe would produce invalid Ruby - confirm the CSVs are
# clean.
tn = '\n'
tb = '  '
# Indentation of the continuation lines inside the generated
# SampleControlledVocab statements. Kept as explicit constants so the output
# does not depend on how this script happens to be indented.
cv_indent = ' ' * 22
cv_terms_indent = ' ' * 26
# CSV 'Data Type' -> Ruby variable holding the matching SampleAttributeType.
typedict = {"string": "@string_type", "text": "@text_type", "url": "@link_type", "date": "@date_type",
            "boolean": "@bool_type", "uri": "@string_type", "integer": "@int_type", "decimal": "@float_type"}


def _first_row(number):
    """Return the first row of ``df`` whose 'No <version>' cell equals ``number``.

    Raises IndexError when no row matches. Looking the row up once per item
    avoids rebuilding the boolean mask for every column that is read.
    """
    return df.loc[df['No ' + version] == number].iloc[0]


for val1 in resource:

    # UTF-8 is stated explicitly because the CSVs are read as UTF-8 and the
    # platform default encoding may not be able to represent their text.
    with open(f"./0{index}_MDS_{version}_{val1}_gen.seeds.rb", 'w', encoding='utf-8') as file:
        file.write(header(val1))

        for ind, (key, values) in enumerate(dic.items()):
            BackboneElement_name = _first_row(key)['Item'].replace('.', '_') + '_' + val1
            Section_name = sanitize_str(BackboneElement_name).replace('_', ' ')
            file.write(f"{tn}{tb}# ************* {Section_name} ****************{tn}")

            # Controlled vocabularies for the CodeableConcept children.
            for value in values:
                row = _first_row(value)
                if row['Data Type'] != 'CodeableConcept':
                    continue

                item_cv_name = row['Item'].replace('.', '_') + '_' + val1
                allowed_values = row['Allowed Values'].replace(" \n", "\n").split(tn)
                if allowed_values[0] == "Language names from the ISO 639-1 list":
                    # Ruby array defined by header().
                    cv_terms = "lang_array"
                elif allowed_values[0] == "Country names from the ISO 3166-1 list":
                    # Ruby array defined by header().
                    cv_terms = "country_code_array"
                else:
                    # The Python list repr doubles as a Ruby array literal.
                    cv_terms = f"{tn}{cv_terms_indent}{allowed_values}"
                cv_cmd = (
                    f"{tb}{item_cv_name}_cv = SampleControlledVocab.where(title: '{item_cv_name}').first_or_create!({tn}"
                    f"{cv_indent}sample_controlled_vocab_terms_attributes: "
                    f"create_sample_controlled_vocab_terms_attributes({cv_terms}))"
                )
                # Double quotes are stripped from the statement, as before.
                # Text mode translates '\n' itself, so os.linesep must not be
                # written here (it would yield '\r\r\n' on Windows).
                file.write(cv_cmd.replace('"', '') + tn)

            # Open the extended metadata type block.
            if ind == len(dic) - 1:
                version2 = version.replace("_", ".")
                emt_title = f"Nfdi4Health MDS {version2}"
                emt_supported_type = val1
            else:
                emt_title = BackboneElement_name
                emt_supported_type = 'ExtendedMetadata'
            CMT_string_header = (
                f"{tb}unless ExtendedMetadataType.where(title:'{emt_title}', supported_type:'{emt_supported_type}').any?"
                f"{tn}{tb}{tb}emt = ExtendedMetadataType.new(title: '{emt_title}', supported_type:'{emt_supported_type}')"
            )
            file.write(CMT_string_header)

            # One attribute per child.
            for value in values:
                row = _first_row(value)
                item_name = row['Item'].replace('.', '_') + '_' + val1
                CM_data_type = row['CM']
                # The 'text' marker overrides whatever the CSV says.
                data_type = 'text' if CM_data_type == 'text' else row['Data Type']
                description = row['Data element description (description)'].replace("[RESOURCE]", val1)
                label = (row['Data element heading || Short name to display (display_name)']
                         .replace('\u202f', ' ').replace("[RESOURCE]", val1))
                cardinality = row['Cardinality']
                required = "(*)" if cardinality == "1..1" else ""

                attribute_head = f"emt.extended_metadata_attributes << ExtendedMetadataAttribute.new(title: '{item_name}', "

                if value in keys_set:
                    # The child has children of its own: link to its nested type.
                    if '..*' in cardinality:
                        linked_type = 'Linked Extended Metadata (multiple)'
                    else:
                        linked_type = 'Linked Extended Metadata'
                    CMT_string_line = (
                        attribute_head
                        + f"sample_attribute_type: SampleAttributeType.where(title:'{linked_type}').first , "
                        + f"linked_extended_metadata_type: ExtendedMetadataType.where(title:'{item_name}', supported_type:'ExtendedMetadata').first, "
                        + f"label: '{label+required}' )"
                    )

                elif data_type != "CodeableConcept":
                    # Plain typed attribute; an unknown data type raises KeyError.
                    CMT_string_line = (
                        attribute_head
                        + f"sample_attribute_type: {typedict[data_type]}, sample_controlled_vocab: nil, "
                        + f"description: '{description}', label: '{label+required}')"
                    )

                else:
                    # Controlled vocabulary: list-valued when marked 'ms'.
                    cv_attribute_type = '@cv_list_type' if CM_data_type == "ms" else '@cv_type'
                    CMT_string_line = (
                        attribute_head
                        + f"sample_attribute_type: {cv_attribute_type}, sample_controlled_vocab:{item_name}_cv, "
                        + f"description:'{description}', label:'{label+required}')"
                    )

                file.write(f"{tn}{tb}{tb}" + CMT_string_line)

            file.write(f"{tn}{tb}{tb}emt.save!{tn}{tb}end" + tn)

        file.write(footer + tn)
        index += 1