In [2]:
def parse_detailed_description(file_path):
    """Parse a "key: description" text file into a nested dictionary.

    The file is read line by line; blank lines are skipped and every other
    line is stripped of surrounding whitespace before being classified:

    * Header line -- contains a colon that appears before any tab
      character. The text before the first colon becomes the key and
      everything after it (further colons included) becomes the description.
    * Value line -- contains a tab. The text before the first tab is the
      code and the remainder is its label; both are stripped, so several
      tabs between code and label are tolerated. The pair is stored under
      the most recent header.
    * Anything else (no colon, no tab) is ignored.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        dict mapping each key to a dict with three entries:
        ``'description'`` (str), ``'values'`` (dict of code -> label) and
        ``'type'`` (``'Categorical'`` when at least one value was collected
        for the key, otherwise ``'Numeric'``).

    Raises:
        OSError: If the file cannot be opened.
    """
    data_dict = {}
    current_key = None

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            colon_at = line.find(':')
            tab_at = line.find('\t')

            # A colon that only shows up after a tab belongs to a value's
            # label (e.g. "1.5Fin<TAB>... story: 2nd level finished"), so
            # such a line must not be mistaken for a new key.
            is_header = colon_at != -1 and (tab_at == -1 or colon_at < tab_at)

            if is_header:
                # Split at the first colon only, so the description keeps
                # any further colons instead of being truncated.
                key, _, description = line.partition(':')
                current_key = key.strip()
                data_dict[current_key] = {
                    'description': description.strip(),
                    'values': {},
                }
            elif tab_at != -1 and current_key is not None:
                # Split at the first tab only; stripping the label absorbs
                # any extra tabs used for column alignment.
                code, _, label = line.partition('\t')
                data_dict[current_key]['values'][code.strip()] = label.strip()

    # Classify every key once parsing is complete: a key that collected
    # coded values is categorical, anything else is treated as numeric.
    for entry in data_dict.values():
        entry['type'] = 'Categorical' if entry['values'] else 'Numeric'

    return data_dict

# Location of the description text file to parse
file_path = 'data_description.txt'
# Build the per-key dictionary (description, values, type) from that file
parsed_data_detailed = parse_detailed_description(file_path)



In [3]:
# Show the parsed dictionary as this cell's output
parsed_data_detailed

{'MSSubClass': {'description': 'Identifies the type of dwelling involved in the sale.',
  'values': {'20': '1-STORY 1946 & NEWER ALL STYLES',
   '30': '1-STORY 1945 & OLDER',
   '40': '1-STORY W/FINISHED ATTIC ALL AGES',
   '45': '1-1/2 STORY - UNFINISHED ALL AGES',
   '50': '1-1/2 STORY FINISHED ALL AGES',
   '60': '2-STORY 1946 & NEWER',
   '70': '2-STORY 1945 & OLDER',
   '75': '2-1/2 STORY ALL AGES',
   '80': 'SPLIT OR MULTI-LEVEL',
   '85': 'SPLIT FOYER',
   '90': 'DUPLEX - ALL STYLES AND AGES',
   '120': '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
   '150': '1-1/2 STORY PUD - ALL AGES',
   '160': '2-STORY PUD - 1946 & NEWER',
   '180': 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
   '190': '2 FAMILY CONVERSION - ALL STYLES AND AGES'},
  'type': 'Categorical'},
 'MSZoning': {'description': 'Identifies the general zoning classification of the sale.',
  'values': {'A': 'Agriculture',
   'C': 'Commercial',
   'FV': 'Floating Village Residential',
   'I': 'Industrial',
   '

In [4]:
import json

# Destination of the serialized dictionary
output_file_path = 'extracted_data_description.json'

# Serialize the parsed dictionary as JSON text indented by 4 spaces
json_data = json.dumps(parsed_data_detailed, indent=4)

# Persist the JSON text to disk
with open(output_file_path, mode='w') as file:
    file.write(json_data)

# Echo the output path as the cell result
output_file_path


'extracted_data_description.json'

In [5]:
import json
# Load the JSON file at output_file_path back into Python objects
with open(output_file_path) as file:
    json_loaded_data = json.loads(file.read())

# Show the reloaded dictionary as the cell result
json_loaded_data

{'MSSubClass': {'description': 'Identifies the type of dwelling involved in the sale.',
  'values': {'20': '1-STORY 1946 & NEWER ALL STYLES',
   '30': '1-STORY 1945 & OLDER',
   '40': '1-STORY W/FINISHED ATTIC ALL AGES',
   '45': '1-1/2 STORY - UNFINISHED ALL AGES',
   '50': '1-1/2 STORY FINISHED ALL AGES',
   '60': '2-STORY 1946 & NEWER',
   '70': '2-STORY 1945 & OLDER',
   '75': '2-1/2 STORY ALL AGES',
   '80': 'SPLIT OR MULTI-LEVEL',
   '85': 'SPLIT FOYER',
   '90': 'DUPLEX - ALL STYLES AND AGES',
   '120': '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
   '150': '1-1/2 STORY PUD - ALL AGES',
   '160': '2-STORY PUD - 1946 & NEWER',
   '180': 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
   '190': '2 FAMILY CONVERSION - ALL STYLES AND AGES'},
  'type': 'Categorical'},
 'MSZoning': {'description': 'Identifies the general zoning classification of the sale.',
  'values': {'A': 'Agriculture',
   'C': 'Commercial',
   'FV': 'Floating Village Residential',
   'I': 'Industrial',
   '