<a href="https://colab.research.google.com/github/Pretorian29/Machine-Learning-Repos/blob/main/LyraDataReader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
"""
Script 1

XML reader and extractor code

1. Reads the data in XML file
2. Extracts the data required for the X,Y training of Lyra
3. Stores the result in two separated dataframes
4. Export the results in two CSV files

Paola Montoya 2021
"""
#                                                       Libraries
#------------------------------------------------------------------
from bs4 import BeautifulSoup 
import pandas as pd
#import datetime
from datetime import datetime

pd.set_option('display.max_columns', None) # displays max num of columns
start = datetime.now() # To set a timer for the process

print('\n1. Libraries imported!')


1. Libraries imported!


In [16]:
#                                             Data Reading / Parsing
#------------------------------------------------------------------
# Reading the data inside the xml
data_path = 'full_db.xml'
with open(data_path, 'r', encoding="utf8") as f:
	data = f.read()

# Passing the stored data inside, the beautifulsoup parser, storing the returned object
raw_data = BeautifulSoup(data, "xml")

# Reading the number of records in file
rec_count = raw_data.find('adb_entry_count')
r_c = rec_count.get('count')
print('\n2. Total records in this file: ', r_c)

# Creating the array of objects (records)
v_records = raw_data.find_all('adb_entry')



2. Total records in this file:  69230


In [17]:
#                                   Data Mining (Extraction of x, y)
#------------------------------------------------------------------
input_feat = [] # Lista que contendra input features de cada record (x)
output_feat = [] # Contendra la lista de categorias y eventos (y)

def extractor(v_records):
    
    for record in v_records:  # v_records contiene la lista de Records   
        x_i = {} # This list will store the X values
        y_o = {} # This list will store the Y values
        
        # Record ID extraction
        v_entry_val = record.get('adb_id')
            
        # Gender extraction     
        v_gen_val = record.find('gender') 
        v_sex_val = v_gen_val.get('csex').capitalize()
                
        # Rating extraction
        v_rating_val = record.find('roddenrating').text
           
        # Data type extraction
        v_dtype = record.find('datatype')
        v_dtype_val = v_dtype.get('dtc')
        
        # Add only individuals, known or anonymous
        if int(v_dtype_val) > 4:
            continue
        
        # Birth date details extraction
        v_dob = record.find('sbdate')
        v_dob_val = v_dob.get('iday')
        v_mob_val = v_dob.get('imonth')
        v_yob_val = v_dob.get('iyear')
       
        # Time of birth extraction
        v_tob_val = record.find('sbtime').text #String
                
        # Converting time var into object
        try:
            t_obj = datetime.strptime(v_tob_val, '%H:%M')
        except ValueError as e:
            if e == ValueError:
                t_obj = datetime.strptime(v_tob_val, '%H:%M:%S')                 
            else:
                t_obj = datetime.strptime('12:00', '%H:%M')    
                              
                
        # Splitting Time into Hours:Minutes format
        hour = datetime.strftime(t_obj, "%H")
        minute = datetime.strftime(t_obj, "%M")
        
        #print(t_obj, 'id: ', v_entry_val)
                
        # Place of birth extraction
        v_pob_val = record.find('place').text
        v_cob_val = record.find('country').text
        
        # Astrological Positions
        v_pos = record.find('positions')
        v_asc_val = v_pos.get('asc_sign')
        v_moon_val = v_pos.get('moon_sign')
        v_sun_val = v_pos.get('sun_sign')
       
        # Grouping all the data for each record
        x_i['ID'] = (v_entry_val)
        y_o['ID'] = (v_entry_val)
        
        x_i['Gender'] = (v_sex_val)
        
        #x_i['Type'] = (v_dtype_val)
        
        x_i['Day'] = (v_dob_val) # DOB
        x_i['Month'] = (v_mob_val)
        x_i['Year'] = (v_yob_val)
        
        x_i['Hour'] = (hour) # Hour of birth
        x_i['Minute'] = (minute) # Minute of birth
        x_i['City'] = (v_pob_val) # City
        x_i['Country'] = (v_cob_val) # Country
        
        x_i['ASC'] = str(v_asc_val) # Ascendant sign
        x_i['Moon'] = str(v_moon_val) # Moon sign
        x_i['Sun'] = str(v_sun_val) # Sun sign
        
        # Reading the number of categories in record (internal)
        cats = record.find('categories')    
        #cat_count = cats.get('count')
        #print ('\nCategories found: ', cat_count)
               
        # Reading the number of events in record (internal)
        v_events = record.find_all('event') 
        #event_count = len(v_events)
        #print ('\nTotal events found: ', event_count)
        
        # Filtering the rating of accuracy of records
        if v_rating_val == 'AA' or 'A' or 'B': 
            
            c_l = []
            
            # Categories Extraction
            v_cat = cats.find_all(attrs={'adb_id' : v_entry_val})            
            for c, v_cat_id in enumerate(v_cat):
                v_id = v_cat_id.get('cat_id')
                c_l.append(int(v_id))
            
            y_o['cat'] = c_l
                
            e_l = []
            
            # Events Extraction            
            for e, event in enumerate(v_events):
                v_event_id = event.get('evn_id')
                e_l.append(int(v_event_id))
            
            y_o['eve'] = e_l                     
            
           
            input_feat.append(x_i)
            
            output_feat.append(y_o)    
            
        continue
    
       
extractor(v_records)
print('Features Extracted!')


Features Extracted!


In [18]:
#                                                 DataFrame for X 
#------------------------------------------------------------------    
astro_x_table = pd.DataFrame(input_feat)
print(astro_x_table)

rec_count = len(input_feat)-1
print('\n     Total of valid records: ', rec_count)

          ID Gender Day Month  Year Hour Minute               City  \
0          1      M  12     9  1494   22     00             Cognac   
1          2      M  24     9  1501   18     29              Pavia   
2          3      F  13     4  1519   05     45           Florence   
3          4      M  20     7  1529   01     21             London   
4          5      F   7     9  1533   14     54          Greenwich   
...      ...    ...  ..   ...   ...  ...    ...                ...   
66708  95318      F  29    11  1988   14     03        Morrisville   
66709  95319      F  17     5  1994   18     32              Poway   
66710  95320      F  10     7  1997   07     58  Neuilly sur Seine   
66711  95321      F  28    12  1963   14     50               Agen   
66712  95322      M   7     1  1945   11     47             Maseno   

          Country  ASC Moon  Sun  
0          France  gem  aqu  vir  
1           Italy  tau  pis  lib  
2           Italy  tau  lib  tau  
3         England  

In [19]:
#                                                Data Frame for Y 
#------------------------------------------------------------------    
# Creating the Y dataframe
astro_y_table = pd.DataFrame(output_feat)
print(astro_y_table)

print ('\n3. Astro dataframes X, Y extracted...')

          ID                                                cat  \
0          1                                   [1928, 262, 151]   
1          2                [210, 267, 469, 483, 622, 619, 633]   
2          3                      [86, 190, 196, 198, 151, 631]   
3          4                                         [144, 208]   
4          5  [1, 7, 84, 994, 2906, 52, 152, 1023, 189, 207,...   
...      ...                                                ...   
66708  95318                                   [149, 2960, 547]   
66709  95319                              [176, 177, 4014, 483]   
66710  95320                                              [340]   
66711  95321                                              [399]   
66712  95322                    [149, 193, 198, 208, 496, 2953]   

                   eve  
0                [771]  
1           [794, 771]  
2      [810, 820, 766]  
3                [771]  
4      [796, 991, 771]  
...                ...  
66708               

In [20]:
#------------------------------------------------------------------
# Creation of CSV files of data sets

print('\n     Creating CSV files, please wait...')

astro_x_table.to_csv('X_data.csv', index = True) # CSV file of X dataset
astro_y_table.to_csv('y_data.csv', index = True) # CSV file of y dataset

print('\n     CSV Files created succesfully!')

print ('\n4. Astro dataframes X, Y ready for trining: ')


end = datetime.now()
t_elapsed= end-start

print ('\nTotal time for pre-processing: ', t_elapsed)
print('')
print ('\nEnd of process.')

#------------------------------------------------------------------
__author__ = 'Paola Montoya 2021'


     Creating CSV files, please wait...

     CSV Files created succesfully!

4. Astro dataframes X, Y ready for trining: 

Total time for pre-processing:  0:03:36.363775


End of process.
