In [5]:

import pandas as pd
import arff
import os 
import re

In [6]:
# Load the lookup tables used later in the notebook.
# dimension.csv is semicolon-separated; sector_dimension.csv uses the default comma.
dimension = pd.read_csv(
    'dimension/dimension.csv',
    sep=';',
)
sector = pd.read_csv('dimension/sector_dimension.csv')

In [7]:
# Clean the .arff files:
# Step 1: Replace stray single lowercase letters that occupy a whole field with 'm'.
# Step 2: Clean the letter 'm' in the Sector column, which should be numeric.
#         We know it is the last column, so an 'm' at the end of the line is
#         converted to '?', the null value for numeric attributes in .arff files.
# Step 3: Remove rows where the country Italy appears in the first column.

# Comma + single lowercase letter, followed by (but not consuming) another comma.
# The lookahead leaves the trailing comma available as the leading comma of the
# next field, so adjacent single-letter fields such as ',a,b,' are both replaced.
_STRAY_LETTER = re.compile(r',[a-z](?=,)')


def _clean_arff_line(line):
    """Apply the three cleaning steps to a single line of an .arff file.

    Parameters
    ----------
    line : str
        One raw line as returned by ``readlines()``.

    Returns
    -------
    str or None
        The cleaned line, or ``None`` when the line has to be dropped because
        its first comma-separated value is exactly ``'Italy'``.
    """
    # Step 3: rows for Italy are discarded, so there is nothing else to clean.
    if line.split(',', 1)[0] == 'Italy':
        return None

    # Step 1: normalise stray single-letter fields to 'm'.
    cleaned = _STRAY_LETTER.sub(',m', line)

    # Step 2: work on the result of Step 1 (not on the raw line) so that the
    # Step 1 substitution is kept. Only the last three characters are inspected,
    # which covers a trailing ',m' plus the newline.
    head, tail = cleaned[:-3], cleaned[-3:]
    if 'm' in tail:
        cleaned = head + tail.replace('m', '?')

    return cleaned


# Make sure the destination folder exists before writing into it.
os.makedirs('data_modified', exist_ok=True)

# Sorted so that files are processed in a reproducible order.
for filename in sorted(os.listdir('data/')):
    source_path = f'data/{filename}'

    # os.listdir also returns sub-directories (e.g. Jupyter's
    # .ipynb_checkpoints); those cannot be opened as files.
    if not os.path.isfile(source_path):
        continue

    with open(source_path, 'r') as rf:
        lines = rf.readlines()

    # Clean every line and drop the ones flagged for removal.
    cleaned_lines = [
        cleaned
        for cleaned in (_clean_arff_line(line) for line in lines)
        if cleaned is not None
    ]

    # Write the modified lines to a new ARFF file with the same name.
    with open(f'data_modified/{filename}', 'w') as wf:
        wf.writelines(cleaned_lines)

In [8]:
# Import financial data

# Mapping from raw variable names to their descriptions. It only depends on the
# metadata table, so it is built once instead of once per file.
mappings = dict(zip(dimension['Variable Name'], dimension['Description']))

# One DataFrame per cleaned .arff file
dfs_list = []

# Sorted so that the row order of the final frame is reproducible.
for filename in sorted(os.listdir('data_modified/')):
    file_path = f'data_modified/{filename}'

    # Skip sub-directories (e.g. .ipynb_checkpoints) that os.listdir returns.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r') as f:
        raw_data = arff.load(f)  # dict obj
        # Store the arff content in a pandas df; the first element of each
        # attribute entry is used as the column name.
        df = pd.DataFrame(
            raw_data['data'],
            columns=[x[0] for x in raw_data['attributes']],
        )

    # Rename columns using the metadata mapping (columns without a mapping
    # keep their original name).
    df = df.rename(columns=mappings)

    # Year and quarter are taken from fixed positions in the file name:
    # characters 0-3 are parsed as the year, characters 5-6 are the quarter.
    # NOTE(review): presumably names look like 'YYYY_Qn...' - confirm.
    df['Year'] = int(filename[:4])
    df['Quarter'] = filename[5:7]

    dfs_list.append(df)

# pd.concat raises an unhelpful error on an empty list; fail with a clear one.
if not dfs_list:
    raise ValueError("No files found in 'data_modified/' to load.")

# Concatenate all DataFrames in the list
final_df = pd.concat(dfs_list, ignore_index=True)

In [9]:
# Attach the sector description to every row.
# The left join keeps rows whose sector code has no match in the lookup table;
# the two join keys are dropped afterwards and the description column is
# exposed as 'sector'.
final_df = (
    final_df
    .merge(sector, how='left', left_on='sectors', right_on='code_sector')
    .drop(['sectors', 'code_sector'], axis=1)
    .rename(columns={'description_sector': 'sector'})
)

In [10]:
# Save the final df
# Create the output folder first so that to_csv does not fail when it is missing.
os.makedirs('output', exist_ok=True)
final_df.to_csv('output/final_df.csv', index=False)