In [96]:


import pandas as pd
import arff
import os 
import re
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [97]:
# Load the metadata tables: the variable dictionary (semicolon separated)
# and the sector code lookup (default comma separated)
dimension = pd.read_csv('dimension/dimension.csv', sep=';')
sector = pd.read_csv('dimension/sector_dimension.csv')

In [98]:
# Clean the raw .arff files in data/ and write the results to data_modified/:
# Step 1: Replace any stray single lowercase letter that stands in for a value with m
# Step 2: The Sector column (the last one) must be numeric, so an m at the end
#         of a line becomes ?, the null marker for numeric attributes in .arff files
# Step 3: Drop the rows whose first column (the country) is Italy

# Comma + single lowercase letter, followed by a comma. The trailing comma is a
# lookahead on purpose: if the match consumed it, re.sub could not match the
# next value, and in a run of adjacent stray letters (",a,b,") only every
# other one would be replaced.
stray_letter = re.compile(r',[a-z](?=,)')

# Make sure the output directory exists before any file is written into it
os.makedirs('data_modified', exist_ok=True)

for filename in os.listdir('data/'):  # Loop through each file in the directory
    with open(f'data/{filename}', 'r') as rf:  # Open the .arff file
        lines = rf.readlines()

    cleaned_lines = []
    for line in lines:
        # Step 3: skip the row when its first comma-separated value is Italy
        if line.split(',')[0] == 'Italy':
            continue

        # Step 1: turn stray letters into m
        cleaned = stray_letter.sub(',m', line)

        # Step 2: if there is an m in the last three characters (last value plus
        # line ending), turn it into ?. This works on the result of step 1;
        # starting again from the raw line would throw that substitution away.
        if 'm' in cleaned[-3:]:
            cleaned = cleaned[:-3] + cleaned[-3:].replace('m', '?')

        cleaned_lines.append(cleaned)

    # Write the cleaned lines to a new ARFF file with the same name
    with open(f'data_modified/{filename}', 'w') as wf:
        wf.writelines(cleaned_lines)

In [99]:
# Import financial data: load every cleaned .arff file into one DataFrame

# Mapping from raw variable names to their descriptions. It does not depend on
# the file being read, so it is built once instead of once per file.
mappings = dict(zip(dimension['Variable Name'], dimension['Description']))

# One DataFrame per file is collected here and concatenated at the end
dfs_list = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the concatenated frame reproducible across runs and machines
for filename in sorted(os.listdir('data_modified/')):
    with open(f'data_modified/{filename}', 'r') as f:  # Open the .arff file
        raw_data = arff.load(f)  # dict obj with 'attributes' and 'data' entries

        # Store the rows in a pandas df, named after the declared attributes
        df = pd.DataFrame(raw_data['data'], columns=[x[0] for x in raw_data['attributes']])

    # Rename the columns that appear in the variable dictionary
    df = df.rename(columns=mappings)

    # Year and quarter come from fixed positions in the file name
    # (assumes names start like '2019_Q1...' - TODO confirm the naming scheme)
    df['Year'] = int(filename[:4])
    df['Quarter'] = filename[5:7]

    dfs_list.append(df)

# Concatenate all DataFrames in the list
final_df = pd.concat(dfs_list, ignore_index=True)

In [100]:
# Bring in the sector lookup with a left join on the sector code,
# then drop the raw 'sectors' key column
final_df = (
    final_df
    .merge(sector, how='left', left_on='sectors', right_on='code_sector')
    .drop(columns=['sectors'])
)

In [101]:
# Handle missing values
# 'm' marks a missing value. Replace it with an explicit NaN: passing None as
# the value is version dependent (older pandas reads it as "no value given"
# and pad-fills the matches from the previous row instead of nulling them).
final_df = final_df.replace('m', np.nan)
# Keep only the rows where at least 70% of the columns are populated
threshold = int(0.7 * len(final_df.columns))
final_df = final_df.dropna(thresh=threshold).reset_index(drop=True)

In [102]:
# Extract the digits from the 'Quarter' label and convert them to float.
# The pattern is a raw string (a backslash-d in a plain literal is an invalid
# escape sequence), and expand=False makes extract return a Series rather than
# a one-column DataFrame.
final_df['Quarter'] = final_df['Quarter'].str.extract(r'(\d+)', expand=False).astype(float)

# Sort df by year and quarter
final_df = final_df.sort_values(['Year', 'Quarter'])

# Convert every column except the two text columns to float, keeping the text
# columns first; concat aligns the two pieces on the (shared) index
text_cols = ['Country', 'description_sector']
final_df = pd.concat(
    [final_df[text_cols], final_df.drop(text_cols, axis=1).astype(float)],
    axis=1,
)

In [103]:
# Cyclic encoding of Quarter, so that the last quarter of a year sits next to
# the first quarter of the following one (useful for ml tasks)

# Length of one full cycle: 4 quarters in a year
period = 4
# Position of each quarter on the unit circle, in radians
quarter_angle = 2 * np.pi * final_df['Quarter'] / period
# Sine / cosine coordinates of that position
final_df['sin_quarter'] = np.sin(quarter_angle)
final_df['cos_quarter'] = np.cos(quarter_angle)

In [104]:
# Create a df for task 1

# I don't know whether this makes financial sense, but I will remove massive outliers because they look wrong
def remove_outliers_iqr(df, threshold):
    """
    Blank out per-column outliers using the Interquartile Range (IQR) rule.

    For every float column, values below Q1 - threshold * IQR or above
    Q3 + threshold * IQR are replaced by a missing value. Columns of any other
    dtype are copied unchanged, and the input DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean.
    threshold : number
        Multiple of the IQR a value may lie beyond Q1 / Q3 before it counts
        as an outlier.

    Returns
    -------
    pandas.DataFrame
        A copy of df with the outliers of its float columns set to missing.
    """
    cleaned = df.copy()

    for name in df.select_dtypes(include=['float']).columns:
        values = df[name]

        # Quartiles and their spread for this column
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1

        # Flag everything outside [Q1 - t * IQR, Q3 + t * IQR]
        too_low = values < q1 - threshold * spread
        too_high = values > q3 + threshold * spread

        cleaned[name] = values.mask(too_low | too_high, other=None)

    return cleaned

# Task 1 frame: values more than 4 IQRs beyond the quartiles are blanked out
df_task1 = remove_outliers_iqr(final_df, threshold=4)

In [105]:
# Impute missing data by linear interpolation, as it is time series data
# NOTE(review): the interpolation runs down the rows of the whole frame, which
# still mixes all countries and sectors - confirm that filling from
# neighbouring rows (rather than per country/sector series) is intended.

# Float columns are the only ones interpolated
float_cols = df_task1.select_dtypes(include=['float']).columns

# Fill the gaps and write the result back into the same columns
interpolated = df_task1[float_cols].interpolate(method='linear')
df_task1[float_cols] = interpolated

In [106]:
# Impute whatever is still missing after interpolation (for instance gaps at
# the very start of the frame) from the 5 nearest rows
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_task1[float_cols])
df_task1[float_cols] = imputed_values

In [109]:
# Linear regression comes next, so standardize the financial columns
scaler = StandardScaler()

# Every column that is not an identifier or a time / calendar feature
# is treated as a financial column
non_financial_cols = ['Country', 'description_sector', 'Year', 'Quarter',
                      'code_sector', 'sin_quarter', 'cos_quarter']
financial_cols = df_task1.drop(columns=non_financial_cols).columns

# Fit the scaler and overwrite the financial columns with the scaled values
scaled_values = scaler.fit_transform(df_task1[financial_cols])
df_task1[financial_cols] = scaled_values

In [110]:
# Write the prepared task 1 frame to disk, without the index column
output_path = 'output/df_task1.csv'
df_task1.to_csv(output_path, index=False)