# Analysis on the companies.jsonl dataset

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

## Data cleaning

In [14]:
data = []
with open("companies.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
df = pd.json_normalize(data)
df = df.drop(columns=["div_yield",'weekly_prices', "P1M", "P6M", "P1Y"])

# Takes first element for columns that have lists as values
df = df.applymap(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
df = df.replace("n.a.", np.nan)
df = df.replace("n.s.", np.nan)

# Removing non italian companies
df = df[df["isin"].str.startswith("IT")]
df = df.drop(columns="isin")

df

  df = df.applymap(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
  df = df.replace("n.a.", np.nan)
  df = df.replace("n.s.", np.nan)


Unnamed: 0,name,ticker,market_cap,p_e,sector,sub_sector,revenues,ebitda,profit,assets,...,ros,roa,roe,debt_equity,debt_ebitda,employees,tang_assets,shf_liabilities,working_capital,retained_earnings
0,Alkemy,ALK,6.282000e+07,53.18,Tecnologia,Information technology,1.150370e+08,11918000.0,3535000.0,1.303190e+08,...,5.84,5.29,7.41,0.63,2.53,448.0,7859000.0,0.67,18590000.0,11332000.0
1,Aedes,AEDES,5.280000e+06,7.42,Immobiliare,REIT - Retail,6.800000e+04,-1558000.0,-1580000.0,8.589000e+06,...,,-19.54,-20.21,0.02,-0.08,2.0,729000.0,11.86,7153000.0,3391000.0
2,Amplifon,AMP,6.070000e+09,39.19,Salute,Distribuzione servizi medici,2.260084e+09,526849000.0,155025000.0,3.693215e+09,...,11.42,7.02,14.07,1.07,2.23,14379.0,699669000.0,450.00,-463182000.0,809643000.0
3,Autostrade Merid,AUTME,1.138000e+07,7.03,Beni Industriali,Ingegneria e Costruzione,0.000000e+00,2867000.0,1451000.0,5.955000e+07,...,,4.79,2.94,0.02,0.40,2.0,345000.0,9.22,53564000.0,0.0
6,Brunello Cucinelli,BC,8.880000e+09,78.11,Beni di Consumo Ciclici,Beni di lusso,1.139420e+09,333751000.0,123809000.0,1.379880e+09,...,16.41,13.58,27.29,1.48,2.01,2623.0,724159000.0,0.50,173044000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,Somec,SOM,8.211000e+07,37.17,Beni Industriali,Ingegneria e Costruzione,3.676580e+08,10612000.0,-10374000.0,3.288450e+08,...,-1.90,-2.14,-49.49,7.32,14.47,1015.0,,0.07,-17358000.0,0.0
817,Spindox,SPN,5.467000e+07,100.52,Tecnologia,Information technology,9.681339e+07,6526720.0,254946.0,8.284641e+07,...,2.86,3.45,1.66,1.25,2.93,1300.0,,0.32,18057590.0,6605054.0
818,Star7,STAR7,5.670000e+07,28.65,Beni Industriali,Servizi Aziendali Speciali,1.044056e+08,14550930.0,2231713.0,1.056898e+08,...,6.04,6.02,7.00,0.84,1.85,1092.0,,0.50,19132192.0,0.0
819,Sys-Dat,SYS,1.861000e+08,36.74,Tecnologia,Information technology,2.371700e+07,5197000.0,2383000.0,3.994700e+07,...,16.58,9.87,16.08,0.89,2.53,128.0,,0.73,6808000.0,0.0


## Altman Z-Score
$$
    Z = 1.2 X_1 + 1.4  X_2 + 3.3  X_3 + 0.6  X_4 + 1.0  X_5
$$

In [15]:
# Step 1: Calculate Total Liabilities
df["total_debt"] = df["debt_equity"] * df["sh_funds"]
df["total_liabilities"] = df["total_debt"] + (
    df["assets"] - df["sh_funds"] - df["total_debt"]
)

# Step 2: Compute Altman Z-Score Components
df["x1"] = df["working_capital"] / df["assets"]
df["x2"] = df["sh_funds"] / df["assets"]
df["x3"] = df["ebitda"] / df["assets"]  # Approximating EBIT with EBITDA
df["x4"] = df["market_cap"] / df["total_liabilities"]
df["x5"] = df["revenues"] / df["assets"]

# Step 3: Calculate Altman Z-Score
df["z_score"] = (
    (1.2 * df["x1"])
    + (1.4 * df["x2"])
    + (3.3 * df["x3"])
    + (0.6 * df["x4"])
    + (1.0 * df["x5"])
)

df = df.drop(columns=["x1", "x2", "x3", "x4", "x5"])

df

Unnamed: 0,name,ticker,market_cap,p_e,sector,sub_sector,revenues,ebitda,profit,assets,...,debt_equity,debt_ebitda,employees,tang_assets,shf_liabilities,working_capital,retained_earnings,total_debt,total_liabilities,z_score
0,Alkemy,ALK,6.282000e+07,53.18,Tecnologia,Information technology,1.150370e+08,11918000.0,3535000.0,1.303190e+08,...,0.63,2.53,448.0,7859000.0,0.67,18590000.0,11332000.0,3.006108e+07,8.260300e+07,2.324617
1,Aedes,AEDES,5.280000e+06,7.42,Immobiliare,REIT - Retail,6.800000e+04,-1558000.0,-1580000.0,8.589000e+06,...,0.02,-0.08,2.0,729000.0,11.86,7153000.0,3391000.0,1.563200e+05,7.730000e+05,5.781005
2,Amplifon,AMP,6.070000e+09,39.19,Salute,Distribuzione servizi medici,2.260084e+09,526849000.0,155025000.0,3.693215e+09,...,1.07,2.23,14379.0,699669000.0,450.00,-463182000.0,809643000.0,1.178795e+09,2.591537e+09,2.755175
3,Autostrade Merid,AUTME,1.138000e+07,7.03,Beni Industriali,Ingegneria e Costruzione,0.000000e+00,2867000.0,1451000.0,5.955000e+07,...,0.02,0.40,2.0,345000.0,9.22,53564000.0,0.0,9.875400e+05,1.017300e+07,3.070277
6,Brunello Cucinelli,BC,8.880000e+09,78.11,Beni di Consumo Ciclici,Beni di lusso,1.139420e+09,333751000.0,123809000.0,1.379880e+09,...,1.48,2.01,2623.0,724159000.0,0.50,173044000.0,0.0,6.713472e+08,9.262670e+08,7.986742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,Somec,SOM,8.211000e+07,37.17,Beni Industriali,Ingegneria e Costruzione,3.676580e+08,10612000.0,-10374000.0,3.288450e+08,...,7.32,14.47,1015.0,,0.07,-17358000.0,0.0,1.534345e+08,3.078840e+08,1.410432
817,Spindox,SPN,5.467000e+07,100.52,Tecnologia,Information technology,9.681339e+07,6526720.0,254946.0,8.284641e+07,...,1.25,2.93,1300.0,,0.32,18057590.0,6605054.0,1.918820e+07,6.749586e+07,2.435514
818,Star7,STAR7,5.670000e+07,28.65,Beni Industriali,Servizi Aziendali Speciali,1.044056e+08,14550930.0,2231713.0,1.056898e+08,...,0.84,1.85,1092.0,,0.50,19132192.0,0.0,2.677365e+07,7.381640e+07,2.542485
819,Sys-Dat,SYS,1.861000e+08,36.74,Tecnologia,Information technology,2.371700e+07,5197000.0,2383000.0,3.994700e+07,...,0.89,2.53,128.0,,0.73,6808000.0,0.0,1.319336e+07,2.512300e+07,6.191605


## Computing features
$$
    \text{EBITDA Margin} = \frac{\text{EBITDA}}{\text{Revenues}}
$$
$$
    \text{Net Profit Margin} = \frac{\text{Profit}}{\text{Revenues}}
$$
$$
    \text{Asset turnover} = \frac{\text{Revenues}}{\text{Assets}}
$$

In [16]:
df["ebitda_margin"] = df["ebitda"] / df["revenues"]
df["net_prof_margin"] = df["profit"] / df["revenues"]
df["asset_turnover"] = df["revenues"] / df["assets"]

# Convert only numeric columns to float while preserving strings
df_numeric = df.apply(pd.to_numeric, errors="coerce")  # Convert only numeric values

# Find rows that contain inf
inf_rows = np.isinf(df_numeric.to_numpy()).any(axis=1)

# Remove rows containing inf, keeping string values untouched
df = df.loc[~inf_rows]

df

Unnamed: 0,name,ticker,market_cap,p_e,sector,sub_sector,revenues,ebitda,profit,assets,...,tang_assets,shf_liabilities,working_capital,retained_earnings,total_debt,total_liabilities,z_score,ebitda_margin,net_prof_margin,asset_turnover
0,Alkemy,ALK,6.282000e+07,53.18,Tecnologia,Information technology,1.150370e+08,11918000.0,3535000.0,1.303190e+08,...,7859000.0,0.67,18590000.0,11332000.0,3.006108e+07,8.260300e+07,2.324617,0.103601,0.030729,0.882734
1,Aedes,AEDES,5.280000e+06,7.42,Immobiliare,REIT - Retail,6.800000e+04,-1558000.0,-1580000.0,8.589000e+06,...,729000.0,11.86,7153000.0,3391000.0,1.563200e+05,7.730000e+05,5.781005,-22.911765,-23.235294,0.007917
2,Amplifon,AMP,6.070000e+09,39.19,Salute,Distribuzione servizi medici,2.260084e+09,526849000.0,155025000.0,3.693215e+09,...,699669000.0,450.00,-463182000.0,809643000.0,1.178795e+09,2.591537e+09,2.755175,0.233110,0.068593,0.611956
6,Brunello Cucinelli,BC,8.880000e+09,78.11,Beni di Consumo Ciclici,Beni di lusso,1.139420e+09,333751000.0,123809000.0,1.379880e+09,...,724159000.0,0.50,173044000.0,0.0,6.713472e+08,9.262670e+08,7.986742,0.292913,0.108660,0.825738
11,Digitouch,DGT,2.479000e+07,18.35,Servizi alla Comunicazione,Agenzie Pubblicitarie,4.199700e+07,6901000.0,1994000.0,5.554300e+07,...,2427000.0,0.56,5391000.0,1666000.0,1.715896e+07,3.668700e+07,2.163309,0.164321,0.047480,0.756117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,Somec,SOM,8.211000e+07,37.17,Beni Industriali,Ingegneria e Costruzione,3.676580e+08,10612000.0,-10374000.0,3.288450e+08,...,,0.07,-17358000.0,0.0,1.534345e+08,3.078840e+08,1.410432,0.028864,-0.028216,1.118028
817,Spindox,SPN,5.467000e+07,100.52,Tecnologia,Information technology,9.681339e+07,6526720.0,254946.0,8.284641e+07,...,,0.32,18057590.0,6605054.0,1.918820e+07,6.749586e+07,2.435514,0.067415,0.002633,1.168589
818,Star7,STAR7,5.670000e+07,28.65,Beni Industriali,Servizi Aziendali Speciali,1.044056e+08,14550930.0,2231713.0,1.056898e+08,...,,0.50,19132192.0,0.0,2.677365e+07,7.381640e+07,2.542485,0.139369,0.021375,0.987850
819,Sys-Dat,SYS,1.861000e+08,36.74,Tecnologia,Information technology,2.371700e+07,5197000.0,2383000.0,3.994700e+07,...,,0.73,6808000.0,0.0,1.319336e+07,2.512300e+07,6.191605,0.219126,0.100476,0.593712


## Exporting

In [19]:
df.to_csv("data.csv", index=False)