In [4]:
import pandas as pd
from scipy.stats import norm

# Load the saved table
sector_data = pd.read_csv("./output/sector_breach_companies_percentages.csv")

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 0.1
low_percentage_mask = sector_data["percentage_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "industry": "Other",
    "count_all_articles": sector_data.loc[low_percentage_mask, "count_all_articles"].sum(),
    "percentage_all_articles": sector_data.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "count_breach_articles": sector_data.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "percentage_breach_articles": sector_data.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
filtered_sector_data = pd.concat(
    [sector_data.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Calculate the total breach articles count
total_breach_articles = filtered_sector_data["count_breach_articles"].sum()

# Observed and expected proportions in percentages
observed_percentage = filtered_sector_data["percentage_breach_articles"]
expected_percentage = filtered_sector_data["percentage_all_articles"]

# Convert percentages to proportions for calculations
observed_proportion = observed_percentage / 100
expected_proportion = expected_percentage / 100

# Calculate the standard error for proportions
filtered_sector_data["standard_error"] = (
    (expected_proportion * (1 - expected_proportion) / total_breach_articles) ** 0.5
)

# Calculate z-scores
filtered_sector_data["z_score"] = (
    (observed_proportion - expected_proportion) / filtered_sector_data["standard_error"]
)

# Calculate p-values
filtered_sector_data["p_value"] = 2 * (1 - norm.cdf(abs(filtered_sector_data["z_score"])))

# Separate significant and non-significant industries
significance_threshold = 0.05
significant_industries = filtered_sector_data[filtered_sector_data["p_value"] < significance_threshold]
non_significant_industries = filtered_sector_data[filtered_sector_data["p_value"] >= significance_threshold]

# Save the results into separate files
significant_industries.to_csv("./z/significant_industries.csv", index=False)
non_significant_industries.to_csv("./z/non_significant_industries.csv", index=False)

# Print results
print("Significant Industries:")
print(significant_industries[["industry", "z_score", "p_value"]])

print("\nNon-Significant Industries:")
print(non_significant_industries[["industry", "z_score", "p_value"]])

print("\nTables saved to 'significant_industries.csv' and 'non_significant_industries.csv'")


Significant Industries:
                               industry    z_score       p_value
0                              internet   7.327813  2.340350e-13
1                     computer software -20.888279  0.000000e+00
2                hospital & health care -24.556535  0.000000e+00
3                    financial services   5.968999  2.387140e-09
4   information technology and services  -9.391937  0.000000e+00
5                    telecommunications -18.515711  0.000000e+00
7                      higher education -18.462558  0.000000e+00
8                        computer games -17.678375  0.000000e+00
9                     airlines/aviation  -8.197893  2.220446e-16
10                        entertainment   7.406117  1.301181e-13
11            marketing and advertising  25.907941  0.000000e+00
13            government administration  -7.254919  4.019007e-13
14                 consumer electronics  59.716481  0.000000e+00
15                         online media  -6.563292  5.263279e-11
1

In [5]:
from scipy.stats import shapiro

# Example: Check normality for 'percentage_breach_articles'
stat, p_value = shapiro(filtered_sector_data["percentage_breach_articles"])
print("Shapiro-Wilk Test Statistic:", stat)
print("P-value:", p_value)

if p_value > 0.05:
    print("The data appears to be normally distributed (fail to reject H0).")
else:
    print("The data does not appear to be normally distributed (reject H0).")


Shapiro-Wilk Test Statistic: 0.46265022508092146
P-value: 8.99769582528103e-13
The data does not appear to be normally distributed (reject H0).
