## Industry

In [62]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
sector_data = pd.read_csv("./output/sector_article_percentages.csv")

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 0.1
low_percentage_mask = sector_data["percentage_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "industry": "Other",
    "count_all_articles": sector_data.loc[low_percentage_mask, "count_all_articles"].sum(),
    "percentage_all_articles": sector_data.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "count_breach_articles": sector_data.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "percentage_breach_articles": sector_data.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
filtered_sector_data = pd.concat(
    [sector_data.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Observed (count_breach_articles) and Expected (count_all_articles)
observed = filtered_sector_data["count_breach_articles"]
expected = filtered_sector_data["percentage_all_articles"]

# Ensure observed and expected have matching totals
expected = (expected / 100) * observed.sum()


# Perform the chi-squared goodness-of-fit test with normalized expected counts
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)



# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_data.to_csv("filtered_sector_data.csv", index=False)
print("Filtered table saved to 'filtered_sector_data.csv'")


Chi-squared Statistic: 4020.164136772257
P-value: 0.0
Filtered table saved to 'filtered_sector_data.csv'


In [58]:
# importing packages
import scipy.stats as stats
import numpy as np

# no of hours a student studies
# in a week vs expected no of hours
observed_data = [8, 6, 10, 7, 8, 11, 9]
expected_data = [9, 8, 11, 8, 10, 7, 6]

print(sum(observed_data))
print(sum(expected_data))


# Chi-Square Goodness of Fit Test
chi_square_test_statistic, p_value = stats.chisquare(
    observed_data, expected_data)

# chi square test statistic and p value
print('chi_square_test_statistic is : ' +
      str(chi_square_test_statistic))
print('p_value : ' + str(p_value))


# find Chi-Square critical value
print(stats.chi2.ppf(1-0.05, df=6))

59
59
chi_square_test_statistic is : 5.0127344877344875
p_value : 0.542180861413329
12.591587243743977


In [66]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
sector_percentages = pd.read_csv("./output/sector_article_percentages.csv")

# Calculate articles that do not contain data breaches
sector_percentages["count_non_breach_articles"] = (
    sector_percentages["count_all_articles"] - sector_percentages["count_breach_articles"]
)

# Calculate percentages for non-breach articles
sector_percentages["percentage_non_breach_articles"] = (
    sector_percentages["count_non_breach_articles"] /
    sector_percentages["count_non_breach_articles"].sum() * 100
)

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 0.1
low_percentage_mask = sector_percentages["percentage_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "industry": "Other",
    "count_all_articles": sector_percentages.loc[low_percentage_mask, "count_all_articles"].sum(),
    "count_breach_articles": sector_percentages.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "count_non_breach_articles": sector_percentages.loc[low_percentage_mask, "count_non_breach_articles"].sum(),
    "percentage_all_articles": sector_percentages.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "percentage_breach_articles": sector_percentages.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
    "percentage_non_breach_articles": sector_percentages.loc[low_percentage_mask, "percentage_non_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
filtered_sector_percentages = pd.concat(
    [sector_percentages.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Observed (count_breach_articles) and Expected (count_all_articles)
observed = filtered_sector_percentages["count_breach_articles"]
expected = filtered_sector_percentages["percentage_non_breach_articles"]

# Ensure observed and expected have matching totals
expected = (expected / 100) * observed.sum()

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("filtered_sector_percentages_with_non_breach.csv", index=False)
print("Filtered table saved to 'filtered_sector_percentages_with_non_breach.csv'")


Chi-squared Statistic: 7249.712916586258
P-value: 0.0
Filtered table saved to 'filtered_sector_percentages_with_non_breach.csv'


## year founded

In [35]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
year_founded_percentages = pd.read_csv("./output/year_founded_article_percentages.csv")


# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = year_founded_percentages["percentage_breach_articles"]
expected = year_founded_percentages["percentage_all_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
year_founded_percentages.to_csv("filtered_year_founded_percentages.csv", index=False)
print("Filtered table saved to 'filtered_year_founded_percentages.csv'")


Chi-squared Statistic: 24.19254958244882
P-value: 0.9908368496212323
Filtered table saved to 'filtered_year_founded_percentages.csv'


In [None]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
year_founded_percentages = pd.read_csv("./output/year_founded_article_percentages.csv")

# Calculate articles that do not contain data breaches
year_founded_percentages["count_non_breach_articles"] = (
    year_founded_percentages["count_all_articles"] - year_founded_percentages["count_breach_articles"]
)

# Calculate percentages for non-breach articles
year_founded_percentages["percentage_non_breach_articles"] = (
    year_founded_percentages["count_non_breach_articles"] /
    year_founded_percentages["count_non_breach_articles"].sum() * 100
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = year_founded_percentages["percentage_breach_articles"]
expected = year_founded_percentages["percentage_non_breach_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("year_founded_percentages_with_non_breach.csv", index=False)
print("Filtered table saved to 'year_founded_percentages_with_non_breach.csv'")


Chi-squared Statistic: 39.0721188511853
P-value: 0.6423164631321804
Filtered table saved to 'year_founded_percentages_with_non_breach.csv'


## size range

In [None]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
size_percentages = pd.read_csv("./output/size_article_percentages.csv")


# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = size_percentages["percentage_breach_articles"]
expected = size_percentages["percentage_all_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("size_percentages.csv", index=False)
print("Filtered table saved to 'size_percentages.csv'")


Chi-squared Statistic: 4.110978507505424
P-value: 0.7669098346909954
Filtered table saved to 'size_percentages.csv'


In [9]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
size_percentages = pd.read_csv("./output/size_article_percentages.csv")


# Calculate articles that do not contain data breaches
size_percentages["count_non_breach_articles"] = (
    size_percentages["count_all_articles"] - size_percentages["count_breach_articles"]
)

# Calculate percentages for non-breach articles
size_percentages["percentage_non_breach_articles"] = (
    size_percentages["count_non_breach_articles"] /
    size_percentages["count_non_breach_articles"].sum() * 100
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = size_percentages["percentage_breach_articles"]
expected = size_percentages["percentage_non_breach_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("size_percentages_with_non_breach.csv", index=False)
print("Filtered table saved to 'size_percentages_with_non_breach.csv'")


Chi-squared Statistic: 5.099781552600863
P-value: 0.6477877377168725
Filtered table saved to 'size_percentages_with_non_breach.csv'


## total employee estimate range

In [13]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
employee_percentages = pd.read_csv("./output/employees_article_percentages.csv")


# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = employee_percentages["percentage_breach_articles"]
expected = employee_percentages["percentage_all_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("employee_percentages.csv", index=False)
print("Filtered table saved to 'employee_percentages.csv'")


Chi-squared Statistic: 4.4659597981345796
P-value: 0.3466028712230791
Filtered table saved to 'employee_percentages.csv'


In [11]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
employee_percentages = pd.read_csv("./output/employees_article_percentages.csv")


# Calculate articles that do not contain data breaches
employee_percentages["count_non_breach_articles"] = (
    employee_percentages["count_all_articles"] - employee_percentages["count_breach_articles"]
)

# Calculate percentages for non-breach articles
employee_percentages["percentage_non_breach_articles"] = (
    employee_percentages["count_non_breach_articles"] /
    employee_percentages["count_non_breach_articles"].sum() * 100
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = employee_percentages["percentage_breach_articles"]
expected = employee_percentages["percentage_non_breach_articles"]

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("employee_percentages_with_non_breach.csv", index=False)
print("Filtered table saved to 'employee_percentages_with_non_breach.csv'")


Chi-squared Statistic: 5.325631909589469
P-value: 0.2554870088417494
Filtered table saved to 'employee_percentages_with_non_breach.csv'


# data breaches and articles about data breaches

## total employee estimate range

In [None]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
employee_percentages = pd.read_csv("./output/employees_breach_companies_percentages.csv")


# Observed (count_breach_articles) and Expected (count_all_articles)
observed = employee_percentages["percentage_breach_articles"]
expected = employee_percentages["percentage_all_articles"]

sum = employee_percentages["count_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum


chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)


# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
employee_percentages.to_csv("./output/breach_comp_employee_percentages.csv", index=False)



Chi-squared Statistic: 8771.553612876582
P-value: 0.0


In [81]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
employee_percentages = pd.read_csv("./output/employees_breach_companies_percentages.csv")

# Calculate articles that do not contain data breaches
employee_percentages["count_non_breach_articles"] = (
    employee_percentages["count_breach_articles"] - employee_percentages["count_all_articles"]
)

# Calculate percentages for non-breach articles
employee_percentages["percentage_non_breach_articles"] = (
    employee_percentages["count_non_breach_articles"] /
    employee_percentages["count_non_breach_articles"].sum() * 100
)


# Observed (count_breach_articles) and Expected (count_all_articles)
observed = employee_percentages["percentage_non_breach_articles"]
expected = employee_percentages["percentage_all_articles"]

sum = employee_percentages["count_non_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)


# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
employee_percentages.to_csv("./output/breach_comp_employee_percentages_distinct_sets.csv", index=False)



Chi-squared Statistic: 8934.31458529819
P-value: 0.0


## size range

In [82]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
size_percentages = pd.read_csv("./output/size_breach_companies_percentages.csv")


# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = size_percentages["percentage_breach_articles"]
expected = size_percentages["percentage_all_articles"]

sum = employee_percentages["count_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
size_percentages.to_csv("breach_comp_size_percentages.csv", index=False)
print("Filtered table saved to 'size_percentages.csv'")


Chi-squared Statistic: 5970.192855967077
P-value: 0.0
Filtered table saved to 'size_percentages.csv'


In [83]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
size_percentages = pd.read_csv("./output/size_breach_companies_percentages.csv")


# Calculate articles that do not contain data breaches
size_percentages["count_non_breach_articles"] = (
    size_percentages["count_breach_articles"] - size_percentages["count_all_articles"]
)

# Calculate percentages for non-breach articles
size_percentages["percentage_non_breach_articles"] = (
    size_percentages["count_non_breach_articles"] /
    size_percentages["count_non_breach_articles"].sum() * 100
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = size_percentages["percentage_non_breach_articles"]
expected = size_percentages["percentage_all_articles"]

sum = employee_percentages["count_non_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
size_percentages.to_csv("breach_comp_size_percentages_distinct_sets.csv", index=False)
print("Filtered table saved to 'size_percentages_with_non_breach.csv'")


Chi-squared Statistic: 6080.9730481277065
P-value: 0.0
Filtered table saved to 'size_percentages_with_non_breach.csv'


## sector

In [84]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
sector_percentages = pd.read_csv("./output/sector_breach_companies_percentages.csv")

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 5
low_percentage_mask = sector_percentages["count_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "industry": "Other",
    "count_all_articles": sector_percentages.loc[low_percentage_mask, "count_all_articles"].sum(),
    "percentage_all_articles": sector_percentages.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "count_breach_articles": sector_percentages.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "percentage_breach_articles": sector_percentages.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
filtered_sector_percentages = pd.concat(
    [sector_percentages.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = filtered_sector_percentages["percentage_breach_articles"]
expected = filtered_sector_percentages["percentage_all_articles"]

sum = employee_percentages["count_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("./output/breach_comp_sector_percentages.csv", index=False)


Chi-squared Statistic: 3808.6459587667177
P-value: 0.0


In [85]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
sector_percentages = pd.read_csv("./output/sector_breach_companies_percentages.csv")

# Calculate articles that do not contain data breaches
sector_percentages["count_non_breach_articles"] = (
    sector_percentages["count_breach_articles"] - sector_percentages["count_all_articles"]
)

# Calculate percentages for non-breach articles
sector_percentages["percentage_non_breach_articles"] = (
    sector_percentages["count_non_breach_articles"] /
    sector_percentages["count_non_breach_articles"].sum() * 100
)

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 5
low_percentage_mask = sector_percentages["count_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "industry": "Other",
    "count_all_articles": sector_percentages.loc[low_percentage_mask, "count_all_articles"].sum(),
    "count_breach_articles": sector_percentages.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "count_non_breach_articles": sector_percentages.loc[low_percentage_mask, "count_non_breach_articles"].sum(),
    "percentage_all_articles": sector_percentages.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "percentage_breach_articles": sector_percentages.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
    "percentage_non_breach_articles": sector_percentages.loc[low_percentage_mask, "percentage_non_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
filtered_sector_percentages = pd.concat(
    [sector_percentages.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = filtered_sector_percentages["percentage_non_breach_articles"]
expected = filtered_sector_percentages["percentage_all_articles"]

sum = employee_percentages["count_non_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table 
filtered_sector_percentages.to_csv("./output/breach_comp_sector_percentages_distinct_sets.csv", index=False)


Chi-squared Statistic: 3879.31869006566
P-value: 0.0


## year founded

In [86]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
year_founded_percentages = pd.read_csv("./output/year_founded_breach_companies_percentages.csv")

# Combine rows where 'percentage_all_articles' is < 0.1 into an "Other" category
threshold = 5
low_percentage_mask = year_founded_percentages["count_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "year_bin": "Other",
    "count_all_articles": year_founded_percentages.loc[low_percentage_mask, "count_all_articles"].sum(),
    "count_breach_articles": year_founded_percentages.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "percentage_all_articles": year_founded_percentages.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "percentage_breach_articles": year_founded_percentages.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
year_founded_percentages = pd.concat(
    [year_founded_percentages.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Observed (percentage_breach_articles) and Expected (percentage_all_articles)
observed = year_founded_percentages["percentage_breach_articles"]
expected = year_founded_percentages["percentage_all_articles"]

sum = employee_percentages["count_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table /breach_comp_sector_percentages_distinct_sets
year_founded_percentages.to_csv("./output/breach_comp_year_founded_percentages.csv", index=False)
print("Filtered table saved to 'filtered_year_founded_percentages.csv'")


Chi-squared Statistic: 4929.186514931415
P-value: 0.0
Filtered table saved to 'filtered_year_founded_percentages.csv'


In [87]:
import pandas as pd
from scipy.stats import chisquare

# Load the saved table
year_founded_percentages = pd.read_csv("./output/year_founded_breach_companies_percentages.csv")

# Calculate articles that do not contain data breaches
year_founded_percentages["count_non_breach_articles"] = (
    year_founded_percentages["count_breach_articles"] -year_founded_percentages["count_all_articles"]
)

# Calculate percentages for non-breach articles
year_founded_percentages["percentage_non_breach_articles"] = (
    year_founded_percentages["count_non_breach_articles"] /
    year_founded_percentages["count_non_breach_articles"].sum() * 100
)

threshold = 5
low_percentage_mask = year_founded_percentages["count_all_articles"] < threshold

# Summing rows below the threshold into an "Other" category
other_row = pd.DataFrame([{
    "year_bin": "Other",
    "count_all_articles": year_founded_percentages.loc[low_percentage_mask, "count_all_articles"].sum(),
    "count_breach_articles": year_founded_percentages.loc[low_percentage_mask, "count_breach_articles"].sum(),
    "count_non_breach_articles": year_founded_percentages.loc[low_percentage_mask, "count_non_breach_articles"].sum(),
    "percentage_all_articles": year_founded_percentages.loc[low_percentage_mask, "percentage_all_articles"].sum(),
    "percentage_breach_articles": year_founded_percentages.loc[low_percentage_mask, "percentage_breach_articles"].sum(),
    "percentage_non_breach_articles": year_founded_percentages.loc[low_percentage_mask, "percentage_non_breach_articles"].sum(),
}])

# Filter out the rows below the threshold and append the "Other" row using pd.concat
year_founded_percentages = pd.concat(
    [year_founded_percentages.loc[~low_percentage_mask], other_row],
    ignore_index=True
)

# Calculate percentages for non-breach articles
year_founded_percentages["percentage_non_breach_articles"] = (
    year_founded_percentages["count_non_breach_articles"] /
    year_founded_percentages["count_non_breach_articles"].sum() * 100
)

# Observed (percentage_breach_articles) and Expected (percentage_non_breach_articles)
observed = year_founded_percentages["percentage_non_breach_articles"]
expected = year_founded_percentages["percentage_all_articles"]

sum = employee_percentages["count_non_breach_articles"].sum()

# Ensure observed and expected have matching totals
expected = (expected / 100) * sum
observed = (observed / 100) * sum

# Perform the chi-squared goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Display results
print("Chi-squared Statistic:", chi2_stat)
print("P-value:", p_value)

# Save the processed table
filtered_sector_percentages.to_csv("year_founded_percentages_with_non_breach.csv", index=False)
print("Filtered table saved to 'year_founded_percentages_with_non_breach.csv'")


Chi-squared Statistic: 5016.174205179989
P-value: 0.0
Filtered table saved to 'year_founded_percentages_with_non_breach.csv'
