In [5]:
import pandas as pd
import os

# Folder path containing CSV files
folder_path = "../../data/run_1"

# List to hold data from each CSV file
data_frames = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        df = df.dropna(subset=['CompanyMentioned'], how='any')
        
        # Filter rows where 'BreachMentioned' column has is not false
        filtered_df = df[~df['CompanyMentioned'].isin(["flase", "False"])]
        
        # Append the filtered DataFrame to the list
        data_frames.append(filtered_df)

# Concatenate all filtered DataFrames
combined_df = pd.concat(data_frames, ignore_index=True)

df_deduped = combined_df.loc[combined_df.groupby('URL')['Date'].idxmin()]
df_deduped = df_deduped.loc[df_deduped.groupby(['Headline', 'Publication'])['Date'].idxmin()]

df_deduped = df_deduped.drop_duplicates(subset=['URL'])
df_deduped = df_deduped.drop_duplicates(subset=['Headline', 'Publication'])

# Save the final concatenated DataFrame to a new CSV
df_deduped.to_csv("../../data/analysis/articles_with_company_name.csv", index=False)


In [6]:
import re 

companies_data = pd.read_csv("../../data/companies.csv")
articles_data = pd.read_csv("../../data/analysis/articles_with_company_name.csv")

def preprocess(text):
    # Remove punctuation and convert to lowercase
    if pd.isna(text):
        return ""
    return re.sub(r'[^\w\s]', '', text).lower()

# Preprocess your data
companies_data['name_processed'] = companies_data['name'].apply(preprocess)
articles_data['CompanyMentioned_processed'] = articles_data['CompanyMentioned'].apply(preprocess)

# Perform the join on the lowercase columns
result_df = pd.merge(companies_data, articles_data, left_on='name_processed', right_on='CompanyMentioned_processed', how='inner')

# Drop the helper lowercase columns if desired
result_df = result_df.drop(columns=['name_processed', 'CompanyMentioned_processed'])

# Sort by 'name' (from companies_data)
result_df.sort_values(by=['CompanyMentioned', 'name']).drop_duplicates(subset=['Headline', 'URL'], inplace=True)

result_df.to_csv("../../data/analysis/joined_articles_companies.csv", index=False)

In [7]:
matched = pd.read_csv("../../data/analysis/joined_articles_companies.csv")

# drop duplicate rows by Headline and URL
df_dedup = matched.loc[matched.groupby(['Headline', 'URL'])['total employee estimate'].idxmax()].reset_index(drop=True)

df_dedup.to_csv("../../data/analysis/joined_articles_companies_no_duplicates.csv", index=False)

In [10]:
import pandas as pd
df = pd.read_csv("../../data/analysis/joined_articles_companies_no_duplicates.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})

df.to_csv("../../data/analysis/joined_articles_companies_no_duplicates.csv", index=False)

KeyError: 'sector'

In [11]:
import pandas as pd
df = pd.read_csv("../../data/article_company_breach/article_count_per_breach.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})

df.to_csv("../../data/article_company_breach/article_count_per_breach.csv", index=False)

In [12]:
import pandas as pd
df = pd.read_csv("../../data/article_company_breach/left_joined_articles_company_breaches_no_duplicates.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})

df.to_csv("../../data/article_company_breach/left_joined_articles_company_breaches_no_duplicates.csv", index=False)

In [13]:
import pandas as pd
df = pd.read_csv("../../data/article_company_breach/left_joined_breaches_companies_no_duplicates.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})

df.to_csv("../../data/article_company_breach/left_joined_breaches_companies_no_duplicates.csv", index=False)

In [14]:
import pandas as pd
df = pd.read_csv("../../data/article_data/left_joined_breaches_companies_no_duplicates.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})

df.to_csv("../../data/article_company_breach/left_joined_breaches_companies_no_duplicates.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/article_data/left_joined_breaches_companies_no_duplicates.csv'

In [20]:
df = pd.read_csv("../../data/article_company_breach/left_joined_articles_company_breaches_no_duplicates.csv")

# Get the number of unique values in the 'Name' column
grouped_counts = df.groupby(['organisation', 'records lost', 'year']).size().reset_index(name='row_count')

print("Number of unique names:", grouped_counts)

Number of unique names:     organisation  records lost  year  row_count
0        "Apple"      12367232  2012         14
1          500px      14800000  2019          1
2           8fit      20000000  2019          1
3            AOL       2400000  2014          4
4           AT&T        114000  2010          3
..           ...           ...   ...        ...
375       Zomato      17000000  2017          1
376         Zoom        500000  2020         11
377     db8151dd      22000000  2020          1
378    ssndob.ms       4000000  2013          1
379    uTorrent          35000  2016          1

[380 rows x 4 columns]


In [16]:


import pandas as pd
df = pd.read_csv("../../data/breaches/breaches_information.csv")

df['sector'] = df['sector'].replace({'web ': 'web', 'tech, web': 'web, tech'})
#print(df['industry'].unique())

df.to_csv("../../data/breaches/breaches_information.csv", index=False)