# Source from kagglehub

In [1]:
import os
import pandas as pd
import kagglehub

# Download the dataset
path = kagglehub.dataset_download("hussainsheikh03/nlp-based-cyber-security-dataset")

# List all files and subdirectories in the directory
files_and_subdirectories = os.listdir(path)
csv_file_path = os.path.join(path, files_and_subdirectories[0])
df_cybersecurity = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
df_cybersecurity.head()

Unnamed: 0,Threat Category,IOCs (Indicators of Compromise),Threat Actor,Attack Vector,Geographical Location,Sentiment in Forums,Severity Score,Predicted Threat Category,Suggested Defense Mechanism,Risk Level Prediction,Cleaned Threat Description,Keyword Extraction,Named Entities (NER),Topic Modeling Labels,Word Count
0,DDoS,['123.456.78.9'],APT-28,Email,North Korea,0.99,5,DDoS,Increase Web Security,4,ransomware attack through network vulnerability,"['malware', 'email', 'attachment']","['CompanyY', 'Malware']",Malware,47
1,Malware,"['10.0.0.2', 'infected.exe']",Lazarus Group,Network,USA,0.97,1,DDoS,Quarantine,4,phishing email with malicious link,"['DDoS', 'website', 'attack']","['Unknown', 'Phishing Service']",Phishing,31
2,Phishing,"['malwarehash123', 'trojan.com']",APT-28,Email,North Korea,0.87,3,Phishing,Quarantine,2,ransomware attack through network vulnerability,"['phishing', 'corporate', 'scam']","['CompanyY', 'Malware']",Malware,24
3,DDoS,"['192.168.1.1', 'malicious.com']",Lazarus Group,Web,Global,0.64,2,Phishing,Quarantine,1,phishing email with malicious link,"['DDoS', 'website', 'attack']","['CompanyY', 'Malware']",DDoS,46
4,DDoS,"['malwarehash123', 'trojan.com']",Unknown,Email,Germany,0.57,1,DDoS,Patch Vulnerability,1,phishing scam targeting corporate accounts,"['DDoS', 'website', 'attack']","['Lazarus Group', 'Network Device']",DDoS,24


# Source from Wb

In [3]:
import wbdata

country_name_mapping = {
    "USA": "US",
    "Russia": "RU",
    "Germany": "DE",
    "North Korea": "KP",
    "Global": "1W",

    'United States': 'US',
    'Russian Federation': 'RU',
    'Germany': 'DE',
    "Korea, Dem. People's Rep.": 'KP',
    'World': '1W'
}
    

indicator = {
    "NY.GDP.PCAP.CD": "GDP_per_capita",
}

df_cybersecurity['Standardized Country'] = df_cybersecurity['Geographical Location'].map(lambda x: country_name_mapping.get(x, {}))
countries = df_cybersecurity['Standardized Country'].unique()
data_countries = wbdata.get_dataframe(indicator, country=countries)
data_countries['GDP_per_capita'] = data_countries['GDP_per_capita'].fillna(0)
max_gdp_per_year = data_countries.groupby('country')['GDP_per_capita'].max().reset_index()
max_gdp_per_year.rename(columns={'country': 'Country Name'}, inplace=True)
max_gdp_per_year['country'] = max_gdp_per_year['Country Name'].map(lambda x: country_name_mapping.get(x, {}))

max_gdp_per_year

# Function to obtain income levels
def get_income_levels(selected_countries):
    # Obtener todos los países desde la API
    countries_list = list(wbdata.get_countries())
    
    # Create an income level dictionary
    income_levels = {}
    for country in countries_list:
        iso2Code = country['iso2Code']
        incomeLevel = country['incomeLevel']['value']
        income_levels[iso2Code] = incomeLevel
    
    # Filter the selected countries
    filtered_income_levels = {
        country: income_levels.get(country, "Not Available")
        for country in selected_countries
    }
    
    #Convert to DataFrame for better display
    return pd.DataFrame(
        list(filtered_income_levels.items()), 
        columns=["country", "Income Level"]
    )

income_levels_df = get_income_levels(countries)


result = pd.merge(max_gdp_per_year, income_levels_df, on='country', how='inner')

result


Unnamed: 0,Country Name,GDP_per_capita,country,Income Level
0,Germany,52745.755706,DE,High income
1,"Korea, Dem. People's Rep.",0.0,KP,Low income
2,Russian Federation,15941.448242,RU,High income
3,United States,81695.187071,US,High income
4,World,13138.327546,1W,Aggregates


# Correlation Analysis

### Grouping the data 

In [6]:
grouped_cybersecurity = df_cybersecurity.groupby('Standardized Country').agg({
    'Severity Score': 'mean',
    'Risk Level Prediction': 'mean',
    'Attack Vector': 'count'
}).reset_index()

final_data = grouped_cybersecurity.merge(result, left_on='Standardized Country', right_on='country')

final_data


Unnamed: 0,Standardized Country,Severity Score,Risk Level Prediction,Attack Vector,Country Name,GDP_per_capita,country,Income Level
0,1W,2.906383,2.931915,235,World,13138.327546,1W,Aggregates
1,DE,3.022422,2.950673,223,Germany,52745.755706,DE,High income
2,KP,3.033493,2.808612,209,"Korea, Dem. People's Rep.",0.0,KP,Low income
3,RU,3.137441,2.895735,211,Russian Federation,15941.448242,RU,High income
4,US,2.864865,2.923423,222,United States,81695.187071,US,High income


### Correlation analysis

In [29]:
# Correlation between variables
correlation_gdp_severity = final_data['GDP_per_capita'].corr(final_data['Severity Score'])
correlation_income_severity = final_data['Income Level'].apply(lambda x: 1 if x == 'High income' else 0).corr(final_data['Severity Score'])
correlation_gdp_risk = final_data['GDP_per_capita'].corr(final_data['Risk Level Prediction'])
correlation_income_risk = final_data['Income Level'].apply(lambda x: 1 if x == 'High income' else 0).corr(final_data['Risk Level Prediction'])


In [31]:
correlation_gdp_severity

-0.5284244821417033

In [33]:
correlation_income_severity

0.19295577181792917

In [35]:
correlation_gdp_risk

0.611213570776033

In [37]:
correlation_income_risk

0.5197675213852708

In [57]:
from sklearn.linear_model import LinearRegression

# Preparing the data
X = final_data[['Income Level','Risk Level Prediction']]
# Convertimos Income Level a una variable binaria si es necesario
X['Income Level'] = X['Income Level'].apply(lambda x: 1 if x == 'High income' else 0)
y = final_data['Severity Score']

# Create and train the model
model = LinearRegression()
model.fit(X, y)

# coefficients
print(f"coefficients: {model.coef_}")


coefficients: [ 0.10246969 -1.21036228]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Income Level'] = X['Income Level'].apply(lambda x: 1 if x == 'High income' else 0)
