In [1]:
import requests
import pandas as pd
from joblib import Parallel, delayed
import time
import networkx as nx
from bs4 import BeautifulSoup
import pycountry
import re
from itertools import product
import ast 
import os
import sys

In [2]:
base_dir = '/home/reutme/Big_data/final_project'
code_dir = os.path.join(base_dir, 'code')
sys.path.append(code_dir)
import const
import params

## Wikipedia

In [38]:

# Fetch the Wikipedia page URL
url = "https://en.wikipedia.org/wiki/List_of_wars_by_death_toll"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the first wikitable
tables = soup.find_all('table', {'class': 'wikitable'})
war_table = tables[0]  # First table contains the war data

# Extract rows from the table
rows = war_table.find_all('tr')
data = []

# Loop through rows and extract required information
for row in rows[1:]:  # Skip the header
    cols = row.find_all('td')
    if len(cols) >= 5:  # Ensure enough columns exist
        war_name = cols[0].text.strip()
        death_range = cols[1].text.strip()
        date = cols[2].text.strip()
        combatants = cols[3].text.strip()
        location = cols[4].text.strip()
        data.append([date, war_name, combatants, location])

# Create DataFrame
df = pd.DataFrame(data, columns=["Date", "War Name", "Combatants", "Location"])

# Display DataFrame
print(df.shape)
df.head()

(119, 5)


Unnamed: 0,Date,War Name,Combatants,Location,Country Codes
0,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,
1,1207–1405,Mongol invasions and conquests,Mongol Empire vs. various states in Eurasia,Asia and Europe,
2,220–280,Three Kingdoms,Multiple sides,China,
3,1850–1864,Taiping Rebellion,Qing Dynasty vs. Taiping Heavenly Kingdom,China,
4,1914–1918,World War I,Allied Powers vs. Central Powers,Global,


In [43]:
def parse_years(date_str):
    years = []
    ranges = re.findall(r'(\d{4})(?:-(\d{4}))?', date_str)
    if ranges:
        if len(ranges) > 1:
            start = int(ranges[0][0]) 
            end = int(ranges[1][0])
            year_start = min(start, end)
            year_end = max(start, end)
            years = [year for year in range(int(year_start), int(year_end)+1)]
        else:
            years = [int(ranges[0][0])]

    else: 
        years = []

    return years

# Parse years and filter rows for wars after 1900
df['Date'].apply(parse_years)

0             [1939, 1940, 1941, 1942, 1943, 1944, 1945]
1      [1207, 1208, 1209, 1210, 1211, 1212, 1213, 121...
2                                                     []
3      [1850, 1851, 1852, 1853, 1854, 1855, 1856, 185...
4                         [1914, 1915, 1916, 1917, 1918]
                             ...                        
114                                         [1904, 1905]
115                                               [1948]
116                                               [2023]
117    [1992, 1993, 1994, 1995, 1996, 1997, 1998, 199...
118    [1975, 1976, 1977, 1978, 1979, 1980, 1981, 198...
Name: Date, Length: 119, dtype: object

In [3]:
df = pd.read_csv(os.path.join(const.data_base_dir,"war_table.csv"))

In [44]:
df['List of War Years'] = df['Date'].apply(parse_years)

In [4]:
df.head()

Unnamed: 0,Date,War Name,Combatants,Location,List of War Years
0,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"[1939, 1940, 1941, 1942, 1943, 1944, 1945]"
1,1207–1405,Mongol invasions and conquests,Mongol Empire vs. various states in Eurasia,Asia and Europe,"[1207, 1208, 1209, 1210, 1211, 1212, 1213, 121..."
2,220–280,Three Kingdoms,Multiple sides,China,[]
3,1850–1864,Taiping Rebellion,Qing Dynasty vs. Taiping Heavenly Kingdom,China,"[1850, 1851, 1852, 1853, 1854, 1855, 1856, 185..."
4,1914–1918,World War I,Allied Powers vs. Central Powers,Global,"[1914, 1915, 1916, 1917, 1918]"


In [49]:
wars_data_file = os.path.join(const.data_base_dir,"war_table.csv")
df.to_csv(wars_data_file, index=False)

### Preprocess the wars participants data frame

In [50]:
wars_df = pd.read_csv(wars_data_file)

First I need to save the wars name as txt file

In [None]:
# Specify the file name
file_name = "war_names.txt"

# Open the file in write mode and write each item on a new line
with open(file_name, "w") as file:
    for item in wars_df['War Name'].to_list():
        file.write(f"{item}\n")

print(f"The list has been successfully exported to {file_name}.")

Based on the wars names I extracted war countries participants using GPT4o

In [83]:
original_participants_war_file = os.path.join(const.data_base_dir,"complete_war_names_and_countries.csv")

In [84]:
war_participants_df = pd.read_csv(original_participants_war_file)

In [85]:
war_participants_df.head(2)

Unnamed: 0,War Name,Countries Participants
0,1. **World War II,"United States, Soviet Union, United Kingdom, C..."
1,2. **Mongol invasions and conquests,"Mongolia, China, Russia."


clean the data frame

In [86]:
# Clean the War Name column
war_participants_df['War Name'] = war_participants_df['War Name'].str.replace('^\d+\.\s*\*+\s*', '', regex=True)
# Remove trailing periods from Countries Participants
war_participants_df['Countries Participants'] = war_participants_df['Countries Participants'].str.rstrip('.')

In [89]:
print(war_participants_df.shape)
war_participants_df.head(2)


(118, 2)


Unnamed: 0,War Name,Countries Participants
0,World War II,"United States, Soviet Union, United Kingdom, C..."
1,Mongol invasions and conquests,"Mongolia, China, Russia"


In [90]:
participants_war_file = os.path.join(const.data_base_dir, "process_war_names_and_countries.csv")
war_participants_df.to_csv(participants_war_file, index=False)

### Merge war data frame

In [2]:
participants_war_file =  os.path.join(const.data_base_dir, "process_war_names_and_countries.csv")
wars_data_file = os.path.join(const.data_base_dir, "war_table.csv")

war_participants_df = pd.read_csv(participants_war_file)
wars_df = pd.read_csv(wars_data_file)

In [3]:
print(war_participants_df.head(2))
print("\n\n\n")
print(wars_df.head(2))

                         War Name  \
0                    World War II   
1  Mongol invasions and conquests   

                              Countries Participants  
0  United States, Soviet Union, United Kingdom, C...  
1                            Mongolia, China, Russia  




        Date                        War Name  \
0  1939–1945                    World War II   
1  1207–1405  Mongol invasions and conquests   

                                    Combatants         Location  \
0                Allied Powers vs. Axis Powers           Global   
1  Mongol Empire vs. various states in Eurasia  Asia and Europe   

                                   List of War Years  
0         [1939, 1940, 1941, 1942, 1943, 1944, 1945]  
1  [1207, 1208, 1209, 1210, 1211, 1212, 1213, 121...  


In [4]:
# Merge the datasets based on the War Name
war_merged_data = pd.merge(wars_df, war_participants_df, on='War Name', how='left')

In [5]:
war_merged_data = war_merged_data.dropna()

### Add countries participants country code

#### Check countries without country code

In [6]:
# Function to convert participant names to country codes
def get_country_codes(idx, participants):
    countries = []
    nan_country_code = []
    for country_name in participants.split(","):
        country_name = country_name.strip()
        country = pycountry.countries.get(name=country_name)
        if country:
            countries.append(country.alpha_3)
        else:
            print(idx, country_name)
            nan_country_code.append(country_name)
    return ", ".join(countries), nan_country_code

In [7]:
countries_code_list = []
all_nan_country_code = []
for idx, row in war_merged_data.iterrows():
    countries_code, nan_country_code = get_country_codes(idx, row["Countries Participants"])
    all_nan_country_code.extend(nan_country_code)


0 Soviet Union
1 Russia
4 Russia
4 Turkey
6 Iran
11 Russia
13 England
14 Britain
14 Prussia
14 Russia
19 Vietnam
21 Russia
23 North Korea
23 South Korea
24 England
25 Russia
31 Russia
33 Vietnam
36 Turkey
38 Britain
38 Prussia
38 Russia
43 Britain
45 Korea
51 Vietnam
55 Britain
55 Prussia
56 Iran
57 Britain
58 Russia
58 Britain
58 Turkey
59 Syria
62 Turkey
64 Russia
68 Tanzania
73 Turkey
77 Vietnam
80 Vietnam
87 Turkey
90 Korea
96 Russia
101 Turkey
104 Russia
107 Iran
114 Russia
115 Syria


In [8]:
set(all_nan_country_code)

{'Britain',
 'England',
 'Iran',
 'Korea',
 'North Korea',
 'Prussia',
 'Russia',
 'South Korea',
 'Soviet Union',
 'Syria',
 'Tanzania',
 'Turkey',
 'Vietnam'}

Create seperate dictionary for countries that there is no country code using pycountry

In [9]:
# Custom mapping for countries not directly in pycountry
custom_country_codes_3_alpha = {
    "Britain": "GBR",
    "England": "GBR",
    "Iran": "IRN",
    "Korea": "PRK",
    "North Korea": "PRK",
    "Prussia": "DEU",  # Part of modern Germany
    "Russia": "RUS",
    "South Korea": "KOR",
    "Soviet Union": "SUN",  # Historical
    "Syria": "SYR",
    "Tanzania": "TZA",
    "Turkey": "TUR",
    "Vietnam": "VNM"
}

custom_country_codes_2_alpha = {
    "Britain": "GB",
    "England": "GB",
    "Iran": "IR",
    "Korea": "PR",
    "North Korea": "PR",
    "Prussia": "DE",  # Part of modern Germany
    "Russia": "RU",
    "South Korea": "KO",
    "Soviet Union": "SU",  # Historical
    "Syria": "SY",
    "Tanzania": "TZ",
    "Turkey": "TU",
    "Vietnam": "VN"
}

#### Update the wars df with the participants countries code

In [10]:
# Function to convert participant names to country codes
def get_country_codes(participants):
    countries = []
    for country_name in participants.split(","):
        country_name = country_name.strip()
        country = pycountry.countries.get(name=country_name)
        if country:
            countries.append(country.alpha_2)
        else: 
            countries.append(custom_country_codes_2_alpha[country_name])
    return ", ".join(countries)

In [11]:
war_merged_data.loc[:, "Country Codes"] = war_merged_data["Countries Participants"].apply(get_country_codes)


In [12]:
war_merged_data.head(10)

Unnamed: 0,Date,War Name,Combatants,Location,List of War Years,Countries Participants,Country Codes
0,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"[1939, 1940, 1941, 1942, 1943, 1944, 1945]","United States, Soviet Union, United Kingdom, C...","US, SU, GB, CN, FR, DE, IT, JP"
1,1207–1405,Mongol invasions and conquests,Mongol Empire vs. various states in Eurasia,Asia and Europe,"[1207, 1208, 1209, 1210, 1211, 1212, 1213, 121...","Mongolia, China, Russia","MN, CN, RU"
2,220–280,Three Kingdoms,Multiple sides,China,[],China,CN
3,1850–1864,Taiping Rebellion,Qing Dynasty vs. Taiping Heavenly Kingdom,China,"[1850, 1851, 1852, 1853, 1854, 1855, 1856, 185...",China,CN
4,1914–1918,World War I,Allied Powers vs. Central Powers,Global,"[1914, 1915, 1916, 1917, 1918]","France, Russia, United Kingdom, Italy, United ...","FR, RU, GB, IT, US, DE, AT, TU, BG"
5,1618–1683,Manchu Conquest of China,Manchu vs. Ming Dynasty,China,"[1618, 1619, 1620, 1621, 1622, 1623, 1624, 162...",China,CN
6,1369–1405,Conquests of Timur,Timurid Empire vs. various states in Asia,"Central Asia, West Asia, and South Asia","[1369, 1370, 1371, 1372, 1373, 1374, 1375, 137...","Uzbekistan, Iran, India","UZ, IR, IN"
8,1618–1648,Thirty Years' War,Anti-Imperial Alliance vs. Imperial Alliance,Europe,"[1618, 1619, 1620, 1621, 1622, 1623, 1624, 162...","France, Spain, Sweden, Germany","FR, ES, SE, DE"
9,1519–1530,Spanish conquest of Mexico,Spanish Empire and allies vs. Aztec Empire and...,Mexico,"[1519, 1520, 1521, 1522, 1523, 1524, 1525, 152...","Spain, Mexico","ES, MX"
10,1533–1572,Spanish conquest of the Inca Empire,Spanish Empire vs. Inca Empire,South America,"[1533, 1534, 1535, 1536, 1537, 1538, 1539, 154...","Spain, Peru","ES, PE"


In [154]:
merged_wars_data_file = os.path.join(const.data_base_dir, "merged_war_data.csv")
war_merged_data.to_csv(merged_wars_data_file, index=False)

#### Expand the data frame for each combination of Country Codes and List of War Years 

In [21]:
merged_wars_data_file = merged_wars_data_file = os.path.join(const.data_base_dir, "merged_war_data.csv")
war_merged_data = pd.read_csv(merged_wars_data_file)

In [22]:
def expand_dataframe(row):
    years = row["List of War Years"]
    codes = row["Country Codes"]
    min_year = min(years) if years else None  # Compute the minimum year or None if the list is empty
    max_year = max(years) if years else None  # Compute the maximum year or None if the list is empty
    return pd.DataFrame(
        list(product(years, codes)), columns=["Year", "Country Code"]
    ).assign(Min_Year=min_year, Max_Year=max_year, **row.drop(["List of War Years", "Country Codes"]))

# Apply the expansion and concatenate results
war_merged_data['List of War Years'] = war_merged_data['List of War Years'].apply(ast.literal_eval)
war_merged_data['Country Codes'] = war_merged_data['Country Codes'].apply(lambda x: x.split(', '))
expanded_df = pd.concat(war_merged_data.apply(expand_dataframe, axis=1).to_list(), ignore_index=True)

In [23]:
expanded_df.head()

Unnamed: 0,Year,Country Code,Min_Year,Max_Year,Date,War Name,Combatants,Location,Countries Participants
0,1939,US,1939,1945,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"United States, Soviet Union, United Kingdom, C..."
1,1939,SU,1939,1945,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"United States, Soviet Union, United Kingdom, C..."
2,1939,GB,1939,1945,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"United States, Soviet Union, United Kingdom, C..."
3,1939,CN,1939,1945,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"United States, Soviet Union, United Kingdom, C..."
4,1939,FR,1939,1945,1939–1945,World War II,Allied Powers vs. Axis Powers,Global,"United States, Soviet Union, United Kingdom, C..."


In [24]:
merged_wars_data_file = os.path.join(const.data_base_dir, "merged_war_data_after_expand.csv")
expanded_df.to_csv(merged_wars_data_file, index=False)