In [20]:
import pandas as pd

# Load the summerOly_athletes.csv dataset
athletes_path = './data/cleaned_summerOly_atheletes.csv'
athletes_df = pd.read_csv(athletes_path)

# Remove duplicate entries for the same athlete (Name), Year, and NOC
unique_athletes_df = athletes_df.drop_duplicates(subset=['Year', 'NOC', 'Name'])

# Group by Year and NOC to calculate the total number of unique athletes
total_athletes_feature = unique_athletes_df.groupby(['Year', 'NOC']).size()

# Convert to a DataFrame for further processing
total_athletes_feature = total_athletes_feature.reset_index(name='Total_Athletes')

# Use this feature directly in the linear regression workflow
total_athletes_feature.head()

# Identify the most frequently occurring Team for each NOC
most_common_team_per_noc = (
    athletes_df.groupby('NOC')['Team']
    .agg(lambda x: x.mode()[0])  # Mode gives the most frequent value
    .reset_index()
    .rename(columns={'Team': 'Most_Frequent_Team'})
)

#Replace NOC with its most frequent Team value
athletes_df['NOC'] = athletes_df['NOC'].map(
    most_common_team_per_noc.set_index('NOC')['Most_Frequent_Team']
)

# Remove duplicate entries for the same athlete (Name), Year, and NOC
unique_athletes_df = athletes_df.drop_duplicates(subset=['Year', 'NOC', 'Name'])

# Group by Year and NOC to calculate the total number of unique athletes
total_athletes_by_noc = unique_athletes_df.groupby(['Year', 'NOC']).size().reset_index(name='Total_Athletes')

# Save the resulting DataFrame to a CSV file
output_path = 'total_unique_athletes_by_noc.csv'  # Change path if needed
total_athletes_by_noc.to_csv(output_path, index=False)

# Display the first few rows for verification
total_athletes_by_noc.head()


Unnamed: 0,Year,NOC,Total_Athletes
0,1896,Australia,1
1,1896,Austria,3
2,1896,Denmark,3
3,1896,France,12
4,1896,Germany,19


In [28]:
import pandas as pd

# Load the summerOly_medal_counts.csv dataset
medal_counts_path = './data/summerOly_medal_counts.csv'
medal_counts_df = pd.read_csv(medal_counts_path)

# Ensure no duplicates in the original dataset
medal_counts_df = medal_counts_df.groupby(['Year', 'NOC'], as_index=False).sum()

# Generate a complete list of all unique years and NOCs
all_years = medal_counts_df['Year'].unique()
all_nocs = medal_counts_df['NOC'].unique()

# Create a DataFrame with all possible combinations of Year and NOC
all_combinations = pd.MultiIndex.from_product(
    [all_years, all_nocs], names=['Year', 'NOC']
).to_frame(index=False)

# Merge the original medal counts with the full combination DataFrame
complete_medal_counts = pd.merge(
    all_combinations,
    medal_counts_df,
    on=['Year', 'NOC'],
    how='left'
)

# Fill missing values in the medal counts with 0
complete_medal_counts.fillna({'Gold': 0, 'Silver': 0, 'Bronze': 0, 'Total': 0}, inplace=True)

# Ensure all counts are integers
complete_medal_counts[['Gold', 'Silver', 'Bronze', 'Total']] = complete_medal_counts[['Gold', 'Silver', 'Bronze', 'Total']].astype(int)

# Verify no duplicates in the resulting DataFrame
duplicates_in_merged = complete_medal_counts.duplicated(subset=['Year', 'NOC'])
print(f"Number of duplicates in merged DataFrame: {duplicates_in_merged.sum()}")

# Save the resulting DataFrame to a CSV file
output_path = 'complete_medal_counts.csv'
complete_medal_counts.to_csv(output_path, index=False)

# Display the first few rows for verification
complete_medal_counts.head()


Number of duplicates in merged DataFrame: 0


Unnamed: 0,Year,NOC,Rank,Gold,Silver,Bronze,Total
0,1896,Australia,8.0,2,0,0,2
1,1896,Austria,7.0,2,1,2,5
2,1896,Denmark,9.0,1,2,3,6
3,1896,France,4.0,5,4,2,11
4,1896,Germany,3.0,6,5,2,13


In [26]:
assert complete_medal_counts.duplicated(subset=['Year', 'NOC']).sum() == 0


In [19]:
import pandas as pd

# Load the summerOly_hosts.csv dataset
hosts_path = './data/summerOly_hosts.csv'
hosts_df = pd.read_csv(hosts_path)

# Strip any leading/trailing whitespace from column names
hosts_df.columns = hosts_df.columns.str.strip()

# Rename columns if necessary to standardize (e.g., "ï»¿Year" becomes "Year")
hosts_df.rename(columns={'ï»¿Year': 'Year'}, inplace=True)

# Extract only the country (or NOC) part from the Host column
hosts_df['host_NOC'] = hosts_df['Host'].str.extract(r',\s*(.*)')[0]

# Handle years with no host
# Add entries for years with no host
no_host_years = [1916, 1940, 1944, 2020]
for year in no_host_years:
    hosts_df = pd.concat([hosts_df, pd.DataFrame({'Year': [year], 'host_NOC': ['No Host']})])

# Replace blank values or any inconsistent entries with "No Host"
hosts_df['host_NOC'] = hosts_df['host_NOC'].fillna('No Host')
hosts_df['host_NOC'] = hosts_df['host_NOC'].replace('', 'No Host')

# Remove duplicate rows based on 'Year' to ensure one entry per year
hosts_df = hosts_df.drop_duplicates(subset=['Year'], keep='first')

# Ensure all years are included for completeness
hosts_df.sort_values(by='Year', inplace=True)

# Save the modified DataFrame to a new CSV file
output_path = 'summerOly_hosts_with_host_NOC_cleaned.csv'
hosts_df.to_csv(output_path, index=False)

# Display the resulting DataFrame for verification
hosts_df.head()


Unnamed: 0,Year,Host,host_NOC
0,1896,"Athens, Greece",Greece
1,1900,"Paris, France",France
2,1904,"St. Louis, United States",United States
3,1908,"London, United Kingdom",United Kingdom
4,1912,"Stockholm, Sweden",Sweden


In [20]:
import pandas as pd

# Load the total_unique_athletes.csv
athletes_path = 'total_unique_athletes_by_noc.csv'
total_athletes_df = pd.read_csv(athletes_path)

# Load the medal counts dataset (to get all possible NOCs and Years)
medal_counts_path = './data/summerOly_medal_counts.csv'
medal_counts_df = pd.read_csv(medal_counts_path)

# Create a complete list of all unique Years and NOCs
all_years = medal_counts_df['Year'].unique()
all_nocs = medal_counts_df['NOC'].unique()
all_combinations = pd.MultiIndex.from_product([all_years, all_nocs], names=['Year', 'NOC']).to_frame(index=False)

# Merge the total_athletes_df with the complete list of Years and NOCs
complete_total_athletes_df = pd.merge(
    all_combinations,
    total_athletes_df,
    on=['Year', 'NOC'],
    how='left'
)

# Fill missing values with 0 for countries that did not participate
complete_total_athletes_df['Total_Athletes'] = complete_total_athletes_df['Total_Athletes'].fillna(0).astype(int)

# Save the resulting DataFrame to a CSV file
output_path = 'complete_total_unique_athletes_by_noc.csv'
complete_total_athletes_df.to_csv(output_path, index=False)

# Display the first few rows for verification
complete_total_athletes_df.head()


Unnamed: 0,Year,NOC,Total_Athletes
0,1896,United States,14
1,1896,Greece,102
2,1896,Germany,19
3,1896,France,12
4,1896,Great Britain,10


In [25]:
import pandas as pd

# Load the complete total unique athletes dataset
athletes_path = 'cleaned_complete_total_unique_athletes_by_noc.csv'
total_athletes_df = pd.read_csv(athletes_path)

# Load the complete total medals dataset
medals_path = 'cleaned_complete_medal_counts.csv'
total_medals_df = pd.read_csv(medals_path)

# Keep only necessary columns from the medals dataset and rename 'Total' to 'Total_Medal_Count'
total_medals_df = total_medals_df[['Year', 'NOC', 'Total']].rename(columns={'Total': 'Total_Medal_Count'})

# Merge the two datasets on Year and NOC
merged_df = pd.merge(
    total_athletes_df,
    total_medals_df,
    on=['Year', 'NOC'],
    how='outer'
).fillna(0)

# Ensure numeric columns are of integer type after filling NaNs
numeric_columns = ['Total_Athletes', 'Total_Medal_Count']
merged_df[numeric_columns] = merged_df[numeric_columns].astype(int)

# Save the resulting concatenated DataFrame to a CSV file
output_path = 'cleaned_concatenated_athletes_medals.csv'
merged_df.to_csv(output_path, index=False)

# Display the first few rows for verification
merged_df.head()


Unnamed: 0,index,Year,NOC,Total_Athletes,Total_Medal_Count
0,0.0,1896,Afghanistan,0,0
1,1.0,1896,Albania,0,0
2,2.0,1896,Algeria,0,0
3,3.0,1896,American Samoa,0,0
4,4.0,1896,Andorra,0,0


In [27]:
import pandas as pd

# Load the concatenated athletes and medals dataset
athletes_medals_path = 'cleaned_concatenated_athletes_medals.csv'
merged_df = pd.read_csv(athletes_medals_path)

# Load the cleaned hosts dataset
hosts_path = 'summerOly_hosts_with_host_NOC_cleaned.csv'
hosts_df = pd.read_csv(hosts_path)

# Merge the hosts dataset to include host_NOC
merged_with_hosts = pd.merge(
    merged_df,
    hosts_df[['Year', 'host_NOC']],
    on='Year',
    how='left'
)

# Create the host_status feature
merged_with_hosts['host_status'] = (merged_with_hosts['host_NOC'] == merged_with_hosts['NOC']).astype(int)

# Handle missing values
merged_with_hosts['Total_Athletes'] = merged_with_hosts['Total_Athletes'].fillna(0).astype(int)
merged_with_hosts['Total_Medal_Count'] = merged_with_hosts['Total_Medal_Count'].fillna(0).astype(int)
merged_with_hosts['host_status'] = merged_with_hosts['host_status'].fillna(0).astype(int)


# Drop the host_NOC column as it's no longer needed
merged_with_hosts.drop(columns=['host_NOC'], inplace=True)

# Save the resulting DataFrame to a CSV file
output_path = 'cleaned_concatenated_with_host_status.csv'
merged_with_hosts.to_csv(output_path, index=False)

# Display the first few rows for verification
merged_with_hosts.head()


Unnamed: 0,index,Year,NOC,Total_Athletes,Total_Medal_Count,host_status
0,0.0,1896,Afghanistan,0,0,0
1,1.0,1896,Albania,0,0,0
2,2.0,1896,Algeria,0,0,0
3,3.0,1896,American Samoa,0,0,0
4,4.0,1896,Andorra,0,0,0


In [22]:
import pandas as pd

# Load the dataset
medal_counts_path = './data/cleaned_summerOly_medal_count.csv'
medal_counts_df = pd.read_csv(medal_counts_path)

# Normalize and deduplicate the dataset
medal_counts_df['NOC'] = medal_counts_df['NOC'].str.strip().str.title()
medal_counts_df = medal_counts_df.drop_duplicates(subset=['Year', 'NOC'])

# Group by Year and NOC, summing only the relevant columns
medal_counts_df = medal_counts_df.groupby(['Year', 'NOC'], as_index=False)[['Gold', 'Silver', 'Bronze', 'Total']].sum()

# Generate all unique years and NOCs
all_years = medal_counts_df['Year'].drop_duplicates()
all_nocs = medal_counts_df['NOC'].drop_duplicates()

# Create all combinations of Year and NOC
all_combinations = pd.MultiIndex.from_product(
    [all_years, all_nocs], names=['Year', 'NOC']
).to_frame(index=False)

# Merge the full combinations with the original dataset
complete_medal_counts = pd.merge(
    all_combinations,
    medal_counts_df,
    on=['Year', 'NOC'],
    how='left'
)

# For non-participating countries, fill missing medal counts with zeros
complete_medal_counts.fillna({'Gold': 0, 'Silver': 0, 'Bronze': 0, 'Total': 0}, inplace=True)

# Ensure all counts are integers
complete_medal_counts[['Gold', 'Silver', 'Bronze', 'Total']] = complete_medal_counts[['Gold', 'Silver', 'Bronze', 'Total']].astype(int)

# Drop unintended duplicates after merging
complete_medal_counts = complete_medal_counts.drop_duplicates(subset=['Year', 'NOC'])

# Save the resulting dataset
output_path = 'cleaned_complete_medal_counts.csv'
complete_medal_counts.to_csv(output_path, index=False)

# Verify the result
print(f"Number of duplicates in final DataFrame: {complete_medal_counts.duplicated(subset=['Year', 'NOC']).sum()}")
complete_medal_counts.head()


Number of duplicates in final DataFrame: 0


Unnamed: 0,Year,NOC,Gold,Silver,Bronze,Total
0,1896,Australia,2,0,0,2
1,1896,Austria,2,1,2,5
2,1896,Denmark,1,2,3,6
3,1896,France,5,4,2,11
4,1896,Germany,6,5,2,13


In [23]:
import pandas as pd

# Load the total_unique_athletes.csv dataset
athletes_path = 'total_unique_athletes_by_noc.csv'
total_athletes_df = pd.read_csv(athletes_path)

# Normalize and deduplicate the dataset
total_athletes_df['NOC'] = total_athletes_df['NOC'].str.strip()


total_athletes_df = total_athletes_df.groupby(['Year', 'NOC'], as_index=False).sum()

total_athletes_df[total_athletes_df['NOC'] == 'Argentina']

# # Get all unique years and NOCs
all_years = total_athletes_df['Year'].unique()
all_nocs = total_athletes_df['NOC'].unique()

# # Create a MultiIndex with all combinations
all_combinations = pd.MultiIndex.from_product([all_years, all_nocs], names=['Year', 'NOC'])

# # Reindex the DataFrame using the MultiIndex to create all combinations
complete_total_athletes = total_athletes_df.set_index(['Year', 'NOC']).reindex(all_combinations).reset_index()

# # Fill NaN values with 0 and convert to integer
complete_total_athletes['Total_Athletes'] = complete_total_athletes['Total_Athletes'].fillna(0).astype(int)

# # Sort the data
complete_total_athletes = complete_total_athletes.sort_values(['Year', 'NOC']).reset_index(drop=True)

# Verify the result
# print(f"Number of duplicates in final DataFrame: {complete_total_athletes.duplicated(subset=['Year', 'NOC']).sum()}")
# print(complete_total_athletes[complete_total_athletes['NOC'] == 'Argentina'].head(20))

complete_total_athletes = complete_total_athletes.reset_index()

complete_total_athletes[complete_total_athletes['NOC'] == 'Argentina']

complete_total_athletes.to_csv("cleaned_complete_total_unique_athletes_by_noc.csv", index=False)

In [46]:
df = pd.read_csv("complete_total_unique_athletes_by_noc.csv")
print(df[df['Year'] == 1896].head())


   Year             NOC    0  Total_Athletes
0  1896     Afghanistan  0.0               0
1  1896             Ain  0.0               0
2  1896         Albania  0.0               0
3  1896         Algeria  0.0               0
4  1896  American Samoa  0.0               0


In [52]:
print(complete_total_athletes.head(40))



    Year                     NOC  Total_Athletes
0   1896             Afghanistan               0
1   1896                     Ain               0
2   1896                 Albania               0
3   1896                 Algeria               0
4   1896          American Samoa               0
5   1896                 Andorra               0
6   1896                  Angola               0
7   1896     Antigua And Barbuda               0
8   1896               Argentina               0
9   1896                 Armenia               0
10  1896                   Aruba               0
11  1896             Australasia               0
12  1896               Australia               1
13  1896                 Austria               3
14  1896              Azerbaijan               0
15  1896                 Bahamas               0
16  1896                 Bahrain               0
17  1896              Bangladesh               0
18  1896                Barbados               0
19  1896            

In [53]:
pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m[31m1.8 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import chardet

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:  # Open in binary mode
        rawdata = f.read()
    result = chardet.detect(rawdata)
    return result['encoding']

athletes_path = './data/summerOly_medal_counts.csv'
detected_encoding = detect_encoding(athletes_path)
print(f"Detected encoding: {detected_encoding}")

Detected encoding: utf-8


In [3]:
import pandas as pd

athletes_path = 'total_unique_athletes_by_noc.csv'
encodings_to_try = [ 'latin-1', 'cp1252', 'ISO-8859-1']  # Common encodings

for encoding in encodings_to_try:
    try:
        total_athletes_df = pd.read_csv(athletes_path, encoding=encoding, dtype={'Year': int})
        print(f"File successfully read using {encoding} encoding.")
        break  # Exit the loop if successful
    except UnicodeDecodeError:
        print(f"Failed to decode with {encoding}. Trying next encoding...")
else:  # This 'else' is executed if the loop completes without a 'break'
    raise UnicodeDecodeError("Could not decode file with any of the tested encodings.")


File successfully read using latin-1 encoding.


In [28]:
import os
import shutil
import re

def move_cleaned_files(source_dir, destination_dir):
    """
    Moves all files containing the word "cleaned" (case-insensitive)
    from the source directory to the destination directory.

    Args:
        source_dir: The path to the source directory.
        destination_dir: The path to the destination directory.
    """

    try:
        # Create the destination directory if it doesn't exist
        os.makedirs(destination_dir, exist_ok=True)

        for filename in os.listdir(source_dir):
            if re.search(r"cleaned", filename, re.IGNORECASE):  # Case-insensitive search
                source_path = os.path.join(source_dir, filename)
                destination_path = os.path.join(destination_dir, filename)

                # Move the file
                shutil.move(source_path, destination_path)
                print(f"Moved: {filename} to {destination_dir}")

    except FileNotFoundError:
        print(f"Error: Source directory '{source_dir}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    source_directory = "."  # Current directory (you can change this)
    destination_directory = "model_testing"

    move_cleaned_files(source_directory, destination_directory)
    print("Finished moving files.")

Moved: cleaned_concatenated_with_host_status.csv to model_testing
Moved: cleaned_concatenated_athletes_medals.csv to model_testing
Moved: cleaned_complete_total_unique_athletes_by_noc.csv to model_testing
Moved: summerOly_hosts_with_host_NOC_cleaned.csv to model_testing
Moved: cleaned_complete_medal_counts.csv to model_testing
Finished moving files.
