# Convert each of the XLS files to CSV first

In [3]:
import os
import pandas as pd
from pathlib import Path

def convert_xls_to_csv(input_folder, output_folder):
    """Convert every ``.xls`` workbook in *input_folder* to a CSV file.

    Each matching file is read with ``pd.read_excel`` using its default
    arguments and written to *output_folder* under the same base name with
    a ``.csv`` extension, without the DataFrame index. Matching is
    case-sensitive and non-recursive; ``.xlsx`` files are not matched.

    Args:
        input_folder: Directory that is scanned for ``.xls`` files.
        output_folder: Directory that receives the CSV files. It is created
            (including missing parents) if it does not exist.

    Returns:
        None. One status line per matching file is printed to stdout.

    Raises:
        OSError: If *input_folder* cannot be listed or *output_folder*
            cannot be created. Failures on an individual file are caught,
            reported on stdout, and do not stop the batch.
    """
    suffix = '.xls'
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order and the printed log reproducible between runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(suffix):
            continue

        input_path = os.path.join(input_folder, filename)
        # Strip only the trailing extension. str.replace() would also
        # rewrite any earlier '.xls' that happens to occur inside the name.
        csv_name = filename[:-len(suffix)] + '.csv'
        output_path = os.path.join(output_folder, csv_name)

        try:
            df = pd.read_excel(input_path)
            df.to_csv(output_path, index=False)
        except Exception as e:
            # Deliberately broad: one unreadable workbook must not abort
            # the conversion of the remaining files.
            print(f"Error converting {filename}: {e}")
        else:
            print(f"Converted {filename} to CSV successfully.")

# Locate the data folders next to this script. Notebooks and interactive
# sessions define no __file__, so fall back to the working directory there.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Source workbooks live in 'excel_files'; converted output goes to 'csv_files'.
input_folder, output_folder = (
    os.path.join(current_dir, subdir) for subdir in ('excel_files', 'csv_files')
)

convert_xls_to_csv(input_folder, output_folder)


Converted TOP500_199306.xls to CSV successfully.
Converted TOP500_199311.xls to CSV successfully.
Converted TOP500_199406.xls to CSV successfully.
Converted TOP500_199411.xls to CSV successfully.
Converted TOP500_199506.xls to CSV successfully.
Converted TOP500_199511.xls to CSV successfully.
Converted TOP500_199606.xls to CSV successfully.
Converted TOP500_199611.xls to CSV successfully.
Converted TOP500_199706.xls to CSV successfully.
Converted TOP500_199711.xls to CSV successfully.
Converted TOP500_199806.xls to CSV successfully.
Converted TOP500_199811.xls to CSV successfully.
Converted TOP500_199906.xls to CSV successfully.
Converted TOP500_199911.xls to CSV successfully.
Converted TOP500_200006.xls to CSV successfully.
Converted TOP500_200011.xls to CSV successfully.
Converted TOP500_200106.xls to CSV successfully.
Converted TOP500_200111.xls to CSV successfully.
Converted TOP500_200206.xls to CSV successfully.
Converted TOP500_200211.xls to CSV successfully.
Converted TOP500_200

# Now convert the XLSX files to CSV as well

In [4]:
import os
import pandas as pd
from pathlib import Path

def convert_xlsx_to_csv(input_folder, output_folder):
    """Convert every ``.xlsx`` workbook in *input_folder* to a CSV file.

    Each matching file is read with ``pd.read_excel`` using its default
    arguments and written to *output_folder* under the same base name with
    a ``.csv`` extension, without the DataFrame index. Matching is
    case-sensitive and non-recursive; ``.xls`` files are not matched.

    Args:
        input_folder: Directory that is scanned for ``.xlsx`` files.
        output_folder: Directory that receives the CSV files. It is created
            (including missing parents) if it does not exist.

    Returns:
        None. One status line per matching file is printed to stdout.

    Raises:
        OSError: If *input_folder* cannot be listed or *output_folder*
            cannot be created. Failures on an individual file are caught,
            reported on stdout, and do not stop the batch.
    """
    suffix = '.xlsx'
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order and the printed log reproducible between runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(suffix):
            continue

        input_path = os.path.join(input_folder, filename)
        # Strip only the trailing extension. str.replace() would also
        # rewrite any earlier '.xlsx' that happens to occur inside the name.
        csv_name = filename[:-len(suffix)] + '.csv'
        output_path = os.path.join(output_folder, csv_name)

        try:
            df = pd.read_excel(input_path)
            df.to_csv(output_path, index=False)
        except Exception as e:
            # Deliberately broad: one unreadable workbook must not abort
            # the conversion of the remaining files.
            print(f"Error converting {filename}: {e}")
        else:
            print(f"Converted {filename} to CSV successfully.")

# Locate the data folders next to this script. Notebooks and interactive
# sessions define no __file__, so fall back to the working directory there.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Source workbooks live in 'excel_files'; converted output goes to 'csv_files'.
input_folder, output_folder = (
    os.path.join(current_dir, subdir) for subdir in ('excel_files', 'csv_files')
)

convert_xlsx_to_csv(input_folder, output_folder)


Converted TOP500_202006.xlsx to CSV successfully.
Converted TOP500_202011.xlsx to CSV successfully.
Converted TOP500_202106.xlsx to CSV successfully.
Converted TOP500_202111.xlsx to CSV successfully.
Converted TOP500_202206.xlsx to CSV successfully.
Converted TOP500_202211.xlsx to CSV successfully.
Converted TOP500_202306.xlsx to CSV successfully.
Converted TOP500_202311.xlsx to CSV successfully.
Converted TOP500_202406.xlsx to CSV successfully.
Converted TOP500_202411.xlsx to CSV successfully.
