# Notebook for Basic Data Cleaning

In [1]:
# importing necessary libraries

import pandas as pd
import numpy as np


In [2]:
# Load  the dataset

cie_df = pd.read_csv(r'C:\Users\sakhe\OneDrive\Desktop\Personal Projects_2025\Cost of International Education\Data\International_Education_Costs.csv')

# Display the first few rows of the dataframe
cie_df.head(20)

Unnamed: 0,Country,City,University,Program,Level,Duration_Years,Tuition_USD,Living_Cost_Index,Rent_USD,Visa_Fee_USD,Insurance_USD,Exchange_Rate
0,USA,Cambridge,Harvard University,Computer Science,Master,2.0,55400,83.5,2200,160,1500,1.0
1,UK,London,Imperial College London,Data Science,Master,1.0,41200,75.8,1800,485,800,0.79
2,Canada,Toronto,University of Toronto,Business Analytics,Master,2.0,38500,72.5,1600,235,900,1.35
3,Australia,Melbourne,University of Melbourne,Engineering,Master,2.0,42000,71.2,1400,450,650,1.52
4,Germany,Munich,Technical University of Munich,Mechanical Engineering,Master,2.0,500,70.5,1100,75,550,0.92
5,Japan,Tokyo,University of Tokyo,Information Science,Master,2.0,8900,76.4,1300,220,750,145.8
6,Netherlands,Amsterdam,University of Amsterdam,Artificial Intelligence,Master,1.0,15800,73.2,1500,180,720,0.92
7,Singapore,Singapore,National University of Singapore,Finance,Master,1.5,35000,81.1,1900,90,800,1.34
8,France,Paris,Sorbonne University,International Relations,Master,2.0,4500,74.6,1400,99,650,0.92
9,Switzerland,Zurich,ETH Zurich,Physics,Master,2.0,1460,91.5,2100,88,1200,0.89


In [None]:
# Display the shape of the dataframe

cie_df.shape

(907, 12)

## Data Cleaning

In [3]:
# Function to clean the dataset
def clean_data(cie_df):
    # Strip whitespace and standardize case for categorical columns
    categorical_cols = ['Country', 'City', 'University', 'Program', 'Level']
    for col in categorical_cols:
        cie_df[col] = cie_df[col].astype(str).str.strip().str.title()
        cie_df[col] = cie_df[col].astype('category')
    
    # Convert numerical columns to appropriate dtypes
    cie_df['Duration_Years'] = cie_df['Duration_Years'].astype('int64')
    cie_df['Tuition_USD'] = cie_df['Tuition_USD'].astype('int64')
    cie_df['Living_Cost_Index'] = cie_df['Living_Cost_Index'].astype('float64')
    cie_df['Rent_USD'] = cie_df['Rent_USD'].astype('int64')
    cie_df['Visa_Fee_USD'] = cie_df['Visa_Fee_USD'].astype('int64')
    cie_df['Insurance_USD'] = cie_df['Insurance_USD'].astype('int64')
    cie_df['Exchange_Rate'] = cie_df['Exchange_Rate'].astype('float64')
    
    return cie_df

cie_df_clean = clean_data(cie_df.copy())
cie_df_clean.head()


Unnamed: 0,Country,City,University,Program,Level,Duration_Years,Tuition_USD,Living_Cost_Index,Rent_USD,Visa_Fee_USD,Insurance_USD,Exchange_Rate
0,Usa,Cambridge,Harvard University,Computer Science,Master,2,55400,83.5,2200,160,1500,1.0
1,Uk,London,Imperial College London,Data Science,Master,1,41200,75.8,1800,485,800,0.79
2,Canada,Toronto,University Of Toronto,Business Analytics,Master,2,38500,72.5,1600,235,900,1.35
3,Australia,Melbourne,University Of Melbourne,Engineering,Master,2,42000,71.2,1400,450,650,1.52
4,Germany,Munich,Technical University Of Munich,Mechanical Engineering,Master,2,500,70.5,1100,75,550,0.92


In [None]:
# Rename columns for clarity

columns_to_rename = {
    'Tuition_USD': 'Tuition fee USD',
    'Duration_Years': 'Duration in Years',
    'Living_Cost_Index': 'Living Cost Index',
    'Rent_USD': 'Rent USD',
    'Visa_Fee_USD': 'Visa Fee USD',
    'Insurance_USD': 'Insurance USD',
    'Exchange_Rate': 'Exchange Rate'}

# Rename columns
cie_df_clean.rename(columns=columns_to_rename, inplace=True)
# Display the cleaned dataframe
cie_df_clean.head()

Unnamed: 0,Country,City,University,Program,Level,Duration in Years,Tuition fee USD,Living Cost Index,Rent USD,Visa Fee USD,Insurance USD,Exchange Rate
0,Usa,Cambridge,Harvard University,Computer Science,Master,2,55400,83.5,2200,160,1500,1.0
1,Uk,London,Imperial College London,Data Science,Master,1,41200,75.8,1800,485,800,0.79
2,Canada,Toronto,University Of Toronto,Business Analytics,Master,2,38500,72.5,1600,235,900,1.35
3,Australia,Melbourne,University Of Melbourne,Engineering,Master,2,42000,71.2,1400,450,650,1.52
4,Germany,Munich,Technical University Of Munich,Mechanical Engineering,Master,2,500,70.5,1100,75,550,0.92


In [7]:
# Save the cleaned dataframe to a new CSV file
cie_df_clean.to_csv(r'C:\Users\sakhe\OneDrive\Desktop\Personal Projects_2025\Cost of International Education\Data\International_Education_Costs_Cleaned.csv', index=False)