In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('global_salary_data.csv')

# Basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country_name    221 non-null    object 
 1   continent_name  221 non-null    object 
 2   wage_span       221 non-null    object 
 3   median_salary   221 non-null    float64
 4   average_salary  221 non-null    float64
 5   lowest_salary   221 non-null    float64
 6   highest_salary  221 non-null    float64
dtypes: float64(4), object(3)
memory usage: 12.2+ KB
None
       median_salary  average_salary  lowest_salary  highest_salary
count     221.000000      221.000000     221.000000      221.000000
mean     1762.631906     1982.339812     502.783204     8802.165619
std      1634.708716     1835.429193     470.073328     8140.210641
min         0.261335        0.285524       0.072092        1.271103
25%       567.210000      651.000000     163.930000     2900.480000
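50%      1227.460000     1344.230000     339.450000     5974.360000
75%      2389.010000     2740.000000     690.000000    12050.740000
max      9836.070000    11292.900000    2850.270000    50363.930000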

In [2]:
# Show the column labels to confirm the expected headers were loaded
column_index = df.columns
print(column_index)

Index(['country_name', 'continent_name', 'wage_span', 'median_salary',
       'average_salary', 'lowest_salary', 'highest_salary'],
      dtype='object')


In [3]:
# DATA CLEANING

# Map each raw snake_case header to the Title_Case label used from here on.
column_labels = {
    'country_name': 'Country',
    'continent_name': 'Continent',
    'wage_span': 'Wage_Span',
    'median_salary': 'Median_Salary',
    'average_salary': 'Average_Salary',
    'lowest_salary': 'Lowest_Salary',
    'highest_salary': 'Highest_Salary',
}

# The rename is applied to the existing frame rather than to a copy.
df.rename(columns=column_labels, inplace=True)

print("New column names:", df.columns)

New column names: Index(['Country', 'Continent', 'Wage_Span', 'Median_Salary', 'Average_Salary',
       'Lowest_Salary', 'Highest_Salary'],
      dtype='object')


In [4]:
# Count the missing values in each column

missing_per_column = df.isna().sum()
print(missing_per_column)

Country           0
Continent         0
Wage_Span         0
Median_Salary     0
Average_Salary    0
Lowest_Salary     0
Highest_Salary    0
dtype: int64


In [5]:
# Report the dataset dimensions as a (rows, columns) tuple

print((len(df.index), len(df.columns)))

(221, 7)


In [7]:
# Preview the first 10 rows to confirm the entries look as expected

print(df.iloc[:10])

               Country      Continent Wage_Span  Median_Salary  \
0          Afghanistan           Asia   Monthly         853.74   
1        Aland Islands         Europe   Monthly        3319.24   
2              Albania         Europe   Monthly         832.84   
3              Algeria         Africa   Monthly        1148.84   
4       American Samoa        Oceania   Monthly        1390.00   
5              Andorra         Europe   Monthly        3668.08   
6               Angola         Africa   Monthly         284.39   
7  Antigua and Barbuda      Caribbean   Monthly        1548.15   
8            Argentina  South America   Monthly         110.28   
9              Armenia           Asia   Monthly        1700.25   

   Average_Salary  Lowest_Salary  Highest_Salary  
0         1001.15         252.53         4460.97  
1         3858.35         972.52        17124.74  
2          956.92         241.22         4258.49  
3         1308.81         330.11         5824.18  
4         1570.00 

In [8]:
# DATA EXPLORATION

# Collect the distinct continent and country labels present in the data
unique_continents, unique_countries = (
    df[label_column].unique() for label_column in ('Continent', 'Country')
)

print(f"Unique continents: {len(unique_continents)}")
print(f"Unique countries: {len(unique_countries)}")


# Keep the two counts as named totals and report them as well
total_continents, total_countries = map(len, (unique_continents, unique_countries))

print(f"Total number of continents: {total_continents}")
print(f"Total number of countries: {total_countries}")

Unique continents: 9
Unique countries: 221
Total number of continents: 9
Total number of countries: 221


In [21]:
# Rank countries directly on their salary columns.
# The previous groupby('Country').sum() would add together the rows of any
# country that appears more than once and would report an all-missing salary
# as 0.0 (which then ranks as the "lowest"). Indexing by country and using
# nlargest/nsmallest keeps each row's value as-is and skips missing values.
salaries_by_country = df.set_index('Country')

# Countries with the highest average salary
top_countries_average_salary = salaries_by_country['Average_Salary'].nlargest(10)
print("Top 10 countries with highest average salary:\n", top_countries_average_salary)

# Countries with the highest salary
top_countries_highest_salary = salaries_by_country['Highest_Salary'].nlargest(5)
print("Top 5 countries with highest salary earned globally:\n", top_countries_highest_salary)

# Countries with the lowest salary (smallest values first)
top_countries_lowest_salary = salaries_by_country['Lowest_Salary'].nsmallest(5)
print("Top 5 countries with lowest salary earned globally:\n", top_countries_lowest_salary)

Top 10 countries with highest average salary:
 Country
Switzerland       11292.90
Guernsey           9409.76
United States      7925.00
Canada             7352.94
United Kingdom     7235.37
Belgium            6522.20
Jersey             6304.88
Singapore          6235.29
Liechtenstein      5825.14
Denmark            5779.04
Name: Average_Salary, dtype: float64
Top 5 countries with highest salary earned globally:
 Country
Switzerland       50363.93
Guernsey          41869.51
United States     35250.00
Canada            32720.59
United Kingdom    32214.63
Name: Highest_Salary, dtype: float64
Top 5 countries with lowest salary earned globally:
 Country
Zambia         0.072092
Syria          2.900000
Mauritania    13.203171
Sudan         18.800000
Uzbekistan    27.070000
Name: Lowest_Salary, dtype: float64


In [9]:
# DATA ANALYSIS

# Descriptive analysis: count, mean, std, min, quartiles and max per column.
# The quartiles are spelled out explicitly; they match describe()'s defaults.
summary_statistics = df.describe(percentiles=[0.25, 0.5, 0.75])
print(summary_statistics)

       Median_Salary  Average_Salary  Lowest_Salary  Highest_Salary
count     221.000000      221.000000     221.000000      221.000000
mean     1762.631906     1982.339812     502.783204     8802.165619
std      1634.708716     1835.429193     470.073328     8140.210641
min         0.261335        0.285524       0.072092        1.271103
25%       567.210000      651.000000     163.930000     2900.480000
50%      1227.460000     1344.230000     339.450000     5974.360000
75%      2389.010000     2740.000000     690.000000    12050.740000
max      9836.070000    11292.900000    2850.270000    50363.930000


In [17]:
# Comparative analysis: mean Average_Salary per continent, largest first
# NOTE(review): the captured output lists both 'Northern America' and
# 'North America' as separate groups - confirm these labels are meant to differ.

continent_groups = df.groupby('Continent')
continent_means = continent_groups['Average_Salary'].mean()
average_salary_by_continent = continent_means.sort_values(ascending=False)
print(average_salary_by_continent)

Continent
Northern America    5221.610000
Europe              3576.279912
North America       2647.316667
Oceania             1946.011667
Central America     1892.721250
Asia                1727.657722
Caribbean           1617.098333
South America       1526.053077
Africa               772.719863
Name: Average_Salary, dtype: float64


In [23]:
# Identify the countries at both extremes of Average_Salary.
# Full rows are kept so the other salary columns are shown alongside.

ranking_column = 'Average_Salary'
top_5_countries_highest_salary = df.nlargest(n=5, columns=ranking_column)
top_5_countries_lowest_salary = df.nsmallest(n=5, columns=ranking_column)

for heading, ranked_rows in (
    ("Top 5 countries with highest average salary:", top_5_countries_highest_salary),
    ("Top 5 countries with lowest average salary:", top_5_countries_lowest_salary),
):
    print(heading)
    print(ranked_rows)

Top 5 countries with highest average salary:
            Country         Continent Wage_Span  Median_Salary  \
192     Switzerland            Europe   Monthly        9836.07   
83         Guernsey            Europe   Monthly        8689.02   
209   United States  Northern America   Monthly        6966.00   
35           Canada  Northern America   Monthly        6311.03   
208  United Kingdom            Europe   Monthly        6300.00   

     Average_Salary  Lowest_Salary  Highest_Salary  
192        11292.90        2850.27        50363.93  
83          9409.76        2367.07        41869.51  
209         7925.00        2000.00        35250.00  
35          7352.94        1850.00        32720.59  
208         7235.37        1829.27        32214.63  
Top 5 countries with lowest average salary:
        Country Continent Wage_Span  Median_Salary  Average_Salary  \
219      Zambia    Africa   Monthly       0.261335        0.285524   
193       Syria      Asia   Monthly      10.120000      

In [24]:
# Write the frame to an Excel workbook, leaving out the row index

excel_path = 'cleaned_global_salary_data.xlsx'
df.to_excel(excel_path, index=False)