## Data Preparation
Data preparation is the process of cleaning, transforming, and enriching raw data into a desired format for better decision-making in less time. It is a crucial step before processing and analyzing data to ensure that the data is accurate, consistent, and complete.

### 1. Load Modules & Dataset

In [None]:
# 1. Load Modules & Dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
# Every later cell needs both frames, so a missing input file must stop the
# notebook here instead of surfacing further down as an unrelated NameError.
try:
    edu_df = pd.read_csv("education.csv")
    labour_df = pd.read_csv("labour.csv")
except FileNotFoundError as e:
    # Report which file is missing, then re-raise so execution halts.
    print(e)
    raise
else:
    print("Files loaded successfully!")

In [None]:
# 2. Initial Inspection
# Print a banner, then the structural summary of the education frame
# (column names, non-null counts and dtypes).
print("--- Education Data Info ---")
edu_df.info()

In [None]:
# Print a banner, then the structural summary of the labour frame
# (column names, non-null counts and dtypes).
print("\n--- Labour Data Info ---")
labour_df.info()

In [None]:
# 3. Clean and Filter Data

# Keep only the South Africa rows of each dataset; .copy() makes every subset
# an independent frame rather than a slice of the raw data.
edu_sa = edu_df.loc[edu_df["REF_AREA_LABEL"].eq("South Africa")].copy()
labour_sa = labour_df.loc[labour_df["REF_AREA_LABEL"].eq("South Africa")].copy()

In [None]:
# Report how many South Africa rows each filtered frame holds.
print(f"\nFound {edu_sa.shape[0]} rows for South Africa in the education dataset.")
print(f"Found {labour_sa.shape[0]} rows for South Africa in the labour dataset.")

In [None]:
# Source column name -> short analysis name, for the columns kept downstream.
columns_to_keep = dict(
    TIME_PERIOD="Year",
    INDICATOR_LABEL="Indicator",
    SEX_LABEL="Sex",
    AGE_LABEL="Age",
    OBS_VALUE="Value",
)

# Project each South Africa frame onto those columns, then apply the short names.
edu_clean = edu_sa.loc[:, list(columns_to_keep)].rename(columns=columns_to_keep)
labour_clean = labour_sa.loc[:, list(columns_to_keep)].rename(columns=columns_to_keep)

In [None]:
# 4. Handle Missing Values
# Count the missing entries in each column of the cleaned education frame.
print("\n--- Checking for missing values in cleaned Education data ---")
print(edu_clean.isna().sum())

In [None]:
# Count the missing entries in each column of the cleaned labour frame.
print("\n--- Checking for missing values in cleaned Labour data ---")
print(labour_clean.isna().sum())

In [None]:
# 5. Generate Descriptive Statistics and Insights
# describe() summarises the numeric columns of the cleaned education frame
# (count, mean, std, min, quartiles, max).
print("\n--- Descriptive Statistics for Education Data (South Africa) ---")
print(edu_clean.describe())

In [None]:
# describe() summarises the numeric columns of the cleaned labour frame
# (count, mean, std, min, quartiles, max).
print("\n--- Descriptive Statistics for Labour Data (South Africa) ---")
print(labour_clean.describe())

In [None]:
# Initial Visual Insight
# Side-by-side histograms (with a KDE overlay) of the observed values in each
# cleaned dataset. The explicit Figure/Axes interface ties every label to the
# panel it belongs to instead of relying on pyplot's "current axes" state.
fig, (ax_edu, ax_labour) = plt.subplots(1, 2, figsize=(12, 5))

# Histogram for Education values
sns.histplot(edu_clean['Value'], kde=True, ax=ax_edu)
ax_edu.set_title('Distribution of Education Attainment (%)')
ax_edu.set_xlabel('Percentage')
ax_edu.set_ylabel('Frequency')

# Histogram for Labour values
sns.histplot(labour_clean['Value'], kde=True, ax=ax_labour)
ax_labour.set_title('Distribution of Labour Force Participation (%)')
ax_labour.set_xlabel('Percentage')
# This label was previously left unset, so the right panel fell back to the
# plotting library's default y-label and did not match the left panel.
ax_labour.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [3]:
"""geting infomation about the  about labour the number of rows and coloumns including 
if there are null values"""

# info() prints its summary as a side effect; head(3) is the cell's last
# expression, so its three-row preview is what gets rendered as the cell output.
labour_df.info()
labour_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8216 entries, 0 to 8215
Data columns (total 45 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   STRUCTURE               8216 non-null   object 
 1   STRUCTURE_ID            8216 non-null   object 
 2   ACTION                  8216 non-null   object 
 3   FREQ                    8216 non-null   object 
 4   FREQ_LABEL              8216 non-null   object 
 5   REF_AREA                8216 non-null   object 
 6   REF_AREA_LABEL          8216 non-null   object 
 7   INDICATOR               8216 non-null   object 
 8   INDICATOR_LABEL         8216 non-null   object 
 9   SEX                     8216 non-null   object 
 10  SEX_LABEL               8216 non-null   object 
 11  AGE                     8216 non-null   object 
 12  AGE_LABEL               8216 non-null   object 
 13  URBANISATION            8216 non-null   object 
 14  URBANISATION_LABEL      8216 non-null   

Unnamed: 0,STRUCTURE,STRUCTURE_ID,ACTION,FREQ,FREQ_LABEL,REF_AREA,REF_AREA_LABEL,INDICATOR,INDICATOR_LABEL,SEX,...,DATA_SOURCE_LABEL,UNIT_TYPE,UNIT_TYPE_LABEL,TIME_FORMAT,TIME_FORMAT_LABEL,COMMENT_OBS,OBS_STATUS,OBS_STATUS_LABEL,OBS_CONF,OBS_CONF_LABEL
0,datastructure,WB.DATA360:DS_DATA360(1.2),I,A,Annual,AFE,Africa Eastern and Southern,WB_WDI_SL_TLF_CACT_ZS,"Labor force participation rate, total (% of to...",_T,...,World Development Indicators (WDI),RATIO,Ratio,P1Y,Annual,,A,Normal value,PU,Public
1,datastructure,WB.DATA360:DS_DATA360(1.2),I,A,Annual,AFW,Africa Western and Central,WB_WDI_SL_TLF_CACT_ZS,"Labor force participation rate, total (% of to...",_T,...,World Development Indicators (WDI),RATIO,Ratio,P1Y,Annual,,A,Normal value,PU,Public
2,datastructure,WB.DATA360:DS_DATA360(1.2),I,A,Annual,ARB,Arab World,WB_WDI_SL_TLF_CACT_ZS,"Labor force participation rate, total (% of to...",_T,...,World Development Indicators (WDI),RATIO,Ratio,P1Y,Annual,,A,Normal value,PU,Public


In [4]:
# Print the structural summary first, then leave head(3) as the cell's last
# expression so the three-row preview is actually rendered. Only the final
# expression of a cell is displayed; a head() call placed above info() is
# evaluated and silently discarded.
edu_df.info()
edu_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332 entries, 0 to 1331
Data columns (total 45 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   STRUCTURE               1332 non-null   object 
 1   STRUCTURE_ID            1332 non-null   object 
 2   ACTION                  1332 non-null   object 
 3   FREQ                    1332 non-null   object 
 4   FREQ_LABEL              1332 non-null   object 
 5   REF_AREA                1332 non-null   object 
 6   REF_AREA_LABEL          1332 non-null   object 
 7   INDICATOR               1332 non-null   object 
 8   INDICATOR_LABEL         1332 non-null   object 
 9   SEX                     1332 non-null   object 
 10  SEX_LABEL               1332 non-null   object 
 11  AGE                     1332 non-null   object 
 12  AGE_LABEL               1332 non-null   object 
 13  URBANISATION            1332 non-null   object 
 14  URBANISATION_LABEL      1332 non-null   

In [5]:
# Restrict both datasets to the South Africa rows. .copy() makes each subset an
# independent frame, so later cleaning steps never operate on a slice of the
# raw data (and this matches how the same filter is written in the earlier
# "Clean and Filter Data" cell).
edu_sa = edu_df[edu_df["REF_AREA_LABEL"] == "South Africa"].copy()
labour_sa = labour_df[labour_df["REF_AREA_LABEL"] == "South Africa"].copy()

In [6]:
# Summary statistics for the numeric columns of the South Africa education rows.
edu_sa.describe()

Unnamed: 0,TIME_PERIOD,OBS_VALUE,DECIMALS,UNIT_MULT,COMMENT_OBS
count,13.0,13.0,13.0,13.0,0.0
mean,2016.846154,8.306698,2.0,0.0,
std,4.140172,2.77501,0.0,0.0,
min,2010.0,4.839276,2.0,0.0,
25%,2014.0,6.038021,2.0,0.0,
50%,2017.0,8.02,2.0,0.0,
75%,2020.0,11.108956,2.0,0.0,
max,2023.0,12.96,2.0,0.0,


In [7]:
# Summary statistics for the numeric columns of the South Africa labour rows.
labour_sa.describe()

Unnamed: 0,TIME_PERIOD,OBS_VALUE,DECIMALS,UNIT_MULT,COMMENT_OBS
count,35.0,35.0,35.0,35.0,0.0
mean,2007.0,59.423029,2.0,0.0,
std,10.246951,2.574678,0.0,0.0,
min,1990.0,54.171,2.0,0.0,
25%,1998.5,57.829,2.0,0.0,
50%,2007.0,59.802,2.0,0.0,
75%,2015.5,61.9855,2.0,0.0,
max,2024.0,62.569,2.0,0.0,


## Cleaning Data

In [8]:
# COMMENT_OBS carries no observations for South Africa (describe() reports a
# count of 0 for it in both frames), so it is removed.
# errors="ignore" keeps this cell re-runnable: the frames are reassigned in
# place, so on a second run the column is already gone and a plain drop()
# would raise KeyError.
labour_sa = labour_sa.drop(columns=["COMMENT_OBS"], errors="ignore")
edu_sa = edu_sa.drop(columns=["COMMENT_OBS"], errors="ignore")

In [9]:
# Columns shared by both cleaned frames: period, area, indicator, the sex and
# age breakdown labels, and the observed value.
clean_columns = [
    "TIME_PERIOD",
    "REF_AREA_LABEL",
    "INDICATOR_LABEL",
    "SEX_LABEL",
    "AGE_LABEL",
    "OBS_VALUE",
]

edu_clean = edu_sa.loc[:, clean_columns]
labour_clean = labour_sa.loc[:, clean_columns]

In [10]:
# Summary statistics for the numeric columns of the cleaned labour frame.
labour_clean.describe()

Unnamed: 0,TIME_PERIOD,OBS_VALUE
count,35.0,35.0
mean,2007.0,59.423029
std,10.246951,2.574678
min,1990.0,54.171
25%,1998.5,57.829
50%,2007.0,59.802
75%,2015.5,61.9855
max,2024.0,62.569


In [11]:
# Summary statistics for the numeric columns of the cleaned education frame.
edu_clean.describe()

Unnamed: 0,TIME_PERIOD,OBS_VALUE
count,13.0,13.0
mean,2016.846154,8.306698
std,4.140172,2.77501
min,2010.0,4.839276
25%,2014.0,6.038021
50%,2017.0,8.02
75%,2020.0,11.108956
max,2023.0,12.96


# Question 2

In [12]:
import numpy as np

In [13]:
# Mean observed value across all rows of each cleaned frame.
labour_avg = labour_clean["OBS_VALUE"].mean()
edu_avg = edu_clean["OBS_VALUE"].mean()

print("Average Labour Participation Rate:", labour_avg)
print("Average Education Attainment of bachelors degree:", edu_avg)

Average Labour Participation Rate: 59.423028571428574
Average Education Attainment of bachelors degree: 8.306698102217451


In [14]:
# Mean observed value per TIME_PERIOD, as plain NumPy arrays
# (groupby sorts its keys, so the entries are in ascending period order).
labour_by_period = labour_clean.groupby("TIME_PERIOD")["OBS_VALUE"].mean()
edu_by_period = edu_clean.groupby("TIME_PERIOD")["OBS_VALUE"].mean()
labour_trend = labour_by_period.to_numpy()
edu_trend = edu_by_period.to_numpy()

# Change between consecutive entries of each trend.
# NOTE(review): the step is between consecutive periods present in the data;
# if a period is missing, that step spans more than one period. Confirm this
# is the intended definition of "growth".
labour_growth = labour_trend[1:] - labour_trend[:-1]
edu_growth = edu_trend[1:] - edu_trend[:-1]

In [15]:
# Inner-join the two cleaned frames on period, area, sex and age. Any other
# column present in both frames (such as OBS_VALUE) is kept twice, suffixed
# "_edu" / "_labour".
join_keys = ["TIME_PERIOD", "REF_AREA_LABEL", "SEX_LABEL", "AGE_LABEL"]
merged = edu_clean.merge(
    labour_clean,
    on=join_keys,
    suffixes=("_edu", "_labour"),
)

# Pearson correlation: the off-diagonal entry of the 2x2 correlation matrix.
corr_matrix = np.corrcoef(merged["OBS_VALUE_edu"], merged["OBS_VALUE_labour"])
correlation = corr_matrix[0, 1]

print("Correlation between Education Attainment and Labour Participation:", correlation)

Correlation between Education Attainment and Labour Participation: 0.37881527406726834


In [16]:
# Export cleaned and merged dataframes to folder 4 for database integration
# NOTE(review): the folder name is spelled "Intergration" here; it must match
# the directory name on disk exactly, so it is left as written.
export_dir = '../4. Database Intergration (20 marks)'
frames_to_export = (
    ('edu_clean', edu_clean),
    ('labour_clean', labour_clean),
    ('merged', merged),
)
for file_stem, frame in frames_to_export:
    # index=False: write only the data columns, not the row index.
    frame.to_csv(f'{export_dir}/{file_stem}.csv', index=False)

# Numerical Analysis
This notebook builds on the data preparation phase and performs deeper numerical analysis on the South African education and labour datasets.

## 1. Load Libraries & Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
# Both CSV paths are relative, so the files are resolved against the
# notebook's working directory.
edu_df = pd.read_csv('education.csv')
labour_df = pd.read_csv('labour.csv')

## 2. Data Preparation (Filtering for South Africa, Selecting Columns)

In [None]:
# Source column name -> short analysis name, for the columns kept downstream.
columns_to_keep = {
    'TIME_PERIOD': 'Year',
    'INDICATOR_LABEL': 'Indicator',
    'SEX_LABEL': 'Sex',
    'AGE_LABEL': 'Age',
    'OBS_VALUE': 'Value',
}


def _south_africa_subset(raw_df):
    """Return ``(sa_rows, clean)`` for one raw dataset.

    ``sa_rows`` is an independent copy of the rows whose REF_AREA_LABEL is
    'South Africa'; ``clean`` keeps only the ``columns_to_keep`` columns of
    those rows, renamed to their short names.
    """
    sa_rows = raw_df.loc[raw_df['REF_AREA_LABEL'].eq('South Africa')].copy()
    clean = sa_rows.loc[:, list(columns_to_keep)].rename(columns=columns_to_keep)
    return sa_rows, clean


edu_sa, edu_clean = _south_africa_subset(edu_df)
labour_sa, labour_clean = _south_africa_subset(labour_df)

## 3. Descriptive Statistics (Mean, Median, Mode, Std, Min, Max)

In [None]:
def _summarise(values):
    """Return ``(describe, mean, median, mode, std)`` for a numeric Series.

    ``Series.mode()`` comes back empty when the column has no non-null
    values, so the first mode is read only when one exists; otherwise NaN is
    reported instead of raising on the missing first element.

    NOTE(review): when no value repeats, every value is a mode and the first
    one returned is simply the smallest, so interpret the mode of a
    continuous measure with care.
    """
    modes = values.mode()
    first_mode = modes.iloc[0] if not modes.empty else float('nan')
    return values.describe(), values.mean(), values.median(), first_mode, values.std()


# Education statistics
edu_stats, edu_mean, edu_median, edu_mode, edu_std = _summarise(edu_clean['Value'])

# Labour statistics
labour_stats, labour_mean, labour_median, labour_mode, labour_std = _summarise(labour_clean['Value'])

print('Education Stats:', edu_stats)
print('Mean:', edu_mean, 'Median:', edu_median, 'Mode:', edu_mode, 'Std:', edu_std)
print('Labour Stats:', labour_stats)
print('Mean:', labour_mean, 'Median:', labour_median, 'Mode:', labour_mode, 'Std:', labour_std)

## 4. Grouped Analysis (By Sex, Age, Indicator)

In [None]:
# Summary statistics of Value within each category of a breakdown column.

# Group by Sex
edu_by_sex = edu_clean.groupby('Sex')['Value'].describe()
labour_by_sex = labour_clean.groupby('Sex')['Value'].describe()

# Group by Age
edu_by_age = edu_clean.groupby('Age')['Value'].describe()
labour_by_age = labour_clean.groupby('Age')['Value'].describe()

# Group by Indicator: the section heading lists Indicator as a third
# breakdown, which the cell previously did not compute.
edu_by_indicator = edu_clean.groupby('Indicator')['Value'].describe()
labour_by_indicator = labour_clean.groupby('Indicator')['Value'].describe()

print('Education by Sex:', edu_by_sex)
print('Labour by Sex:', labour_by_sex)
print('Education by Age:', edu_by_age)
print('Labour by Age:', labour_by_age)
print('Education by Indicator:', edu_by_indicator)
print('Labour by Indicator:', labour_by_indicator)

## 5. Correlation Analysis (Year Matching)

In [None]:
# Average Value per Year in each dataset, with the value column renamed so the
# two series stay distinguishable after joining.
edu_year = (
    edu_clean.groupby('Year', as_index=False)['Value']
    .mean()
    .rename(columns={'Value': 'Education_Value'})
)
labour_year = (
    labour_clean.groupby('Year', as_index=False)['Value']
    .mean()
    .rename(columns={'Value': 'Labour_Value'})
)

# Inner join: only years present in both datasets are kept.
merged = edu_year.merge(labour_year, on='Year')

correlation = merged['Education_Value'].corr(merged['Labour_Value'])
print('Correlation between Education and Labour (by Year):', correlation)

## 6. Visualization (Histograms, Scatter, Boxplots)

In [None]:
# Distribution of the observed values in each dataset, side by side.
fig_hist, (ax_edu, ax_labour) = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(edu_clean['Value'], kde=True, ax=ax_edu)
ax_edu.set_title('Education Attainment Distribution')
sns.histplot(labour_clean['Value'], kde=True, ax=ax_labour)
ax_labour.set_title('Labour Force Participation Distribution')
fig_hist.tight_layout()
plt.show()

# Scatter plot for merged data
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='Education_Value', y='Labour_Value', data=merged, ax=ax_scatter)
ax_scatter.set_title('Education vs Labour (Yearly Average)')
ax_scatter.set_xlabel('Education Value')
ax_scatter.set_ylabel('Labour Value')
plt.show()

# Boxplot by Sex
fig_box, ax_box = plt.subplots(figsize=(12, 5))
sns.boxplot(x='Sex', y='Value', data=edu_clean, ax=ax_box)
ax_box.set_title('Education by Sex')
plt.show()