In [1]:
import pandas as pd
import numpy as np

# Location of the raw data, relative to the notebook's working directory
DATA_PATH = 'csv/K15T_K16_K17.csv'

df = pd.read_csv(DATA_PATH)

In [2]:
# Replace spaces with underscores in column headings
df.columns = df.columns.str.replace(' ', '_')

# Columns whose values have a natural order; they are rank-encoded later
# instead of being one-hot encoded.
ord_list = ['Age_Group']

# Nominal (object-dtype) columns, excluding the ordinal ones.
cat_list = [col for col in df.columns if df[col].dtype == 'object' and col not in ord_list]

# Numeric columns, excluding the ordinal ones.
# Selecting by the generic 'number' kind avoids two problems of the previous
# `dtype == 'int' or dtype == 'float' and col not in ord_list` test:
#   * `and` binds tighter than `or`, so the ord_list filter was never applied
#     to integer columns;
#   * comparing against the string 'int' depends on the platform's default
#     integer width, so an int64 column is missed where that default is int32.
num_list = [
    col
    for col in df.select_dtypes(include='number').columns
    if col not in ord_list
]

# Show the distinct levels of every ordinal and categorical column
# (ordinal columns first, then the nominal ones).
for col in ord_list + cat_list:
    print(f'======== {col} =========')
    print(df[col].unique())

['15 - 29' '30 - 39' '40 - 49' '50 & Over']
['Male' 'Female']
['Long-term' 'Short-term']
['Associate Professionals & Technicians'
 'Managers & Administrators (Including Working Proprietors)'
 'Professionals' 'Cleaners, Labourers & Related Workers'
 'Clerical Support Workers' 'Craftsmen & Related Trades Workers' 'Others'
 'Plant & Machine Operators & Assemblers' 'Service & Sales Workers']
['PMETs' 'Non-PMETs']


In [3]:
# Handle ordinal list
# Low-to-high level order for each ordinal column. The integer rank of a level
# is its position in this list, so the encoding no longer depends on the order
# in which values happen to appear in the data (which is what unique() returns).
ORDINAL_LEVELS = {
    'Age_Group': ['15 - 29', '30 - 39', '40 - 49', '50 & Over'],
}

Ord_dict_list = {}
for col in ord_list:
    if col not in df.columns:
        continue

    levels = ORDINAL_LEVELS.get(col)
    if levels is None:
        # No declared order for this column: fall back to first-appearance order.
        levels = list(df[col].unique())

    # Fail loudly on labels that have no declared rank; map() would otherwise
    # turn them into NaN without any warning.
    unexpected = set(df[col].dropna().unique()) - set(levels)
    if unexpected:
        raise ValueError(f'{col} has values with no declared rank: {sorted(unexpected)}')

    Ord_dict_list[col] = {level: rank for rank, level in enumerate(levels)}

print(Ord_dict_list)

# Encode for ordinal list
df_encoded = df.copy()
for col, mapping in Ord_dict_list.items():
    # pop() removes the text column and the assignment appends the encoded one,
    # so the encoded column ends up as the last column of the frame.
    df_encoded[col] = df_encoded.pop(col).map(mapping)

df_encoded.head()


{'Age_Group': {'15 - 29': 0, '30 - 39': 1, '40 - 49': 2, '50 & Over': 3}}


Unnamed: 0,Year,Gender,Type,Occupation,Pmet_Status,Count_(Thousands),Age_Group
0,2014,Male,Long-term,Associate Professionals & Technicians,PMETs,0.286364,0
1,2014,Male,Short-term,Associate Professionals & Technicians,PMETs,1.317603,0
2,2014,Female,Long-term,Associate Professionals & Technicians,PMETs,0.0,0
3,2014,Female,Short-term,Associate Professionals & Technicians,PMETs,1.629557,0
4,2014,Male,Long-term,Managers & Administrators (Including Working P...,PMETs,0.182998,0


In [4]:
# Replace spaces with underscores in the categorical values, so the dummy
# column names generated from them contain no spaces
for col in cat_list:
    df_encoded[col] = df_encoded[col].str.replace(' ', '_')

# One-hot encode every categorical column except Occupation. Occupation is
# left out here because drop_first=True would drop its first level, which is
# not the level that should be omitted for that column.
cat_list_less_occupation = [col for col in cat_list if col != 'Occupation']
df_encoded = pd.get_dummies(df_encoded, columns=cat_list_less_occupation, drop_first=True)

# Display the encoded dataframe's properties.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only adds a stray "None" line to the output.
df_encoded.info()
print(df_encoded.shape)
# True if any cell in the frame is missing
print(df_encoded.isnull().sum().any())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584 entries, 0 to 1583
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               1584 non-null   int64  
 1   Occupation         1584 non-null   object 
 2   Count_(Thousands)  1584 non-null   float64
 3   Age_Group          1584 non-null   int64  
 4   Gender_Male        1584 non-null   bool   
 5   Type_Short-term    1584 non-null   bool   
 6   Pmet_Status_PMETs  1584 non-null   bool   
dtypes: bool(3), float64(1), int64(2), object(1)
memory usage: 54.3+ KB
None
(1584, 7)
False


In [None]:
# One-hot encode Occupation by hand so that the omitted level is "Others"
# rather than the first level
dummies = pd.get_dummies(df_encoded['Occupation']).drop(columns='Others')

# Swap the text Occupation column for its indicator columns
df_encoded = pd.concat([df_encoded.drop(columns='Occupation'), dummies], axis=1)
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584 entries, 0 to 1583
Data columns (total 14 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Year                                                       1584 non-null   int64  
 1   Count_(Thousands)                                          1584 non-null   float64
 2   Age_Group                                                  1584 non-null   int64  
 3   Gender_Male                                                1584 non-null   bool   
 4   Type_Short-term                                            1584 non-null   bool   
 5   Pmet_Status_PMETs                                          1584 non-null   bool   
 6   Associate_Professionals_&_Technicians                      1584 non-null   bool   
 7   Cleaners,_Labourers_&_Related_Workers                      1584 non-null   bool   
 8   Clerical

In [6]:
# To handle fixed time effects: one indicator column per year, with the first
# year dropped
df_encoded = pd.get_dummies(df_encoded, columns=['Year'], drop_first=True)

# Convert all Boolean columns (True/False) to 1/0.
# The columns are picked by their dtype rather than by "not listed in
# num_list / ord_list / cat_list", so the cast no longer depends on how those
# lists were built and cannot touch a non-boolean column by accident.
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({col: int for col in bool_cols})

# info() prints its report itself and returns None, so it is not wrapped in print()
df_encoded.info()
print(df_encoded.shape)
df_encoded.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584 entries, 0 to 1583
Data columns (total 23 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Count_(Thousands)                                          1584 non-null   float64
 1   Age_Group                                                  1584 non-null   int64  
 2   Gender_Male                                                1584 non-null   int32  
 3   Type_Short-term                                            1584 non-null   int32  
 4   Pmet_Status_PMETs                                          1584 non-null   int32  
 5   Associate_Professionals_&_Technicians                      1584 non-null   int32  
 6   Cleaners,_Labourers_&_Related_Workers                      1584 non-null   int32  
 7   Clerical_Support_Workers                                   1584 non-null   int32  
 8   Craftsme

Unnamed: 0,Count_(Thousands),Age_Group,Gender_Male,Type_Short-term,Pmet_Status_PMETs,Associate_Professionals_&_Technicians,"Cleaners,_Labourers_&_Related_Workers",Clerical_Support_Workers,Craftsmen_&_Related_Trades_Workers,Managers_&_Administrators_(Including_Working_Proprietors),...,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020,Year_2021,Year_2022,Year_2023,Year_2024
0,0.286364,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.317603,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.629557,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.182998,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import statsmodels.api as sm

def pooled_linear_regression(df, y_label):
    """Fit an OLS regression of one column of ``df`` on all the other columns.

    Parameters
    ----------
    df : DataFrame holding the dependent variable and every regressor.
    y_label : name of the column in ``df`` to use as the dependent variable.

    Returns
    -------
    The statsmodels summary of the fitted model (the fitted model itself is
    not returned).
    """
    # Dependent variable
    target = df[y_label]

    # Every remaining column is a regressor; add_constant supplies the intercept
    regressors = sm.add_constant(df.drop(columns=y_label))

    # Fit by ordinary least squares and hand back the results summary
    fitted = sm.OLS(target, regressors).fit()
    return fitted.summary()

In [8]:
# First run: regress Count_(Thousands) on every other encoded column
pooled_linear_regression(df_encoded, y_label='Count_(Thousands)')

0,1,2,3
Dep. Variable:,Count_(Thousands),R-squared:,0.454
Model:,OLS,Adj. R-squared:,0.447
Method:,Least Squares,F-statistic:,61.88
Date:,"Fri, 12 Sep 2025",Prob (F-statistic):,7.38e-188
Time:,00:32:32,Log-Likelihood:,-1300.7
No. Observations:,1584,AIC:,2645.0
Df Residuals:,1562,BIC:,2763.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.4959,0.066,-7.462,0.000,-0.626,-0.366
Age_Group,0.0627,0.012,5.039,0.000,0.038,0.087
Gender_Male,0.0393,0.028,1.411,0.159,-0.015,0.094
Type_Short-term,0.7035,0.028,25.277,0.000,0.649,0.758
Pmet_Status_PMETs,0.6021,0.036,16.653,0.000,0.531,0.673
Associate_Professionals_&_Technicians,0.3365,0.036,9.307,0.000,0.266,0.407
"Cleaners,_Labourers_&_Related_Workers",0.4078,0.059,6.907,0.000,0.292,0.524
Clerical_Support_Workers,0.8737,0.059,14.798,0.000,0.758,0.990
Craftsmen_&_Related_Trades_Workers,0.1209,0.059,2.047,0.041,0.005,0.237

0,1,2,3
Omnibus:,404.377,Durbin-Watson:,2.26
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1391.374
Skew:,1.233,Prob(JB):,7.36e-303
Kurtosis:,6.873,Cond. No.,1.79e+16


In [9]:
# Tackling multicollinearity
# Step 1: drop Pmet_Status_PMETs (the first fit reported a condition number of
# 1.79e+16 with this column included).
# A non-inplace drop with errors='ignore' keeps the cell re-runnable: the
# previous in-place drop raised KeyError the second time the cell was run.
df_encoded = df_encoded.drop(columns='Pmet_Status_PMETs', errors='ignore')
pooled_linear_regression(df_encoded, 'Count_(Thousands)')

# Step 2: VIF (TODO - not implemented yet)


0,1,2,3
Dep. Variable:,Count_(Thousands),R-squared:,0.454
Model:,OLS,Adj. R-squared:,0.447
Method:,Least Squares,F-statistic:,61.88
Date:,"Fri, 12 Sep 2025",Prob (F-statistic):,7.38e-188
Time:,00:32:32,Log-Likelihood:,-1300.7
No. Observations:,1584,AIC:,2645.0
Df Residuals:,1562,BIC:,2763.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.4959,0.066,-7.462,0.000,-0.626,-0.366
Age_Group,0.0627,0.012,5.039,0.000,0.038,0.087
Gender_Male,0.0393,0.028,1.411,0.159,-0.015,0.094
Type_Short-term,0.7035,0.028,25.277,0.000,0.649,0.758
Associate_Professionals_&_Technicians,0.9386,0.059,15.897,0.000,0.823,1.054
"Cleaners,_Labourers_&_Related_Workers",0.4078,0.059,6.907,0.000,0.292,0.524
Clerical_Support_Workers,0.8737,0.059,14.798,0.000,0.758,0.990
Craftsmen_&_Related_Trades_Workers,0.1209,0.059,2.047,0.041,0.005,0.237
Managers_&_Administrators_(Including_Working_Proprietors),0.5761,0.059,9.757,0.000,0.460,0.692

0,1,2,3
Omnibus:,404.377,Durbin-Watson:,2.26
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1391.374
Skew:,1.233,Prob(JB):,7.36e-303
Kurtosis:,6.873,Cond. No.,25.2


In [10]:
# Residual error analysis

In [11]:


# TODO: F-test to check for significance - COVID, and then split it up...

