In [74]:
# load Pisa201520182022_GameDataset-2_altered.csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Read the PISA game dataset and discard the two covariates that are not used
# in this analysis (IMMIG and PAREDINT).
data = pd.read_csv('Pisa201520182022_GameDataset-2_altered.csv').drop(
    columns=['IMMIG', 'PAREDINT']
)

# Keep complete cases only; the shape is printed before and after so the
# number of rows lost to missing values is visible in the output.
print(data.shape)
data = data.dropna()
print(data.shape)

# Show which columns remain.
print(data.columns)



# Month is cast to int first so the dummy columns are named Month_encoded_2,
# Month_encoded_3, ... rather than carrying a float suffix.
data = data.assign(Month_encoded=data['Month_encoded'].astype(int))

# One-hot encode wave, gender and month; drop_first=True removes the first
# level of each variable so it serves as the reference category.
categorical_columns = ['WAVE', 'ST004D01T', 'Month_encoded']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# ESCS: replace the continuous index by its quartile code (0..3), expand the
# code into indicator columns, and keep only the bottom (ESCS_0) and top
# (ESCS_3) quartile indicators; the two middle quartiles are the reference.
data = data.assign(ESCS=pd.qcut(data['ESCS'], q=4, labels=False))
data = pd.get_dummies(data, columns=['ESCS']).drop(columns=['ESCS_1', 'ESCS_2'])



# Centre both test scores within country (subtract the country mean from each
# observation), then drop the country identifier, which is no longer needed.
data = data.assign(**{
    score: data.groupby('CNT')[score].transform(lambda s: s - s.mean())
    for score in ('average_math', 'average_read')
}).drop(columns=['CNT'])

# Make column names usable as terms in a statsmodels formula (smf.quantreg):
# every space and every one of  - ( ) . , > < = ? :  becomes an underscore.
# A single character class with an explicit regex=True is used so the result
# does not depend on the pandas version's default for `regex`; inside the
# class "(", ")", "." and "?" are literal characters, not regex operators.
data.columns = data.columns.str.replace(r'[ \-().,><=?:]', '_', regex=True)

# Move the W_FSTUWT column out of the frame into its own Series so it does not
# end up among the regressors.
# NOTE(review): `weights` is not passed to any model in the cells visible
# here, so the regressions below are unweighted -- confirm this is intended.
weights = data.pop('W_FSTUWT')

# Cast every remaining column (including the indicator columns produced by
# get_dummies) to float.
data = data.astype(float)




(542887, 10)
(525375, 10)
Index(['CNT', 'threshold', 'Month_encoded', 'average_math', 'average_read',
       'WAVE', 'ST004D01T', 'ESCS', 'W_FSTUWT', 'month_centered'],
      dtype='object')


In [75]:
# Add an interaction with `threshold` for every covariate, i.e. every column
# except the two outcomes and `threshold` itself.
# Columns that already carry the interaction suffix are skipped so the cell is
# idempotent: re-running it recomputes the same columns instead of creating
# `*_x_threshold_x_threshold` terms.
interaction_suffix = '_x_threshold'
not_interacted = {'average_math', 'average_read', 'threshold'}
base_columns = [
    name for name in data.columns
    if name not in not_interacted and not name.endswith(interaction_suffix)
]
for col in base_columns:
    data[col + interaction_suffix] = data[col] * data['threshold']

In [76]:
# import quantile regression
import statsmodels.formula.api as smf

# Quantile regressions of average_math on every other column of `data`
# (threshold, the covariates and their interactions with threshold);
# average_read is excluded from the regressors.
X = data.drop(['average_math', 'average_read'], axis=1)
# `y` is not used by the formula interface below, which reads the response
# from `data`; the assignment is kept so the cell still defines the same names.
y = data['average_math']

formula = 'average_math ~ ' + ' + '.join(X.columns)
print(formula)

# The model (formula parsing and design matrix) is the same for every
# quantile, so it is built once and only the fit is repeated.
# Quantiles are fitted in the order median, 10th, 90th percentile.
mod = smf.quantreg(formula, data)
for q in (.5, 0.1, .9):
    res = mod.fit(q=q)
    print(res.summary())



average_math ~ threshold + month_centered + WAVE_2018 + WAVE_2022 + ST004D01T_Male + Month_encoded_2 + Month_encoded_3 + Month_encoded_4 + Month_encoded_5 + Month_encoded_6 + Month_encoded_7 + Month_encoded_8 + Month_encoded_9 + Month_encoded_10 + Month_encoded_11 + Month_encoded_12 + ESCS_0 + ESCS_3 + month_centered_x_threshold + WAVE_2018_x_threshold + WAVE_2022_x_threshold + ST004D01T_Male_x_threshold + Month_encoded_2_x_threshold + Month_encoded_3_x_threshold + Month_encoded_4_x_threshold + Month_encoded_5_x_threshold + Month_encoded_6_x_threshold + Month_encoded_7_x_threshold + Month_encoded_8_x_threshold + Month_encoded_9_x_threshold + Month_encoded_10_x_threshold + Month_encoded_11_x_threshold + Month_encoded_12_x_threshold + ESCS_0_x_threshold + ESCS_3_x_threshold
                         QuantReg Regression Results                          
Dep. Variable:           average_math   Pseudo R-squared:              0.06178
Model:                       QuantReg   Bandwidth:         

In [66]:
# Regress each covariate in `endogs` separately (OLS) on threshold,
# month_centered, their interaction and the month dummies.
import statsmodels.api as sm

endogs = ['ST004D01T_Male', 'IMMIG_Native_student', 'PAREDINT', 'ESCS_0', 'ESCS_3']

regressors = [
    'threshold', 'month_centered', 'month_centered_x_threshold',
    'Month_encoded_2', 'Month_encoded_3', 'Month_encoded_4', 'Month_encoded_5',
    'Month_encoded_6', 'Month_encoded_7', 'Month_encoded_8', 'Month_encoded_9',
    'Month_encoded_10', 'Month_encoded_11', 'Month_encoded_12',
]
X = sm.add_constant(data[regressors])


for endog in endogs:
    # IMMIG and PAREDINT are dropped when the data is loaded, so their columns
    # may be absent; skip them with a message instead of raising KeyError and
    # aborting the remaining regressions.
    if endog not in data.columns:
        print(f'Skipping {endog}: column not present in data')
        continue

    y = data[endog]

    # OLS of the covariate on X
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())



                            OLS Regression Results                            
Dep. Variable:         ST004D01T_Male   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.056
Date:                Thu, 18 Apr 2024   Prob (F-statistic):             0.0112
Time:                        13:02:55   Log-Likelihood:            -3.6422e+05
No. Observations:              501897   AIC:                         7.285e+05
Df Residuals:                  501882   BIC:                         7.286e+05
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               