# Multinomial Logit — **Same preprocessing as OLS**, modelling step swapped
This notebook mirrors the OLS pipeline (winsorisation, standardisation, dummies, interaction),
then fits a multinomial logit on `SGrowth_2`. It auto-detects the base category and derives
`High Growth vs Stressed` if needed. Exports odds ratios and marginal effects to Excel.

In [26]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from IPython.display import display

In [27]:
# === LOAD DATA ===
FILE = '2000 Analysis Python CHANGED.xlsx'
df = pd.read_excel(FILE)
print('Loaded:', df.shape)


Loaded: (9050, 81)


In [28]:
# === PARAMETERS (match OLS) ===
target_cont = 'LSGrowth_2023_2019'                 # continuous growth used in OLS
winsor_limits = (0.025, 0.025)                     # 2.5% tails as in OLS
core_cont = ['Starting_INT','Starting_BS_Strength','Starting_Size_ln','Starting_Profit']
cat_target = 'SGrowth_2'                           # categorical for MNLogit
sector_ref = 'budownictwo'                         # OLS reference sector
pf_col = 'P/F'                                     # ownership P/F
hgx_col = 'HGX'                                    # Increased/Decreased Export Intensity


In [29]:
# === WINSORISE CONTINUOUS DV (as in OLS) ===
if target_cont in df.columns:
    arr = df[target_cont].astype(float).to_numpy()
    df[target_cont + '_win'] = winsorize(arr, limits=winsor_limits)
    print('Winsorised DV created:', target_cont + '_win')
else:
    print('Warning: continuous OLS DV not found; continuing with multinomial anyway.')


Winsorised DV created: LSGrowth_2023_2019_win


In [30]:
# === STANDARDISE core continuous predictors (z-score), as in OLS ===
for c in core_cont:
    if c in df.columns:
        mu, sd = df[c].mean(), df[c].std(ddof=0)
        df[c + '_z'] = (df[c] - mu) / (sd if sd else 1.0)
    else:
        raise KeyError(f'Missing predictor: {c}')
print('Standardised predictors:', [c + '_z' for c in core_cont])


Standardised predictors: ['Starting_INT_z', 'Starting_BS_Strength_z', 'Starting_Size_ln_z', 'Starting_Profit_z']


In [31]:
# === DUMMIES to match OLS ===
# Foreign dummy from P/F (F=1, P=0)
df['Foreign'] = (df[pf_col].astype(str).str.upper().str.strip() == 'F').astype(int)

# HGX Increased Export Intensity dummy (reference = Decreased)
df['HGX_Increased Export Intensity'] = (df[hgx_col].astype(str).str.strip() == 'Increased Export Intensity').astype(int)

# Sector dummies, drop reference sector
sector_d = pd.get_dummies(df['Sector'], prefix='Sector', drop_first=False)
ref_col = f'Sector_{sector_ref}'
if ref_col in sector_d.columns:
    sector_d = sector_d.drop(columns=[ref_col])
print('Sector dummies:', len(sector_d.columns), ' (ref =', sector_ref, ')')


Sector dummies: 13  (ref = budownictwo )


In [32]:
# === INTERACTION (as in OLS) ===
df['Starting_INT_Mult_Starting_Size_ln'] = df['Starting_INT'] * df['Starting_Size_ln']

# Collect the design matrix columns mirroring OLS structure (z-scored cores + dummies + interaction)
X_cols = [c + '_z' for c in core_cont] + ['Foreign','HGX_Increased Export Intensity','Starting_INT_Mult_Starting_Size_ln']
X = pd.concat([df[X_cols], sector_d], axis=1)

# Align with categorical target
cat_map = {'0.Stressed':0,'1.Declining':1,'2.Stable':2,'3.High Growth':3}
y = df[cat_target].map(cat_map)
mask = (~y.isna()) & (~X.isna().any(axis=1))
X = X.loc[mask].copy()
y = y.loc[mask].astype(int)
print('Model N:', X.shape[0], 'Features:', X.shape[1])


Model N: 9015 Features: 20


In [33]:
# === SAFETY: ensure numeric dtypes to avoid object errors ===
Xc = sm.add_constant(X, has_constant='add')
Xc = Xc.astype(float)
y = y.astype(int)
print('dtypes:', Xc.dtypes.value_counts().to_dict())


dtypes: {dtype('float64'): 21}


In [34]:
# === FIT MULTINOMIAL LOGIT (robust version) ===
import warnings
import numpy as np
import pandas as pd
import statsmodels.api as sm

# 0) Ensure numeric types to avoid "object" dtype errors
Xc = sm.add_constant(X, has_constant='add')
Xc = Xc.astype(float)
y  = y.astype(int)

# 1) Quick rank diagnostics (helps understand "Singular matrix")
rank = np.linalg.matrix_rank(Xc.values)
print(f"Design shape: {Xc.shape} | matrix rank: {rank} | deficiency: {Xc.shape[1]-rank}")
const_cols = [c for c in Xc.columns if Xc[c].nunique(dropna=True) <= 1]
if const_cols:
    print("Constant/empty columns detected and will be dropped:", const_cols)
    Xc = Xc.drop(columns=const_cols)

# 2) Re-check rank after pruning
rank = np.linalg.matrix_rank(Xc.values)
print(f"After pruning -> shape: {Xc.shape} | rank: {rank} | deficiency: {Xc.shape[1]-rank}")

# 3) Fit with stricter optimiser; capture warnings
with warnings.catch_warnings(record=True) as wlist:
    warnings.simplefilter("always")
    mnl = sm.MNLogit(y, Xc).fit(method="newton", maxiter=500, tol=1e-8, disp=True)

print("\n=== Standard summary ===")
display(mnl.summary())

# 5) If you saw a ConvergenceWarning or singularity, consider pooling rare sectors (see Option A+: pooling)
if wlist:
    print("\nWarnings during fit:")
    for wi in wlist:
        print(f"- {wi.category.__name__}: {wi.message}")


Design shape: (9015, 21) | matrix rank: 21 | deficiency: 0
Constant/empty columns detected and will be dropped: ['const']
After pruning -> shape: (9015, 20) | rank: 20 | deficiency: 0
         Current function value: 1.183813
         Iterations: 500

=== Standard summary ===


0,1,2,3
Dep. Variable:,SGrowth_2,No. Observations:,9015.0
Model:,MNLogit,Df Residuals:,8955.0
Method:,MLE,Df Model:,57.0
Date:,"Fri, 19 Sep 2025",Pseudo R-squ.:,0.04588
Time:,15:11:21,Log-Likelihood:,-10672.0
converged:,False,LL-Null:,-11185.0
Covariance Type:,nonrobust,LLR p-value:,8.557000000000001e-178

SGrowth_2=1,coef,std err,z,P>|z|,[0.025,0.975]
Starting_INT_z,-0.6045,0.108,-5.607,0.000,-0.816,-0.393
Starting_BS_Strength_z,0.2940,0.038,7.752,0.000,0.220,0.368
Starting_Size_ln_z,0.1038,0.037,2.795,0.005,0.031,0.177
Starting_Profit_z,0.1071,0.040,2.698,0.007,0.029,0.185
Foreign,-0.0341,0.073,-0.467,0.640,-0.177,0.109
HGX_Increased Export Intensity,0.0390,0.071,0.547,0.585,-0.101,0.179
Starting_INT_Mult_Starting_Size_ln,0.0012,0.000,5.125,0.000,0.001,0.002
Sector_chemia,0.4970,0.218,2.280,0.023,0.070,0.924
Sector_energetyka,-0.7526,0.360,-2.088,0.037,-1.459,-0.046
Sector_górnictwo i hutnictwo,0.7953,0.269,2.961,0.003,0.269,1.322





In [36]:
# === MARGINAL EFFECTS (average dydx) ===
mfx = mnl.get_margeff(at='overall', method='dydx').summary_frame()
display(mfx.head(40))

Unnamed: 0_level_0,Unnamed: 1_level_0,dy/dx,Std. Err.,z,Pr(>|z|),Conf. Int. Low,Cont. Int. Hi.
endog,exog,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SGrowth_2=0,Starting_INT_z,0.059944,0.01046,5.730589,1.000823e-08,0.039442,0.080446
SGrowth_2=0,Starting_BS_Strength_z,-0.031532,0.003846,-8.198785,2.428289e-16,-0.03907,-0.023994
SGrowth_2=0,Starting_Size_ln_z,-0.015485,0.003882,-3.988908,6.637817e-05,-0.023094,-0.007877
SGrowth_2=0,Starting_Profit_z,-0.01773,0.004092,-4.332403,1.47491e-05,-0.025751,-0.009709
SGrowth_2=0,Foreign,0.017566,0.007572,2.319903,0.02034611,0.002725,0.032406
SGrowth_2=0,HGX_Increased Export Intensity,-0.020203,0.007406,-2.728004,0.006371888,-0.034718,-0.005688
SGrowth_2=0,Starting_INT_Mult_Starting_Size_ln,-0.000105,2.3e-05,-4.622261,3.795803e-06,-0.00015,-6.1e-05
SGrowth_2=0,Sector_chemia,-0.048783,0.022599,-2.15866,0.03087655,-0.093075,-0.00449
SGrowth_2=0,Sector_energetyka,0.053683,0.032757,1.638835,0.1012476,-0.010519,0.117886
SGrowth_2=0,Sector_górnictwo i hutnictwo,0.437639,151998.29986,3e-06,0.9999977,-297910.755798,297911.631076


In [37]:
# === EXPORT to Excel ===
with pd.ExcelWriter('MNLogit_SamePreprocAs_OLS.xlsx') as w:
    for k in or_tables:
        or_tables[k].to_excel(w, sheet_name=f'OR_cat{k}_vs_base')
    if derived_hg_vs_0 is not None:
        derived_hg_vs_0.to_excel(w, sheet_name='3_vs_0_Derived')
    mfx.to_excel(w, sheet_name='MarginalEffects_overall')
print('✅ Exported MNLogit_SamePreprocAs_OLS.xlsx')


✅ Exported MNLogit_SamePreprocAs_OLS.xlsx


Notes:
- Preprocessing matches OLS: **winsorisation**, **z-scored predictors**, **Foreign**, **HGX Increased**, **sector** dummies, **INT×Size**.
- Model step is swapped to **MNLogit**.
- If the base is High Growth (3), the notebook also **derives High Growth vs Stressed** ORs automatically.
- All predictors are coerced to float before fitting to avoid dtype errors.