In [70]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy.linalg import eigh
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.tsatools import detrend


In [71]:
def estimate_factors(X_df, r, max_iter=100, tol=1e-6):
    """
    Estimate missing values in X using the EM algorithm with factor analysis.
    
    Parameters:
    X_df : DataFrame (N, T)
        Panel data with missing values as np.nan.
    r : int
        Number of factors.
    max_iter : int, optional
        Maximum number of iterations.
    tol : float, optional
        Convergence tolerance.
    
    Returns:
    X_filled_df : DataFrame (N, T)
        Completed data matrix.
    F : DataFrame (T, r)
        Estimated factor matrix.
    """
    X = X_df.to_numpy()
    N, T = X.shape
    X_filled = X.copy()
    missing_mask = pd.isna(X)
    
    # Initialize missing values with column means
    col_means = np.nanmean(X, axis=0)
    X_filled[missing_mask] = np.take(col_means, np.where(missing_mask)[1])
    
    for _ in range(max_iter):
        X_old = X_filled.copy()
        
        # Compute SVD to get factor estimates
        # Compute covariance matrix
        S = (1 / N) * (X_filled.T @ X_filled)
        
        # Compute eigenvalues and eigenvectors
        eigenvalues, eigenvectors = eigh(S)
        
        # Select r largest eigenvectors
        F = eigenvectors[:, -r:]  # (T, r)
        
        # Estimate factor loadings
        Lambda = np.linalg.lstsq(F, X_filled.T, rcond=None)[0].T  # (N, r)
        
        # Reconstruct the data matrix
        X_filled = Lambda @ F.T
        
        # Restore observed values
        X_filled[~missing_mask] = X[~missing_mask]
        
        # Check for convergence
        if np.linalg.norm(X_filled - X_old, ord='fro') / np.linalg.norm(X_old, ord='fro') < tol:
            break
    
    X_filled_df = pd.DataFrame(X_filled, index=X_df.index, columns=X_df.columns)
    F_df = pd.DataFrame(F, index=X_df.columns, columns=[f'Factor_{i+1}' for i in range(r)])
    
    return X_filled_df, F_df


In [72]:
df         = pd.ExcelFile('Inflation-data.xlsx')
continents = pd.read_csv('country-and-continent-codes-list-csv.csv')

In [73]:
#continents.rename(columns={'Three_Letter_Country_Code':'Country Code'}, inplace=True)
continents = continents[['Continent_Name', 'Three_Letter_Country_Code']].dropna()

In [74]:
counts = continents[['Three_Letter_Country_Code']].value_counts(dropna = False)
val    = [i[0] for i in list(counts[counts > 1].index.values)]
print(continents[continents['Three_Letter_Country_Code'].isin(val)])
continents = continents.drop([8, 16, 59, 83, 116, 191, 234])

    Continent_Name Three_Letter_Country_Code
8           Europe                       AZE
9             Asia                       AZE
16          Europe                       ARM
17            Asia                       ARM
58          Europe                       CYP
59            Asia                       CYP
83          Europe                       GEO
84            Asia                       GEO
116         Europe                       KAZ
117           Asia                       KAZ
171        Oceania                       UMI
172  North America                       UMI
191         Europe                       RUS
192           Asia                       RUS
234         Europe                       TUR
235           Asia                       TUR


In [75]:
monthly_data = [i for i in df.sheet_names if i[-1] == 'm']

In [76]:
hcpi  = pd.read_excel(df, 'hcpi_m')

hcpi  = hcpi.dropna(subset = ['Country', 'Country Code'])
hcpi_ = pd.merge(hcpi, continents, left_on='Country Code', right_on = 'Three_Letter_Country_Code', how = 'left')
hcpi_.rename(columns={'Continent_Name':'Continent'}, inplace=True)
hcpi_.Continent.loc[(hcpi_['Continent'] == 'Oceania') | (hcpi_['Continent'] == 'Asia')] = 'Asia & Oceania'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hcpi_.Continent.loc[(hcpi_['Continent'] == 'Oceania') | (hcpi_['Continent'] == 'Asia')] = 'Asia & Oceania'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

In [77]:
x     = str(min([i for i in hcpi_.columns if type(i) == int]))

if len(x) > 4:
    minimum =  int(str(1980) + x[4 - len(x):])
    maximum =  int(str(2020) + x[4 - len(x):])
else:
    minimum = 1980
    maximum = 2021

In [78]:
hcpi_  = hcpi_[[i for i in hcpi_.columns.values if i in range(minimum, maximum) or i in ['Country', 'Continent']]]
hcpi__ = hcpi_.set_index(['Country', 'Continent']).T

mask = hcpi__.isna().rolling(window = 3).sum() == 3

dt = hcpi__.loc[:,~mask.any()]
dt = dt.astype('float64')

In [79]:
dt1, F_df = estimate_factors(dt, r = 8)

In [80]:
dt_cpi = dt1.copy()
cont = dt_cpi.columns.get_level_values(1).unique()
dt_cpi.columns.get_level_values(1).unique()

Index(['Europe', 'Africa', 'North America', 'South America', 'Asia & Oceania'], dtype='object', name='Continent')

In [81]:
for j,c in enumerate(cont):
    dt_cpi[('Regional Factor',c)] = dt_cpi.loc[:,[i for i in dt_cpi.columns.values if i[1] in cont[j]]].mean(axis = 1)
dt_cpi[('Global Factor', '')] = dt_cpi.iloc[:,:-5].mean(axis = 1)

In [82]:
df_inf = np.log(dt_cpi) - np.log(dt_cpi.shift(1))
df_inf.describe()

Country,Austria,Burundi,Belgium,Burkina Faso,Bahamas,Bolivia,Brazil,Barbados,Botswana,Canada,...,Uruguay,United States,Samoa,South Africa,Regional Factor,Regional Factor,Regional Factor,Regional Factor,Regional Factor,Global Factor
Continent,Europe,Africa,Europe,Africa,North America,South America,South America,North America,Africa,North America,...,South America,North America,Asia & Oceania,Africa,Europe,Africa,North America,South America,Asia & Oceania,Unnamed: 21_level_1
count,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,...,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0
mean,0.002033,0.00757,0.00217,0.002401,0.002516,0.028282,0.056802,0.003356,0.006901,0.002469,...,0.019837,0.002494,0.004771,0.006852,0.002812,0.006353,0.003983,0.011311,0.003837,0.004511
std,0.003899,0.020466,0.003133,0.019403,0.004362,0.090459,0.093272,0.008063,0.00587,0.003917,...,0.021226,0.003421,0.018489,0.005914,0.00312,0.005975,0.002829,0.013354,0.003235,0.00231
min,-0.010939,-0.05039,-0.006092,-0.124373,-0.017263,-0.025763,-0.00511,-0.041544,-0.014446,-0.010426,...,-0.007341,-0.019339,-0.049636,-0.011419,-0.008672,-0.026001,-0.006137,-0.00244,-0.008266,-0.00302
25%,0.0,-0.004434,0.000132,-0.006469,0.0,0.001623,0.004041,-0.001007,0.002817,0.0,...,0.004566,0.000682,-0.005787,0.002827,0.001075,0.003333,0.002287,0.003939,0.002052,0.003038
50%,0.001864,0.005163,0.001771,0.001596,0.001817,0.005233,0.00777,0.002884,0.006028,0.002212,...,0.011297,0.00242,0.003407,0.006072,0.002695,0.005876,0.003704,0.007412,0.003488,0.004296
75%,0.003847,0.017297,0.003739,0.010201,0.004022,0.012221,0.075047,0.007083,0.009849,0.004658,...,0.031854,0.004387,0.013283,0.010554,0.004902,0.00882,0.005237,0.015489,0.005261,0.005424
max,0.024078,0.157625,0.016097,0.117538,0.028521,1.039471,0.600977,0.066872,0.036834,0.025943,...,0.143388,0.015095,0.119523,0.035994,0.015882,0.043923,0.021071,0.130426,0.038438,0.020177


In [83]:
# First differences for hyperinflation
for country, region in df_inf:
    if country in ["Bolivia", "Brazil", "Peru"]:
        df_inf[(country, region)] = df_inf[(country, region)].diff

In [84]:
dummy_month = pd.get_dummies(list(range(1,13))*40, prefix="Month", drop_first=True)
df_inf[dummy_month.columns] = dummy_month.values

In [85]:
dt_cpi.to_csv('CPI.csv', index=True)  
df_inf.to_csv('Inflation.csv', index=True)  