In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import numpy as np

In [None]:
# Load the merged dataset from disk into `df`.
# NOTE(review): this is an absolute path under one user's home directory, so
# the notebook will not run on another machine without editing it.
MERGED_DATA_CSV = "/Users/nicolasroever/Dropbox/Promotion/LVT/landvaluetax/bld/data/merged_estonia_eurostat.csv"
df = pd.read_csv(filepath_or_buffer=MERGED_DATA_CSV)

In [None]:
# Coerce the price and rate columns to numeric dtype; entries that cannot be
# parsed become NaN instead of raising.
for numeric_col in ('average_price_per_sqm', 'general_rate'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Log price, defined only for strictly positive prices. `.where` replaces
# zero, negative and missing prices with NaN *before* the log is taken, so the
# whole column is computed in one vectorized call (no per-row Python lambda)
# and np.log never sees a non-positive value.
positive_price = df['average_price_per_sqm'].where(df['average_price_per_sqm'] > 0)
df["ln_avg_price_per_sqm"] = np.log(positive_price)

In [None]:
# Summary statistics for price and rate. Printed so the table actually shows
# up in the cell output (it was previously computed and then discarded).
summary = df[['average_price_per_sqm', 'general_rate']].describe()
print(summary)

# Time series aggregation by year_quarter (sorted): mean of each numeric
# series within every (year, year_quarter) pair.
ts = df.groupby(['year', 'year_quarter'])[['average_price_per_sqm', 'general_rate', 'HPI']] \
       .mean(numeric_only=True).reset_index().sort_values(['year', 'year_quarter'])


# Scatter plot: average_price_per_sqm vs. general_rate (all rows of df)
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(df['general_rate'], df['average_price_per_sqm'])
scatter_ax.set_xlabel('General Rate')
scatter_ax.set_ylabel('Average Price per sqm')
scatter_ax.set_title('Scatter: Avg Price per sqm vs General Rate')
scatter_fig.tight_layout()
plt.show()

# Time series plot: mean price per sqm (left axis) and HPI (right axis)
fig, ax1 = plt.subplots()
ax1.plot(ts['year_quarter'], ts['average_price_per_sqm'], color='tab:blue', label='Avg Price per sqm')
ax1.set_xlabel('Year-Quarter')
ax1.set_ylabel('Mean Avg Price per sqm', color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')
# Restyle the tick labels matplotlib already generated instead of overwriting
# them with set_xticklabels(): that call, without fixed tick positions, emits a
# warning and attaches labels purely by position.
plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')

# Second y-axis sharing the same x-axis, so HPI can use its own scale.
ax2 = ax1.twinx()
ax2.plot(ts['year_quarter'], ts['HPI'], color='tab:orange', label='HPI', linestyle='--')
ax2.set_ylabel('HPI', color='tab:orange')
ax2.tick_params(axis='y', labelcolor='tab:orange')

ax2.set_title('Time Series: Mean Avg Price per sqm & HPI by Quarter')
fig.tight_layout()
plt.show()

# Time series plot: general_rate over quarters
rate_fig, rate_ax = plt.subplots()
rate_ax.plot(ts['year_quarter'], ts['general_rate'])
rate_ax.set_xlabel('Year-Quarter')
rate_ax.set_ylabel('Mean General Rate')
rate_ax.set_title('Time Series: Mean General Rate by Quarter')
plt.setp(rate_ax.get_xticklabels(), rotation=45, ha='right')
rate_fig.tight_layout()
plt.show()

In [None]:
# Create year_quarter ("<year> <roman quarter>") if not present
if 'year_quarter' not in df.columns and 'year' in df.columns and 'quarter' in df.columns:
    df['year_quarter'] = df['year'].astype(str) + ' ' + df['quarter']

# Define mapping for Roman numerals to order
roman_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}

# Extract and order the quarters chronologically: sort by (year, quarter
# number); an unrecognised quarter token sorts first within its year (key 0).
unique_q = df['year_quarter'].dropna().unique()
order = sorted(unique_q, key=lambda x: (int(x.split()[0]), roman_map.get(x.split()[1], 0)))

# Convert to categorical with ordered quarters
df['year_quarter'] = pd.Categorical(df['year_quarter'], categories=order, ordered=True)

# Add random noise to general_rate for visualization (seeded so the jitter is
# reproducible; one draw per row of df).
np.random.seed(42)
noise = np.random.normal(0, 0.03, size=len(df))

# Plot against the integer category codes rather than the labels themselves.
# Matplotlib positions string categories in order of first appearance in the
# data and ignores the Categorical's ordering, so plotting the labels directly
# would only be chronological if df happened to be sorted already.
quarter_pos = df['year_quarter'].cat.codes
has_quarter = quarter_pos >= 0  # code -1 marks a missing year_quarter

# Plot scatter of all observations that have a quarter
jitter_fig, jitter_ax = plt.subplots(figsize=(12, 6))
jitter_ax.scatter(quarter_pos[has_quarter], (df['general_rate'] + noise)[has_quarter], alpha=0.6)
# Code i corresponds to order[i], so ticks 0..len(order)-1 get those labels.
jitter_ax.set_xticks(range(len(order)))
jitter_ax.set_xticklabels(order, rotation=45, ha='right')
jitter_ax.set_xlabel('Year-Quarter')
jitter_ax.set_ylabel('General Rate (with noise)')
jitter_ax.set_title('Scatter: General Rate by Quarter (All Observations, Noisy)')
jitter_fig.tight_layout()
plt.show()

In [None]:
# Lookup tables: Roman-numeral quarter -> quarter number -> first month of
# that quarter.
roman_to_int = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
month_map = {1: 1, 2: 4, 3: 7, 4: 10}

# Split "<year> <quarter>" labels on whitespace and derive date components.
# Note that this overwrites df['year'] with the value parsed from the label.
df['year_quarter'] = df['year_quarter'].astype(str)
parts = df['year_quarter'].str.split()
df['year'] = parts.str.get(0).astype(int)
df['quarter_str'] = parts.str.get(1)
df['quarter_num'] = df['quarter_str'].map(roman_to_int)
df['month'] = df['quarter_num'].map(month_map)
# Assemble a timestamp for the first day of each quarter from the
# year / month / day components.
df['quarter_start'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# One thin grey line per municipality, each drawn in chronological order.
mun_fig, mun_ax = plt.subplots(figsize=(12, 6))
for _, mun_rows in df.groupby('municipality'):
    chronological = mun_rows.sort_values('quarter_start')
    mun_ax.plot(
        chronological['quarter_start'],
        chronological['general_rate'],
        color='grey',
        linewidth=0.5,
        alpha=0.6,
    )

mun_ax.set_xlabel('Quarter')
mun_ax.set_ylabel('General Rate')
mun_ax.set_title('General Rate Time Series by Municipality')
mun_fig.tight_layout()
plt.show()

In [None]:
# Fixed effects regressions of log price per sqm on general_rate, fitted by OLS
# with categorical dummies as fixed effects.

# Specification 1: municipality dummies plus HPI as a control.
simple_spec = 'ln_avg_price_per_sqm ~ general_rate + HPI + C(municipality)'
# Specification 2: adds county and house-type dummies.
# NOTE(review): if every municipality lies in exactly one county, C(county) is
# collinear with C(municipality) -- confirm this is intended.
both_spec = 'ln_avg_price_per_sqm ~ general_rate + C(municipality) + C(county) + C(house_type) + HPI'

model_simple = smf.ols(formula=simple_spec, data=df).fit()
print(model_simple.summary())

model_both = smf.ols(formula=both_spec, data=df).fit()
print(model_both.summary())