In [None]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from bs4 import BeautifulSoup
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, ExtraTreesRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score

try:
    import pycountry_convert
    from pycountry_convert import country_name_to_country_alpha2, country_alpha2_to_continent_code
except ImportError:
    pycountry_convert = None

# Numbeo "current quality of life" ranking page and the User-Agent header
# sent with the primary request below.
url = "https://www.numbeo.com/quality-of-life/rankings_current.jsp"
headers = {"User-Agent": "Mozilla/5.0"}

try:
    # Primary path: download the page ourselves and parse only the table
    # whose id is "t2".
    response = requests.get(url, headers=headers, timeout=20)
    # Fail fast on 4xx/5xx responses instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"id": "t2"})
    if table is None:
        # Without this check the text "None" would be handed to read_html and
        # the resulting parser error would hide the real cause.
        raise ValueError('table with id "t2" not found in the response')
    df = pd.read_html(StringIO(str(table)))[0]
except Exception as e:
    # Fallback: try direct pd.read_html on the page URL; if that fails, raise a clear error
    try:
        df = pd.read_html(url, header=0)[0]
    except Exception as e2:
        # Chain the fallback failure so the traceback shows the full context.
        raise RuntimeError(
            f"Could not load table from Numbeo: {e}; fallback failed: {e2}"
        ) from e2

# Rename the scraped columns positionally (pandas raises if the number of
# names does not match the number of columns), then drop the rank column.
df.columns = ["Rank","City","Quality_of_Life","Purchasing_Power","Safety","Health_Care",
              "Cost_of_Living","Property_Price_Income","Traffic_Time","Pollution","Climate"]
df = df.drop(columns=["Rank"])

# "City" is comma-separated text: the first piece is used as the city name and
# the last piece as the country. The .str accessor propagates missing values
# as NaN (removed by dropna() below) instead of raising AttributeError the way
# calling .split on a non-string value would.
city_parts = df["City"].str.split(",")
df["City_Name"] = city_parts.str[0].str.strip()
df["Country"] = city_parts.str[-1].str.strip()

# Force the index columns to numeric; unparseable cells become NaN and the
# affected rows are discarded.
cols = ["Quality_of_Life","Purchasing_Power","Safety","Health_Care",
        "Cost_of_Living","Pollution","Climate","Property_Price_Income"]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df = df.dropna()

# Trim the extremes: keep rows strictly between the 2nd and 98th percentile of
# Cost_of_Living.
q_low = df["Cost_of_Living"].quantile(0.02)
q_hi  = df["Cost_of_Living"].quantile(0.98)
# .copy() makes the filtered frame independent of its parent so that adding
# columns to it later does not emit SettingWithCopyWarning.
df = df[(df["Cost_of_Living"] > q_low) & (df["Cost_of_Living"] < q_hi)].copy()

def get_continent(country_name):
    """Return a continent code for *country_name*, or "Other" if unresolved.

    Args:
        country_name: Country name as it appears in the "Country" column.

    Returns:
        "Other" when pycountry_convert is unavailable or the lookup fails,
        "NA" for "United States" / "USA", otherwise whatever
        country_alpha2_to_continent_code returns for the resolved alpha-2
        code (presumably a two-letter continent code -- confirm against the
        pycountry_convert documentation).
    """
    if pycountry_convert is None:
        return "Other"
    # The United States is special-cased and never goes through the lookup.
    if country_name in ("United States", "USA"):
        return "NA"
    try:
        code = country_name_to_country_alpha2(country_name)
        return country_alpha2_to_continent_code(code)
    except Exception:
        # Best-effort mapping: any lookup failure is bucketed as "Other".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return "Other"

# Tag every row with the continent resolved from its country.
df["Continent"] = df["Country"].map(get_continent)

# Engineered features:
#   Refah_Puani      = Purchasing_Power / (Property_Price_Income + 1) * 10
#   Pollution_Impact = Pollution * Health_Care
price_income_plus_one = df["Property_Price_Income"] + 1
df["Refah_Puani"] = (df["Purchasing_Power"] / price_income_plus_one) * 10
df["Pollution_Impact"] = df["Pollution"] * df["Health_Care"]

# Model inputs (X) and the regression target (y).
feature_columns = [
    "Purchasing_Power",
    "Safety",
    "Health_Care",
    "Pollution",
    "Climate",
    "Continent",
    "Refah_Puani",
    "Pollution_Impact",
]
X = df[feature_columns]
y = df["Cost_of_Living"]

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Numeric columns are robust-scaled; the continent label is one-hot encoded,
# with categories unseen during fit ignored at predict time.
numeric_features = [
    "Purchasing_Power",
    "Safety",
    "Health_Care",
    "Pollution",
    "Climate",
    "Refah_Puani",
    "Pollution_Impact",
]
categorical_features = ["Continent"]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Three tree ensembles combined by a voting regressor.
reg1 = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.04,
    max_depth=5,
    min_samples_leaf=4,
    random_state=42,
)
shared_forest_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "min_samples_leaf": 3,
    "random_state": 42,
}
reg2 = RandomForestRegressor(**shared_forest_params)
reg3 = ExtraTreesRegressor(**shared_forest_params)

voting_model = VotingRegressor(
    estimators=[("gb", reg1), ("rf", reg2), ("et", reg3)]
)

# Preprocessing and the ensemble are fitted together as one pipeline.
model = Pipeline(steps=[("preprocessor", preprocessor), ("model", voting_model)])
model.fit(X_train, y_train)

# R² on the held-out split.
y_pred = model.predict(X_test)
r2_val = r2_score(y_test, y_pred)

# Predict for every row (training and test rows alike) and store the rounded
# gap between the actual and the predicted cost of living.
df["Tahmin"] = model.predict(X)
price_gap = df["Cost_of_Living"] - df["Tahmin"]
df["Fark"] = price_gap.round(1)

# Group the cities into four clusters on three robust-scaled columns.
cluster_features = ["Purchasing_Power", "Safety", "Cost_of_Living"]
X_clust = RobustScaler().fit_transform(df[cluster_features])
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(X_clust)
df["Cluster_ID"] = kmeans.labels_


def _describe_cluster(members):
    """Pick the display label for one cluster from its column means."""
    mean_power = members["Purchasing_Power"].mean()
    if mean_power > 75:
        return "Zengin"
    if members["Safety"].mean() < 55:
        return "Riskli Bölge"
    if mean_power < 45:
        return "Düşük Alım Gücü"
    return "Dengeli"


# The first matching rule wins, in the order written in _describe_cluster.
cluster_names = {
    cluster_id: _describe_cluster(df[df["Cluster_ID"] == cluster_id])
    for cluster_id in range(4)
}

df["AI_Kategorisi"] = df["Cluster_ID"].map(cluster_names)

# Hover box: show country, Refah_Puani, category and gap; hide the two axis
# values.
hover_fields = {
    "Country": True,
    "Refah_Puani": ":.2f",
    "AI_Kategorisi": True,
    "Fark": ":.1f",
    "Tahmin": False,
    "Cost_of_Living": False,
}
axis_labels = {
    "Tahmin": "Modelin Olması Gereken Fiyat Tahmini",
    "Cost_of_Living": "Gerçek Piyasa Fiyatı",
    "Fark": "Fiyat Sapması",
    "Purchasing_Power": "Alım Gücü",
}

# Predicted vs. actual cost of living, coloured by the gap (diverging scale
# centred on 0) and sized by purchasing power.
fig = px.scatter(
    df,
    x="Tahmin",
    y="Cost_of_Living",
    color="Fark",
    size="Purchasing_Power",
    hover_name="City_Name",
    hover_data=hover_fields,
    color_continuous_scale=px.colors.diverging.RdYlGn_r,
    color_continuous_midpoint=0,
    title=f"Global Yaşam Arbitrajı (Model Başarısı R²: %{r2_val*100:.1f})",
    labels=axis_labels,
    height=700,
    template="plotly_white",
)

# Dashed y = x reference line spanning the observed Cost_of_Living range.
lowest_cost = df["Cost_of_Living"].min()
highest_cost = df["Cost_of_Living"].max()
fig.add_shape(
    type="line",
    line={"dash": "dash", "color": "gray"},
    x0=lowest_cost,
    y0=lowest_cost,
    x1=highest_cost,
    y1=highest_cost,
)

# Replace the numeric colour-bar ticks with three text labels.
colorbar_settings = {
    "title": "Maddi Durum",
    "tickvals": [-15, 0, 15],
    "ticktext": ["Ucuz", "Adil Fiyat", "Pahalı"],
}
fig.update_layout(coloraxis_colorbar=colorbar_settings)

fig.show()

# Columns to report, mapped to the headers used in the printed table.
table_headers = {
    'City_Name': 'Şehir',
    'Country': 'Ülke',
    'AI_Kategorisi': 'Kategori',
    'Cost_of_Living': 'Gerçek Fiyat',
    'Tahmin': 'Model Tahmini',
    'Fark': 'Fırsat Farkı',
    'Refah_Puani': 'Refah Puanı',
    'Safety': 'Güvenlik',
}
selected = df[list(table_headers)]
df_table = selected.rename(columns=table_headers).round(1)

# Print all rows, sorted ascending by the gap column.
ranked = df_table.sort_values(by='Fırsat Farkı')
print(ranked.to_string())

NameError: name 'df' is not defined