In [5]:
!pip install openpyxl



In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing   import MinMaxScaler
from sklearn.ensemble        import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics         import roc_auc_score
import os

# 1) Load the exported risk table from the user's home directory
risk = pd.read_excel(os.path.expanduser("~/risk_nogeo_export.xlsx"))

# 2) Impute missing slip_mm values: record a 0/1 missingness flag first,
#    then fill each gap with the median slip_mm of the row's age_ord group.
risk['slip_missing'] = risk['slip_mm'].isna().astype(int)
slip_group_median = risk.groupby('age_ord')['slip_mm'].transform('median')
# A group with no observed slip_mm at all has a NaN median, which would leave
# the gap unfilled and propagate NaN into the scaled column and the risk
# score. Fall back to the overall median of the observed values for those rows.
slip_overall_median = risk['slip_mm'].median()
risk['slip_mm'] = (
    risk['slip_mm']
    .fillna(slip_group_median)
    .fillna(slip_overall_median)
)

# 3) Min-max scale each raw feature and store it as "<feature>_norm".
#    The same scaler object is re-fitted for every column, so after the loop
#    it only holds the parameters of the last column processed.
cols_to_scale = (
    'pop_density', 'dist_to_fault', 'slip_mm', 'fault_freq',
    'range', 'averageSignal', 'pop_dist',
)
scaler = MinMaxScaler()
for col in cols_to_scale:
    scaled_values = scaler.fit_transform(risk[[col]])
    risk[f"{col}_norm"] = scaled_values

# 4) Composite risk score: a weighted sum of the scaled features and
#    class_bin. Terms written as (1 - x) contribute more when the scaled
#    value is smaller. The weights add up to 1.0.
density_term     = 0.35 * risk['pop_density_norm']
fault_dist_term  = 0.25 * (1 - risk['dist_to_fault_norm'])
fault_freq_term  = 0.10 * risk['fault_freq_norm']
class_term       = 0.10 * risk['class_bin']
slip_term        = 0.05 * risk['slip_mm_norm']
signal_term      = 0.05 * (1 - risk['averageSignal_norm'])
range_term       = 0.05 * (1 - risk['range_norm'])
pop_dist_term    = 0.05 * (1 - risk['pop_dist_norm'])

# Summed in a fixed left-to-right order so the floating-point result is stable.
risk['risk_score'] = (
    density_term
    + fault_dist_term
    + fault_freq_term
    + class_term
    + slip_term
    + signal_term
    + range_term
    + pop_dist_term
)

# 5) Isolation Forest outlier flag.
#    The model sees every column except the composite score, the *_norm
#    columns, and the imputed slip_mm / slip_missing pair.
norm_cols = [name for name in risk.columns if name.endswith('_norm')]
excluded_cols = ['risk_score'] + norm_cols + ['slip_mm', 'slip_missing']
iso_feats = risk.drop(columns=excluded_cols)

iso = IsolationForest(contamination=0.05, random_state=42, n_jobs=-1)
outlier_pred = iso.fit_predict(iso_feats)
# Rows predicted as -1 are stored as 1 in iso_flag, everything else as 0.
risk['iso_flag'] = (outlier_pred == -1).astype(int)

# 6) Critical stations: composite score of at least 0.80, or flagged by the
#    Isolation Forest.
# NOTE(review): the recorded run reports a maximum risk_score of about 0.755,
# so in that run the score threshold matched no rows and every critical row
# came from iso_flag -- confirm whether 0.80 is the intended cut-off.
is_critical = (risk['risk_score'] >= 0.80) | (risk['iso_flag'] == 1)
crit = risk.loc[is_critical].copy()

# Keep the 1000 highest-scoring rows; ties are ordered by scaled population
# density, also descending. The result is written without the index column.
crit_top = crit.sort_values(
    by=['risk_score', 'pop_density_norm'],
    ascending=[False, False],
).head(1000)
crit_top.to_csv("critical_stations.csv", index=False)
print("critical_stations.csv oluşturuldu – satır:", crit_top.shape[0])

# -----------------------------------------------------------
# Quick summary of the scoring run
# -----------------------------------------------------------
# Distribution of the composite score over all rows
score_stats = risk['risk_score'].describe()
print("\nRisk skor dağılımı:\n", score_stats)

# Size of the full critical set (before the top-1000 cut)
print("\nKritik istasyon sayısı:", len(crit))

# First five rows of the exported top list, restricted to the key columns
preview_cols = ['cell','risk_score','pop_density','dist_to_fault','iso_flag']
print("\nİlk 5 kritik satır:\n", crit_top[preview_cols].head())


# 7) Sanity check of risk_score against rule-based pseudo-labels (ROC-AUC).
#    Positive (1): pop_density at or above its 70th percentile AND
#                  dist_to_fault <= 1000.
#    Negative (0): pop_density at or below its 30th percentile AND
#                  dist_to_fault >= 10000.
#    Rows matching neither rule are excluded from the check.
q70, q30 = risk['pop_density'].quantile([.70,.30])

dense_and_near = (risk['pop_density'] >= q70) & (risk['dist_to_fault'] <= 1000)
sparse_and_far = (risk['pop_density'] <= q30) & (risk['dist_to_fault'] >= 10000)
in_pseudo_set = dense_and_near | sparse_and_far

pseudo = risk[in_pseudo_set].copy()
pseudo['label'] = dense_and_near[in_pseudo_set].astype(int)

# Only the held-out 30% is scored: risk_score itself is used as the ranking,
# so nothing is fitted on the training part of the split.
Xtr, Xte, ytr, yte = train_test_split(
    pseudo[['risk_score']],
    pseudo['label'],
    test_size=0.3,
    random_state=42,
    stratify=pseudo['label'],
)
auc = roc_auc_score(yte, Xte['risk_score'])
print("Yeni pseudo-label ROC-AUC:", round(auc,3))


critical_stations.csv oluşturuldu – satır: 1000

Risk skor dağılımı:
 count    108195.000000
mean          0.476415
std           0.062308
min           0.262355
25%           0.439868
50%           0.471433
75%           0.517396
max           0.755363
Name: risk_score, dtype: float64

Kritik istasyon sayısı: 5410

İlk 5 kritik satır:
              cell  risk_score   pop_density  dist_to_fault  iso_flag
52666  7622111533    0.730383  41955.894531   1.579086e+04         1
52667  7622111534    0.729450  41955.894531   1.546641e+04         1
10761       13155    0.659052  47598.863281   2.836254e+06         1
11586       11122    0.635339  42984.562500   2.835633e+06         1
25804    48578308    0.617925  11763.995117   1.190658e+03         1
Yeni pseudo-label ROC-AUC: 0.895


In [8]:
# --------------------------------------------
# 1) Number of rows x columns
# --------------------------------------------
frame_shape = risk.shape
print("Satır × Sütun:", frame_shape)

# --------------------------------------------
# 2) Column dtypes and non-null counts
#    (info() prints its report directly)
# --------------------------------------------
print("\n--- risk_nogeo.info() ---")
risk.info(show_counts=True)

# --------------------------------------------
# 3) Descriptive statistics of the numeric columns
# --------------------------------------------
print("\n--- Sayısal Sütun İstatistikleri ---")
numeric_summary = risk.describe()
print(numeric_summary)

# --------------------------------------------
# 4) Are any object-dtype columns left?
# --------------------------------------------
obj_cols = risk.select_dtypes(include="object").columns
# Check for presence by length: Index.any() evaluates the truthiness of the
# column labels themselves, so an object column labelled 0 or "" on its own
# would wrongly be reported as "no object columns".
if len(obj_cols) > 0:
    print("\n--- Object Sütun İstatistikleri ---")
    print(risk[obj_cols].describe())
else:
    print("\nObject-tipi sütun bulunmuyor – tüm değişkenler sayısal / bool / int.")


Satır × Sütun: (108195, 45)

--- risk_nogeo.info() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108195 entries, 0 to 108194
Data columns (total 45 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   mcc                 108195 non-null  int64  
 1   net                 108195 non-null  int64  
 2   area                108195 non-null  int64  
 3   cell                108195 non-null  int64  
 4   unit                108195 non-null  int64  
 5   lon                 108195 non-null  float64
 6   lat                 108195 non-null  float64
 7   range               108195 non-null  int64  
 8   samples             108195 non-null  int64  
 9   changeable          108195 non-null  int64  
 10  created             108195 non-null  int64  
 11  updated             108195 non-null  int64  
 12  averageSignal       108195 non-null  float64
 13  dist_to_fault       108195 non-null  float64
 14  index_right         108195 no