In [None]:
# =========================
# 0) Setup + Drive
# =========================
!pip -q install -U joblib scikit-learn

from google.colab import drive
drive.mount("/content/gdrive")

import os
import numpy as np
import pandas as pd
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.utils.validation import check_is_fitted


# =========================
# 1) Paths
# =========================
# Both the input CSV and the saved model bundle live in one Drive folder.
BASE_DIR = "/content/gdrive/MyDrive/Veri_bilimine_giris2"
CSV_PATH = os.path.join(BASE_DIR, "steam.csv")  # training data
MODEL_PATH = os.path.join(BASE_DIR, "steam_owners_model.joblib")  # joblib.dump target

# Fail fast with an explicit message when the dataset is not where we expect it.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"steam.csv bulunamadı: {CSV_PATH}")

# =========================
# 2) Pickle-safe helpers
# =========================
def to_text(col):
    """Return the first column of a DataFrame as strings.

    Missing values become the empty string before the string conversion, so
    they never show up as the literal text "nan" / "None".
    """
    first_column = col.iloc[:, 0]
    without_gaps = first_column.fillna("")
    return without_gaps.astype(str)

def to_semicolon_text(col):
    """Return the first column of a DataFrame as strings with ';' turned into spaces.

    Missing values become the empty string. The replacement is a literal
    (non-regex) substitution of every ';' with a single space.
    """
    raw = col.iloc[:, 0]
    as_text = raw.fillna("").astype(str)
    return as_text.str.replace(";", " ", regex=False)

def build_semicolon_bow():
    """Build a binary bag-of-items encoder for one ';'-separated list column.

    Each ';'-delimited item (e.g. "Steam Achievements", "Pixel Graphics")
    becomes exactly one vocabulary entry. Tokenising on spaces instead would
    break multi-word items apart and merge unrelated items that happen to
    share a word (every "Steam ..." category would contribute a common
    "steam" token), so the ';' separators are kept and used as the only
    token boundary.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"to_text"`` (first column -> strings, missing -> "")
        and ``"cv"`` (binary CountVectorizer, lower-cased by default).
    """
    return Pipeline(steps=[
        # Only normalise missing values here; the ';' separators must survive
        # so the vectorizer can split on them.
        ("to_text", FunctionTransformer(to_text, validate=False)),
        # One token per maximal run of non-';' characters.
        ("cv", CountVectorizer(binary=True, token_pattern=r"[^;]+")),
    ])

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns as smoothed per-category means of the target.

    For every column, each category ``k`` seen during ``fit`` is mapped to::

        (mean_k * count_k + global_mean * smoothing) / (count_k + smoothing)

    so categories with few rows are pulled towards the global target mean;
    the larger ``smoothing`` is, the stronger the pull. Missing values are
    treated as their own category (``"__MISSING__"``) and categories that
    were not seen during ``fit`` are encoded as the global mean.

    The statistics are computed on whatever target is passed to ``fit``.

    NOTE(review): ``fit`` followed by ``transform`` on the same rows encodes
    each training row with statistics that include its own target value.

    Parameters
    ----------
    smoothing : float, default=20.0
        Non-negative pseudo-count given to the global mean.

    Attributes
    ----------
    cols_ : list
        Column labels seen during ``fit`` (output column order).
    global_mean_ : float
        Mean of the target seen during ``fit``.
    maps_ : dict
        ``{column: {category_string: encoded_value}}``.
    """

    def __init__(self, smoothing=20.0):
        # scikit-learn contract: __init__ stores its arguments untouched and
        # creates no fitted (trailing-underscore) attributes. Coercing here
        # makes clone() fail for non-float inputs, and pre-creating ``maps_``
        # etc. makes an unfitted encoder look fitted.
        self.smoothing = smoothing

    @staticmethod
    def _as_keys(series):
        """Turn a column into string category keys, missing -> "__MISSING__"."""
        return series.fillna("__MISSING__").astype(str)

    def fit(self, X, y):
        """Learn the smoothed target mean of every category in every column.

        Raises
        ------
        ValueError
            If ``smoothing`` is negative or NaN.
        """
        smoothing = float(self.smoothing)
        if not smoothing >= 0.0:  # written this way so NaN is rejected too
            raise ValueError(
                f"smoothing must be a non-negative number, got {self.smoothing!r}"
            )

        frame = pd.DataFrame(X)
        target = np.asarray(y, dtype=float)

        self.cols_ = list(frame.columns)
        self.global_mean_ = float(np.mean(target))
        self.maps_ = {}

        prior_weight = self.global_mean_ * smoothing
        for c in self.cols_:
            # Pair keys and targets positionally; X's index is irrelevant here.
            stats = (
                pd.DataFrame({"key": self._as_keys(frame[c]).to_numpy(), "y": target})
                .groupby("key")["y"]
                .agg(["mean", "count"])
            )
            enc = (stats["mean"] * stats["count"] + prior_weight) / (stats["count"] + smoothing)
            self.maps_[c] = enc.to_dict()

        return self

    def transform(self, X):
        """Return an ``(n_rows, n_columns)`` float array of encoded values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        check_is_fitted(self, ("cols_", "global_mean_", "maps_"))
        frame = pd.DataFrame(X)

        # np.hstack/column_stack reject an empty list; keep the row count.
        if not self.cols_:
            return np.empty((len(frame), 0), dtype=float)

        encoded = [
            self._as_keys(frame[c])
            .map(self.maps_.get(c, {}))
            .fillna(self.global_mean_)  # unseen category -> global mean
            .astype(float)
            .to_numpy()
            for c in self.cols_
        ]
        return np.column_stack(encoded)

# =========================
# 3) Read data + target
# =========================
df = pd.read_csv(CSV_PATH)

# "owners" is parsed as "<low>-<high>": thousands commas are stripped first,
# then both ends of the range are read as integers.
owners = df["owners"].astype(str).str.replace(",", "", regex=False)
bounds = owners.str.split("-")
low = bounds.str[0].astype(int)
high = bounds.str[1].astype(int)

# The regression target is the midpoint of the range ...
df["owners_mid"] = 0.5 * (low + high)

# ... modelled on a log1p scale.
y = np.log1p(df["owners_mid"].to_numpy())

# =========================
# 4) Feature set (pre_release)
# =========================
MODE = "pre_release"

# Raw CSV columns the model is allowed to look at.
feature_cols = [
    "name",
    "release_date",
    "english",
    "developer",
    "publisher",
    "platforms",
    "required_age",
    "categories",
    "genres",
    "steamspy_tags",
    "price",
]

# Any expected column that the CSV lacks is created as all-NaN so the
# selection below cannot raise a KeyError.
for c in feature_cols:
    if c in df.columns:
        continue
    df[c] = np.nan

X = df[feature_cols].copy()

# release_date -> release_year: keep the first 4-digit run of the date text,
# anything unparsable becomes NaN. The raw date column is removed.
year = X.pop("release_date").astype(str).str.extract(r"(\d{4})", expand=False)
X["release_year"] = pd.to_numeric(year, errors="coerce")

# Numeric feature engineering: price as a number plus its log1p
# (missing prices count as 0 for the log feature only).
X["price"] = pd.to_numeric(X["price"], errors="coerce")
X["log_price"] = np.log1p(X["price"].fillna(0))

# Column groups consumed by the ColumnTransformer
num_cols = ["english", "required_age", "price", "log_price", "release_year"]
onehot_cols = ["platforms"]                 # few distinct values
tmean_cols = ["developer", "publisher"]     # many distinct values -> mean encoding

# =========================
# 5) Preprocess + model
# =========================
preprocess = ColumnTransformer(
    transformers=[
        # Game title: TF-IDF over lower-cased unigrams and bigrams,
        # vocabulary capped at 8000 entries.
        ("name", Pipeline(steps=[
            ("to_text", FunctionTransformer(to_text, validate=False)),
            ("tfidf", TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=8000)),
        ]), ["name"]),

        # ';'-separated list columns -> binary bag-of-words, one encoder each.
        ("genres", build_semicolon_bow(), ["genres"]),
        ("categories", build_semicolon_bow(), ["categories"]),
        ("tags", build_semicolon_bow(), ["steamspy_tags"]),

        # High-cardinality categoricals -> smoothed target mean.
        ("tmean", TargetMeanEncoder(smoothing=30.0), tmean_cols),

        # Low-cardinality categorical; unseen values encode as all zeros.
        ("onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),

        # Numeric columns can contain NaN (columns missing from the CSV are
        # created as NaN, and unparsable price / release year are coerced to
        # NaN), and Ridge rejects NaN input. Median imputation is a no-op on
        # complete data, so results are unchanged when nothing is missing.
        ("nums", SimpleImputer(strategy="median"), num_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", Ridge(alpha=2.0, random_state=42)),
])

# =========================
# 6) Training / evaluation
# =========================
# Fixed seed so the 80/20 split, and therefore the reported metrics, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
pipe.fit(X_train, y_train)

# The model predicts on the log1p scale; invert it to score in owner counts.
pred_log = pipe.predict(X_test)
pred, true = np.expm1(pred_log), np.expm1(y_test)

mae = mean_absolute_error(true, pred)
rmse = np.sqrt(mean_squared_error(true, pred))

# Spread of the residuals on the log scale (used for the prediction range).
resid_log = pred_log - y_test
std_log = float(resid_log.std())

for label, value in (
    ("MODE:", MODE),
    ("MAE (owners_mid):", round(mae, 2)),
    ("RMSE (owners_mid):", round(rmse, 2)),
    ("STD (log hata):", round(std_log, 4)),
):
    print(label, value)

# =========================
# 7) Save
# =========================
# Everything the prediction side needs travels in one dict: the fitted
# pipeline, the mode label, the log-scale residual spread and the final
# column list (which includes the engineered release_year and log_price).
bundle = dict(
    pipeline=pipe,
    mode=MODE,
    std_log=std_log,
    feature_cols=X.columns.tolist(),
)
joblib.dump(bundle, MODEL_PATH)
print("Model kaydedildi:", MODEL_PATH)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
MODE: pre_release
MAE (owners_mid): 329653.9
RMSE (owners_mid): 10234780.34
STD (log hata): 0.8628
Model kaydedildi: /content/gdrive/MyDrive/Veri_bilimine_giris2/steam_owners_model.joblib


In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

import os
import joblib
import pandas as pd
import numpy as np

BASE_DIR = "/content/gdrive/MyDrive/Veri_bilimine_giris2"
MODEL_PATH = os.path.join(BASE_DIR, "steam_owners_model.joblib")

# NOTE: joblib.load unpickles the file, so only load bundles you produced
# yourself. The pipeline also refers to helpers defined in the training cell
# (to_text, to_semicolon_text, TargetMeanEncoder); they must already be
# defined in this session or unpickling will fail.
bundle = joblib.load(MODEL_PATH)

pipe, MODE = bundle["pipeline"], bundle["mode"]
# Bundles without a stored residual spread fall back to 0.8.
STD_LOG = bundle.get("std_log", 0.8)
feature_cols = bundle["feature_cols"]

print("Model yüklendi. MODE =", MODE)

def ask_str(prompt, default=""):
    """Prompt for a line of text and return it stripped.

    An empty (or whitespace-only) answer yields ``default`` unchanged.
    """
    answer = input(f"{prompt} (varsayılan: {default}): ").strip()
    if answer:
        return answer
    return default

def ask_int(prompt, default=0):
    """Prompt for an integer, falling back to ``default``.

    Returns ``int(default)`` when the answer is empty, or when it cannot be
    parsed as an integer (a notice is printed in that case).
    """
    s = input(f"{prompt} (varsayılan: {default}): ").strip()
    if not s:
        return int(default)
    try:
        return int(s)
    except ValueError:
        # int() on a str only fails with ValueError; a bare ``except`` would
        # also hide unrelated errors.
        print("Hatalı giriş -> varsayılan kullanıldı.")
        return int(default)

def ask_float(prompt, default=0.0):
    """Prompt for a finite real number, falling back to ``default``.

    A decimal comma is accepted ("9,99" is read as 9.99). Returns
    ``float(default)`` when the answer is empty, cannot be parsed, or parses
    to a non-finite value ("nan", "inf", ...); a notice is printed for the
    last two cases.
    """
    s = input(f"{prompt} (varsayılan: {default}): ").strip().replace(",", ".")
    if not s:
        return float(default)
    try:
        value = float(s)
    except ValueError:
        # float() on a str only fails with ValueError; a bare ``except`` would
        # also hide unrelated errors.
        value = None
    # float() happily parses "nan"/"inf"; such a value would propagate into
    # the engineered features and the prediction, so treat it as invalid.
    if value is None or not np.isfinite(value):
        print("Hatalı giriş -> varsayılan kullanıldı.")
        return float(default)
    return value

def build_input_df():
    """Interactively collect one game's attributes.

    Returns
    -------
    (pandas.DataFrame, dict)
        A one-row frame whose columns follow ``feature_cols``, and the dict
        of collected values (including the derived ``log_price`` and a NaN
        entry for any feature that was not asked for).
    """
    # Dict displays evaluate top to bottom, so the prompts appear in this order.
    sample = {
        "name": ask_str("Oyun adı", "My Game"),
        "release_year": ask_int("Çıkış yılı (YYYY)", 2026),
        "english": ask_int("English (1/0)", 1),
        "developer": ask_str("Developer", "UnknownDev"),
        "publisher": ask_str("Publisher", "UnknownPub"),
        "platforms": ask_str("Platforms (windows;mac;linux)", "windows"),
        "required_age": ask_int("Required age", 0),
        "categories": ask_str("Categories (; ile) ör: Single-player;Steam Achievements", "Single-player"),
        "genres": ask_str("Genres (; ile) ör: Action;Indie", "Indie"),
        "steamspy_tags": ask_str("SteamSpy tags (; ile) ör: Action;Indie;Pixel Graphics", "Indie"),
        "price": ask_float("Price (USD)", 9.99),
    }

    # Derived feature: log1p of the price, with negative prices treated as 0.
    sample["log_price"] = float(np.log1p(max(sample["price"], 0.0)))

    # Any feature the model expects but that was not collected becomes NaN.
    for c in feature_cols:
        sample.setdefault(c, np.nan)

    row = {c: sample[c] for c in feature_cols}
    return pd.DataFrame([row]), sample

def predict_with_range(X_new, std_log=0.8, z=1.96):
    """Predict the owners midpoint for one row, with a rough range around it.

    Parameters
    ----------
    X_new : one-row input accepted by ``pipe.predict``.
    std_log : float, default=0.8
        Standard deviation of the model's residuals on the log1p scale.
    z : float, default=1.96
        Half-width of the range in units of ``std_log``. The default gives a
        rough 95% range if the log-scale errors are normally distributed.

    Returns
    -------
    (pred_mid, lo, hi) : tuple of float
        Point estimate and range bounds in owner counts, all clamped to be
        non-negative, with ``lo <= hi``.
    """
    pred_log = float(pipe.predict(X_new)[0])
    half_width = z * std_log

    # expm1 goes below zero for negative log predictions. An owner count
    # cannot be negative, and an unclamped point estimate could fall outside
    # the clamped [lo, hi] range, so clamp all three values the same way.
    pred_mid = max(0.0, float(np.expm1(pred_log)))
    lo = max(0.0, float(np.expm1(pred_log - half_width)))
    hi = max(lo, float(np.expm1(pred_log + half_width)))
    return pred_mid, lo, hi

# Collect one game's attributes, then predict with the stored residual spread.
X_new, raw = build_input_df()
pred_mid, lo, hi = predict_with_range(X_new, STD_LOG)

print("\n--- GİRİLEN VERİ ÖZETİ ---")
for field, value in raw.items():
    print(f"{field}: {value}")

# Whole numbers with "." as the thousands separator (the "," that the format
# spec emits is swapped out).
mid_txt, lo_txt, hi_txt = (
    f"{amount:,.0f}".replace(",", ".") for amount in (pred_mid, lo, hi)
)

print("\n--- TAHMİN ---")
print("Tahmini owners_mid:", mid_txt)
print("Yaklaşık aralık (95%):", lo_txt, "-", hi_txt)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Model yüklendi. MODE = pre_release
Oyun adı (varsayılan: My Game): Half-Life 3
Çıkış yılı (YYYY) (varsayılan: 2026): 2026
English (1/0) (varsayılan: 1): 1
Developer (varsayılan: UnknownDev): Valve
Publisher (varsayılan: UnknownPub): Valve
Platforms (windows;mac;linux) (varsayılan: windows): windows;linux
Required age (varsayılan: 0): 13
Categories (; ile) ör: Single-player;Steam Achievements (varsayılan: Single-player): Single-player;Steam Achievements
Genres (; ile) ör: Action;Indie (varsayılan: Indie): Action;RPG
SteamSpy tags (; ile) ör: Action;Indie;Pixel Graphics (varsayılan: Indie): Action;RPG
Price (USD) (varsayılan: 9.99): 69.99

--- GİRİLEN VERİ ÖZETİ ---
name: Half-Life 3
release_year: 2026
english: 1
developer: Valve
publisher: Valve
platforms: windows;linux
required_age: 13
categories: Single-player;Steam Achievements
genres: Action;RPG
steamspy