### NOT:
**Bu model daha geliştirilme aşamasında. Finale kadar son hali yüklenecektir.**

## Gerekli Kütüphaneler

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score

## publishedAt Feature Özellik Çıkarımı

In [2]:
def extract_time_features(df):
    df = df.copy()
    df["publishedAt"] = pd.to_datetime(df["publishedAt"])

    df["publish_hour"] = df["publishedAt"].dt.hour
    df["publish_day_of_week"] = df["publishedAt"].dt.dayofweek
    df["is_weekend"] = df["publish_day_of_week"].isin([5, 6]).astype(int)

    return df


## Title Metadata

In [3]:
def title_meta_features(series):
    return np.c_[
        series.str.len().fillna(0),
        series.str.split().apply(len).fillna(0),
        series.str.contains("!").astype(int),
        series.str.contains(r"\?").astype(int),
    ]


## TF-IDF pipeline

In [4]:
text_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        max_features=8000,
        ngram_range=(1, 2),
        stop_words="english"
    ))
])

title_meta_pipeline = Pipeline(steps=[
    ("meta", FunctionTransformer(title_meta_features, validate=False))
])

## Dataset Okuma

In [5]:
df = pd.read_csv('../../../Dataset/Final_Dataset.csv')
df = extract_time_features(df)

  df = pd.read_csv('../../../Dataset/Final_Dataset.csv')


In [6]:
df = df.sort_values("publishedAt").reset_index(drop=True)

## Train-Test Split

In [7]:
split_idx = int(len(df) * 0.8)

train_df = df.iloc[:split_idx]
test_df  = df.iloc[split_idx:]

In [8]:
X_train = train_df.drop(columns=["is_trending", "publishedAt"])
y_train = train_df["is_trending"]

X_test = test_df.drop(columns=["is_trending", "publishedAt"])
y_test = test_df["is_trending"]

## Model İçin Veri Hazırlama

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("title_tfidf", text_pipeline, "title"),
        ("title_meta", title_meta_pipeline, "title"),
        ("time_features", "passthrough",
         ["publish_hour", "publish_day_of_week", "is_weekend"]),
        ("category", OneHotEncoder(handle_unknown="ignore"), ["categoryId"]),
        ("comments_disabled", "passthrough", ["comments_disabled"]),
    ],
    remainder="drop"
)


## LightGBM Modeli

In [10]:
lgbm_model = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    is_unbalance=True,   # veri dengesizliği için
    random_state=42,
    n_jobs=-1
)


In [11]:
model_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", lgbm_model)
])

## Model Eğitme

In [12]:
model_pipeline.fit(X_train, y_train)
joblib.dump(model_pipeline, "model_pipeline.pkl")

[LightGBM] [Info] Number of positive: 167238, number of negative: 144796
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.722178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247917
[LightGBM] [Info] Number of data points in the train set: 312034, number of used features: 8024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.535961 -> initscore=0.144092
[LightGBM] [Info] Start training from score 0.144092


['model_pipeline.pkl']

## Sonuç Metrikleri

In [13]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score
)

y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")
print(f"PR-AUC   : {pr_auc:.4f}")



Accuracy : 0.8564
Precision: 0.1067
Recall   : 0.2062
F1-score : 0.1406
ROC-AUC  : 0.5850
PR-AUC   : 0.0795


## Örnek Test Kodu

In [14]:
# Bilgi girerek test etme
single_video = {
    "title": "faw",
    "categoryId": 1,
    "comments_disabled": 0,

    "publish_hour": 0,
    "publish_day_of_week": 6,
    "is_weekend": 1
}

single_video_df = pd.DataFrame([single_video])
trend_probability = model_pipeline.predict_proba(single_video_df)[0, 1]

print(f"Trending probability: %{trend_probability * 100:.2f}")

Trending probability: %73.20




## Not:
Ayrıca Modelin testi için *`GUI.py`* arayüzü kullanılabilir.