In [27]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Location of the training CSV. The default is the original author's local
# Windows path; set the TRAIN_CSV_PATH environment variable to run the notebook
# on another machine without editing code. An empty value falls back to the default.
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH") or (
    r"D:\BigData And DataMining\Data Mining\TimeSeris\BTTH1\train.csv"
)

# Raw training data; later cells read from `df` and never modify it in place.
df = pd.read_csv(TRAIN_CSV_PATH)


In [28]:
# Step 2: Hand-written Kalman filter for Listening_Time_minutes
def kalman_filter_1d(measurements, process_var, measurement_var, initial_var=1.0):
    """Run a scalar, constant-state Kalman filter over a 1-D sequence.

    The prediction step carries the previous estimate forward unchanged and
    inflates its variance by ``process_var``; the update step blends the
    prediction with the new measurement using the Kalman gain
    ``Pminus / (Pminus + measurement_var)``.

    Parameters
    ----------
    measurements : array-like of float, 1-D
        Observed values, in the order they should be filtered.
    process_var : float
        Variance added to the estimate at every prediction step (Q).
    measurement_var : float
        Variance of the measurement noise (R).
    initial_var : float, default 1.0
        Variance assigned to the first estimate, which is seeded with the
        first measurement.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xhat, P, xhatminus, Pminus, K)``: posterior estimate, posterior
        variance, prior estimate, prior variance and gain, each with the same
        length as ``measurements``. Element 0 of the prior arrays and of the
        gain stays 0 because no prediction step runs for the first sample.
        All arrays are empty when ``measurements`` is empty.
    """
    # Work on a plain float array: scalar indexing into a pandas Series inside
    # the loop is a label lookup and is far slower than ndarray indexing.
    z = np.asarray(measurements, dtype=float)
    size = z.shape[0]

    xhat = np.zeros(size)
    P = np.zeros(size)
    xhatminus = np.zeros(size)
    Pminus = np.zeros(size)
    K = np.zeros(size)

    if size == 0:
        # Nothing to filter; return empty arrays instead of failing on z[0].
        return xhat, P, xhatminus, Pminus, K

    # Seed the filter with the first measurement.
    xhat[0] = z[0]
    P[0] = initial_var

    for k in range(1, size):
        # Predict: the state is modelled as constant, only uncertainty grows.
        xhatminus[k] = xhat[k - 1]
        Pminus[k] = P[k - 1] + process_var

        # Update: move the prediction toward the measurement by the gain.
        K[k] = Pminus[k] / (Pminus[k] + measurement_var)
        xhat[k] = xhatminus[k] + K[k] * (z[k] - xhatminus[k])
        P[k] = (1 - K[k]) * Pminus[k]

    return xhat, P, xhatminus, Pminus, K


# Measurements: non-missing listening times, re-indexed 0..n-1.
listening_time = df['Listening_Time_minutes'].dropna().reset_index(drop=True)
n = len(listening_time)
Q = 1e-5  # process variance
R = 1     # measurement variance

# NOTE(review): the filter runs over the rows in file order as if they were one
# sequence — confirm the rows really are time-ordered. With Q << R the gain
# settles near sqrt(Q / R) (about 0.003), so the output is very heavily smoothed.
xhat, P, xhatminus, Pminus, K = kalman_filter_1d(listening_time, Q, R)


In [29]:
# Step 3: Build a new DataFrame that carries the filtered series
# `listening_time` had its index reset after dropna(), so its index (0..n-1)
# does not identify the original rows. Select the rows with a non-missing
# measurement by mask instead, so each Kalman estimate lands on the row its
# measurement came from even when some listening times are missing.
has_listening_time = df['Listening_Time_minutes'].notna()
if len(xhat) != int(has_listening_time.sum()):
    raise ValueError(
        "xhat length does not match the number of non-missing "
        "Listening_Time_minutes rows; re-run the Kalman filter cell."
    )
filtered_df = df.loc[has_listening_time].copy()
# xhat is in the same row order as the masked frame, so positional assignment aligns.
filtered_df['Kalman_Listening_Time'] = xhat

# Step 4: Define the input features and the output target
features = ['Episode_Length_minutes', 'Host_Popularity_percentage',
            'Guest_Popularity_percentage', 'Number_of_Ads',
            'Publication_Time', 'Episode_Sentiment']
target = 'Kalman_Listening_Time'

# Drop rows with a missing value in any feature column
filtered_df = filtered_df.dropna(subset=features)

X = filtered_df[features]
y = filtered_df[target]

In [30]:
# Step 5: Preprocess the data
categorical_features = ['Publication_Time', 'Episode_Sentiment']
# Keep the order from `features`. A set difference would order the columns by
# string hash, which is randomized per Python process, so the column order (and
# with it the seeded Random Forest / MLP results) could change between kernel
# restarts.
numeric_features = [col for col in features if col not in categorical_features]

# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded, and categories unseen during fit are ignored at transform time.
# NOTE(review): the numeric columns reach the MLP unscaled; scikit-learn's
# neural-network guide recommends scaling inputs — consider a StandardScaler.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Step 6: Split into train and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Instantiate the models to compare (they are fitted in the evaluation cell)
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Neural Network': MLPRegressor(hidden_layer_sizes=(50, 30), max_iter=500, random_state=42)
}


In [31]:
# Step 8: Evaluate the models
# Each model is wrapped with the shared preprocessor, fitted on the training
# split, and scored on the held-out split with RMSE and R².
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

    # fit() returns the pipeline itself, so prediction can be chained onto it.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    rmse, r2 = np.sqrt(mean_squared_error(y_test, y_pred)), r2_score(y_test, y_pred)

    # One print call with a newline separator emits the same three lines
    # followed by a blank line.
    print(
        f"🔹 {name}",
        f"   RMSE: {rmse:.2f}",
        f"   R² Score: {r2:.3f}\n",
        sep="\n",
    )


🔹 Linear Regression
   RMSE: 1.10
   R² Score: 0.005

🔹 Random Forest
   RMSE: 1.15
   R² Score: -0.072

🔹 Neural Network
   RMSE: 1.19
   R² Score: -0.168

