In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the dataset; the path is relative, so the CSV must be in the working directory
df = pd.read_csv('hwdfnew.csv')

In [4]:
# Show the column labels of the loaded frame
df.columns

Index(['trending_date', 'category_id', 'views', 'likes', 'dislikes',
       'comment_count', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'no_tags', 'desc_len', 'len_title',
       'log_category_id', 'log_views', 'log_likes', 'log_dislikes',
       'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title'],
      dtype='object')

## Pembuatan Fungsi

### Function: Splitting

In [5]:
def splitting_a(X, y):
    """Split into train/test sets first, then standardize with training statistics.

    The scaler is fitted on the training rows only; that fitted scaler is then
    applied to both the training rows and the test rows.

    Parameters
    ----------
    X : feature table that contains every column named in ``scaled_cols``
        (column-label indexing is used, so presumably a pandas DataFrame).
    y : target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a 70/30 split made with
        ``random_state=42``; the listed columns are standardized.
    """
    # Features to standardize
    scaled_cols = ['log_category_id', 'log_dislikes', 'log_comment_count',
                   'log_no_tags', 'log_desc_len', 'log_len_title']

    # Train-test split
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # Learn mean/scale from the training rows only
    scaler = StandardScaler().fit(train_X[scaled_cols])

    # Apply the same fitted scaler to both partitions
    for part in (train_X, test_X):
        part[scaled_cols] = scaler.transform(part[scaled_cols])

    return train_X, test_X, train_y, test_y

In [6]:
def splitting_b(X, y):
    """Standardize the whole feature table first, then split into train/test.

    The scaler is fitted on ALL rows before the split, so the test rows
    contribute to the scaling statistics. This is the "standardize before
    split" variant used by Cases 3 and 4 and is kept on purpose so it can be
    compared with ``splitting_a``.

    Parameters
    ----------
    X : feature table that contains every column named in
        ``columns_to_standardize`` (column-label indexing is used, so
        presumably a pandas DataFrame). It is not modified.
    y : target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a 70/30 split made with
        ``random_state=42``.
    """
    # Features to standardize
    columns_to_standardize = ['log_category_id', 'log_dislikes', 'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title']

    # Work on a copy: assigning into the argument would overwrite the caller's
    # frame (or hit a chained-assignment warning when X is a slice), which
    # makes re-running the calling cell depend on hidden state.
    X = X.copy()

    # Standardize X using statistics from the full data set
    ss = StandardScaler()
    X[columns_to_standardize] = ss.fit_transform(X[columns_to_standardize])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    return X_train, X_test, y_train, y_test

### Function: Modelling

In [7]:
def modelling(X_train, X_test, y_train, y_test, random_state=42):
    """Fit three regressors and report train/test MAE, RMSE and R2.

    Parameters
    ----------
    X_train, X_test : feature tables for fitting and evaluation.
    y_train, y_test : matching target values.
    random_state : seed passed to the Random Forest and XGBoost models so
        repeated runs give the same scores. Defaults to 42.

    Returns
    -------
    dict
        ``{model_name: {'MAE Train', 'MAE Test', 'RMSE Train', 'RMSE Test',
        'R2 Train', 'R2 Test'}}`` with every score rounded to 4 decimals.
    """
    # Model initialisation
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'XG Boosting': xgb.XGBRegressor(random_state=random_state)
    }

    def _scores(y_true, y_pred):
        """Return (MAE, RMSE, R2) as plain floats rounded to 4 decimals."""
        mae = mean_absolute_error(y_true, y_pred)
        # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which
        # newer scikit-learn releases deprecate and then drop.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        # round(float(...)) works whether the metric returned a NumPy scalar
        # or a built-in float (the latter has no `.round` method).
        return tuple(round(float(value), 4) for value in (mae, rmse, r2))

    # Train and evaluate each model
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)

        mae_train, rmse_train, r2_train = _scores(y_train, model.predict(X_train))
        mae_test, rmse_test, r2_test = _scores(y_test, model.predict(X_test))

        # Collect the scores
        results[model_name] = {
            'MAE Train': mae_train,
            'MAE Test': mae_test,
            'RMSE Train': rmse_train,
            'RMSE Test': rmse_test,
            'R2 Train': r2_train,
            'R2 Test': r2_test
        }

    return results

### Function: Feature Importance

In [26]:
def get_feature(X_train, X_test, y_train, y_test, model_name, random_state=42):
    """Fit a tree-based regressor and print its feature importances.

    Parameters
    ----------
    X_train : training features; must expose ``.columns`` for the feature names.
    X_test, y_test : accepted for a uniform call signature; not used here.
    y_train : training target.
    model_name : ``'RF'`` for Random Forest or ``'XGB'`` for XGBoost.
    random_state : seed for the model so the importance ranking is
        reproducible between runs. Defaults to 42.

    Raises
    ------
    ValueError
        If ``model_name`` is neither ``'RF'`` nor ``'XGB'``.

    The importance table is printed, sorted from most to least important;
    nothing is returned.
    """
    # Model initialisation
    if model_name == 'RF':
        model = RandomForestRegressor(random_state=random_state)
    elif model_name == 'XGB':
        model = xgb.XGBRegressor(random_state=random_state)
    else:
        raise ValueError("Model name not supported.")

    # Train the model
    model.fit(X_train, y_train)

    # Pair each training column with its importance score
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    })

    # Sort so the most important feature comes first
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the table
    print(feature_importance_df)

### Function: Hyperparameters Tuning

In [9]:
def hypertune(X_train, X_test, y_train, y_test, model_name, hyperparameters=None,
              cv=5, n_jobs=None, random_state=42):
    """Grid-search a regressor, then evaluate the best estimator on the test set.

    Parameters
    ----------
    X_train, y_train : data used for the cross-validated grid search.
    X_test, y_test : held-out data used for the final evaluation.
    model_name : ``'RF'`` for Random Forest or ``'XGB'`` for XGBoost.
    hyperparameters : parameter grid for ``GridSearchCV``; ``None`` means an
        empty grid, i.e. only the model's default settings are evaluated.
    cv : number of cross-validation folds. Defaults to 5.
    n_jobs : parallel jobs for the grid search (``None`` keeps the library
        default). Pass ``-1`` to use all cores on large grids.
    random_state : seed for the model so the selected parameters are
        reproducible between runs. Defaults to 42.

    Returns
    -------
    tuple
        ``(best_model, best_params, test_score, mae, rmse, r2)``.
        ``test_score`` is the estimator's own ``score`` on the test set
        (presumably R2 for these regressors, so it should match ``r2``).

    Raises
    ------
    ValueError
        If ``model_name`` is neither ``'RF'`` nor ``'XGB'``.
    """
    # Model initialisation
    if model_name == 'RF':
        model = RandomForestRegressor(random_state=random_state)
    elif model_name == 'XGB':
        model = xgb.XGBRegressor(random_state=random_state)
    else:
        raise ValueError("Model name not supported.")

    # Grid to tune (fall back to an empty grid when none is given)
    if hyperparameters is None:
        hyperparameters = {}

    # Hyperparameter tuning with grid search
    grid_search = GridSearchCV(model, hyperparameters, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # Best model after tuning
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the test data
    test_score = best_model.score(X_test, y_test)

    # Predict the target on the test data
    y_pred = best_model.predict(X_test)

    # MAE, RMSE and R-squared on the test data.
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
    # scikit-learn releases deprecate and then drop.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = r2_score(y_test, y_pred)

    return best_model, grid_search.best_params_, test_score, mae, rmse, r2

# Case 1:

- Normalisasi dilakukan setelah split data
- Tanpa mengikutsertakan fitur bernilai boolean

In [10]:
# Working frame for Case 1: the target (`log_views`), three flag columns and six log_* features.
# The flag columns are selected here but are not part of X in this case.
dfselect1 = df[['log_category_id', 'log_views', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 
                'log_dislikes', 'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title']]

### X and y Splitting

In [11]:
# Case 1 features: the six log_* columns only (flag columns left out)
X = dfselect1.loc[:, ['log_category_id', 'log_dislikes', 'log_comment_count',
                      'log_no_tags', 'log_desc_len', 'log_len_title']]

# Target column
y = dfselect1.loc[:, 'log_views']

In [12]:
# Call splitting_a: split first, then standardize with a scaler fitted on the training part

X_train, X_test, y_train, y_test = splitting_a(X, y)

### Model Training & Evaluation

In [13]:
# Call modelling: fits the three regressors and returns their train/test scores

modelling(X_train, X_test, y_train, y_test)

{'Linear Regression': {'MAE Train': 0.5978,
  'MAE Test': 0.6039,
  'RMSE Train': 0.8011,
  'RMSE Test': 0.8159,
  'R2 Train': 0.6644,
  'R2 Test': 0.6585},
 'Random Forest': {'MAE Train': 0.1413,
  'MAE Test': 0.3781,
  'RMSE Train': 0.1954,
  'RMSE Test': 0.5151,
  'R2 Train': 0.98,
  'R2 Test': 0.8639},
 'XG Boosting': {'MAE Train': 0.3065,
  'MAE Test': 0.3793,
  'RMSE Train': 0.4108,
  'RMSE Test': 0.5058,
  'R2 Train': 0.9117,
  'R2 Test': 0.8687}}

### Feature Importance

In [74]:
# Call get_feature: prints the Random Forest feature importances
# NOTE(review): the saved output lists `category_id`, but X here contains
# `log_category_id` - the output looks stale; re-run this cell to confirm.

get_feature(X_train, X_test, y_train, y_test, 'RF')

             Feature  Importance
1       log_dislikes    0.774577
2  log_comment_count    0.060647
4       log_desc_len    0.049552
0        category_id    0.040840
5      log_len_title    0.038669
3        log_no_tags    0.035716


In [None]:
# Same feature-importance report, using the XGBoost regressor
get_feature(X_train, X_test, y_train, y_test, 'XGB')

### Hyperparameters Tuning

In [78]:
# Search grid for the Random Forest tuning runs (reused by every case below).
# 3 * 4 * 3 * 3 * 3 = 324 parameter combinations.
param = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (use all features) stands in for the legacy 'auto' option, which
    # recent scikit-learn releases no longer accept for forest regressors.
    'max_features': [1.0, 'sqrt', 'log2']
}

In [None]:
# Call hypertune: grid search over `param` for the Random Forest.
# The grid has 324 combinations and hypertune uses 5-fold CV, so this is a long-running cell.
hypertune(X_train, X_test, y_train, y_test, 'RF', param)

---

# Case 2

- Normalisasi dilakukan setelah split data
- Dengan mengikutsertakan fitur bernilai boolean

In [14]:
# Working frame for Case 2: same column selection as dfselect1
# (target `log_views`, three flag columns, six log_* features).
dfselect2 = df[['log_category_id', 'log_views', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 
               'log_dislikes', 'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title']]

### X and y Splitting

In [15]:
# Case 2 features: every selected column except the target (flag columns included)
X = dfselect2.drop('log_views', axis=1)

# Target column
y = dfselect2.loc[:, 'log_views']

In [16]:
# Call splitting_a: split first, then standardize the log_* columns;
# the flag columns are not in its standardization list and pass through unchanged

X_train, X_test, y_train, y_test = splitting_a(X, y)

### Model Training & Evaluation

In [17]:
# Call modelling: fits the three regressors and returns their train/test scores

modelling(X_train, X_test, y_train, y_test)

{'Linear Regression': {'MAE Train': 0.5287,
  'MAE Test': 0.5296,
  'RMSE Train': 0.6729,
  'RMSE Test': 0.677,
  'R2 Train': 0.7632,
  'R2 Test': 0.7649},
 'Random Forest': {'MAE Train': 0.1414,
  'MAE Test': 0.3786,
  'RMSE Train': 0.1952,
  'RMSE Test': 0.5155,
  'R2 Train': 0.9801,
  'R2 Test': 0.8637},
 'XG Boosting': {'MAE Train': 0.3054,
  'MAE Test': 0.3787,
  'RMSE Train': 0.4078,
  'RMSE Test': 0.5055,
  'R2 Train': 0.913,
  'R2 Test': 0.8689}}

### Feature Importance

In [None]:
# Call get_feature: prints the Random Forest feature importances for Case 2

get_feature(X_train, X_test, y_train, y_test, 'RF')

In [None]:
# Same feature-importance report, using the XGBoost regressor
get_feature(X_train, X_test, y_train, y_test, 'XGB')

### Hyperparameters Tuning

In [None]:
# Call hypertune for the Random Forest.
# `param` is the grid defined in the Case 1 section, so that cell must have been run first.

hypertune(X_train, X_test, y_train, y_test, 'RF', param)

---

# Case 3

- Normalisasi dilakukan sebelum split data
- Tanpa mengikutsertakan fitur bernilai boolean

In [18]:
# Working frame for Case 3: same column selection as the earlier cases.
# The flag columns are selected here but are not part of X in this case.
dfselect3 = df[['log_category_id', 'log_views', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 
                'log_dislikes', 'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title']]

### X and y Splitting

In [19]:
# Case 3 features: the six log_* columns only (flag columns left out)
X = dfselect3.loc[:, ['log_category_id', 'log_dislikes', 'log_comment_count',
                      'log_no_tags', 'log_desc_len', 'log_len_title']]

# Target column
y = dfselect3.loc[:, 'log_views']

In [20]:
# Call splitting_b: standardize the whole of X first, then split into train/test
X_train, X_test, y_train, y_test = splitting_b(X, y)

### Model Training & Evaluation

In [21]:
# Call modelling: fits the three regressors and returns their train/test scores

modelling(X_train, X_test, y_train, y_test)

{'Linear Regression': {'MAE Train': 0.5978,
  'MAE Test': 0.6039,
  'RMSE Train': 0.8011,
  'RMSE Test': 0.8159,
  'R2 Train': 0.6644,
  'R2 Test': 0.6585},
 'Random Forest': {'MAE Train': 0.1418,
  'MAE Test': 0.3776,
  'RMSE Train': 0.196,
  'RMSE Test': 0.515,
  'R2 Train': 0.9799,
  'R2 Test': 0.8639},
 'XG Boosting': {'MAE Train': 0.3065,
  'MAE Test': 0.3793,
  'RMSE Train': 0.4108,
  'RMSE Test': 0.5058,
  'R2 Train': 0.9117,
  'R2 Test': 0.8687}}

### Feature Importance

In [None]:
# Call get_feature: prints the Random Forest feature importances for Case 3

get_feature(X_train, X_test, y_train, y_test, 'RF')

In [None]:
# Same feature-importance report, using the XGBoost regressor
get_feature(X_train, X_test, y_train, y_test, 'XGB')

### Hyperparameters Tuning

In [None]:
# Call hypertune for the Random Forest.
# `param` is the grid defined in the Case 1 section, so that cell must have been run first.

hypertune(X_train, X_test, y_train, y_test, 'RF', param)

---

# Case 4
- Normalisasi dilakukan sebelum split data
- Dengan mengikutsertakan fitur bernilai boolean

In [22]:
# Working frame for Case 4: same column selection as the earlier cases
# (target `log_views`, three flag columns, six log_* features).
dfselect4 = df[['log_category_id', 'log_views', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 
                'log_dislikes', 'log_comment_count', 'log_no_tags', 'log_desc_len', 'log_len_title']]

### X and y Splitting

In [23]:
# Case 4 features: every selected column except the target (flag columns included)
X = dfselect4.drop('log_views', axis=1)

# Target column
y = dfselect4.loc[:, 'log_views']

In [24]:
# Call splitting_b: standardize the log_* columns of the whole X first, then split into train/test
X_train, X_test, y_train, y_test = splitting_b(X, y)

### Model Training & Evaluation

In [25]:
# Call modelling: fits the three regressors and returns their train/test scores
modelling(X_train, X_test, y_train, y_test)

{'Linear Regression': {'MAE Train': 0.5287,
  'MAE Test': 0.5296,
  'RMSE Train': 0.6729,
  'RMSE Test': 0.677,
  'R2 Train': 0.7632,
  'R2 Test': 0.7649},
 'Random Forest': {'MAE Train': 0.1417,
  'MAE Test': 0.3785,
  'RMSE Train': 0.1954,
  'RMSE Test': 0.5156,
  'R2 Train': 0.98,
  'R2 Test': 0.8636},
 'XG Boosting': {'MAE Train': 0.3054,
  'MAE Test': 0.3787,
  'RMSE Train': 0.4078,
  'RMSE Test': 0.5055,
  'R2 Train': 0.913,
  'R2 Test': 0.8689}}

### Feature Importance

In [None]:
# Call get_feature: prints the Random Forest feature importances for Case 4

get_feature(X_train, X_test, y_train, y_test, 'RF')

In [None]:
# Same feature-importance report, using the XGBoost regressor
get_feature(X_train, X_test, y_train, y_test, 'XGB')

### Hyperparameters Tuning

In [None]:
# Call hypertune for the Random Forest.
# `param` is the grid defined in the Case 1 section, so that cell must have been run first.

hypertune(X_train, X_test, y_train, y_test, 'RF', param)