# Prediction model optimization

## Imports

In [10]:
from pandas import DataFrame
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Data preprocessing

In [11]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise, using 0 wherever the denominator is 0.

    Plain Series division yields inf (or NaN for 0 / 0) on a zero denominator.
    Rows whose denominator is missing (NaN) stay NaN.
    """
    return (numerator / denominator).where(denominator != 0, 0)


# Load the raw video export.
df = pd.read_csv('../data/DEvideos.csv',
    low_memory=False)

# Parse both timestamp columns column-wise instead of calling strptime once per row.
# trending_date is stored as yy.dd.mm; publish_time carries a literal ".000Z" suffix.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S.000Z')
# Whole days between publication and the trending date, plus one.
df['days_until_trending'] = (df['trending_date'] - df['publish_time']).dt.days + 1

# Number of '|'-separated entries in the tags string.
df['tags_count'] = df['tags'].map(lambda tags: len(tags.split('|')))

# Calendar components of the publication timestamp.
df['publish_hour'] = df['publish_time'].dt.hour
df['publish_month'] = df['publish_time'].dt.month
df['publish_year'] = df['publish_time'].dt.year
df['publish_day_of_month'] = df['publish_time'].dt.day
df['publish_weekday'] = df['publish_time'].dt.weekday # 0: Monday, 6: Sunday

# The +1 in the denominator keeps these two ratios finite for zero dislikes / views.
df['like_dislike_ratio'] = df['likes'] / (df['dislikes'] + 1)
df['like_view_ratio'] = df['likes'] / (df['views'] + 1)

df['ratings'] = df['likes'] + df['dislikes']
# All three ratios below fall back to 0 when their denominator is 0, so no
# inf / NaN values are produced by the division itself.
df['likes_per_rating'] = _safe_ratio(df['likes'], df['ratings'])
df['ratings_per_view'] = _safe_ratio(df['ratings'], df['views'])
df['comments_per_view'] = _safe_ratio(df['comment_count'], df['views'])

# Using int instead of cat
def assign_target_category(row):
    """Map a row's 'days_until_trending' value to an integer bucket.

    Values equal to 0, 1 or 2 map to themselves. Any other value that is
    <= 5 maps to 3, and everything else maps to 6.
    """
    days = row['days_until_trending']
    # Exact matches first: these three values are their own bucket.
    for bucket in (0, 1, 2):
        if days == bucket:
            return bucket
    return 3 if days <= 5 else 6

# Compute both derived columns first, then write them back to the frame:
# the per-row target bucket and the channel name as a pandas categorical.
target_by_row = df.apply(assign_target_category, axis=1)
channel_as_category = df['channel_title'].astype('category')
df['target_category'] = target_by_row
df['channel_title'] = channel_as_category

### Map tag factor

In [12]:
# Load the tag table and index it by the 'tag' column so rows can be looked up by tag.
tag_df = pd.read_csv('../data/tags.csv').set_index('tag')
def calculate_tag_factor(tag_string, tag_data):
    """Return the mean 'factor' of the distinct tags in a '|'-separated tag string.

    Each tag has surrounding double quotes stripped and is lower-cased before
    duplicates are removed. Tags missing from the index of ``tag_data`` count
    as NaN and are ignored by the mean, so the result is NaN when no tag is
    found at all.
    """
    factor_by_tag = tag_data['factor']

    def lookup(tag):
        # Unknown tags become NaN so the mean can skip them.
        return factor_by_tag.get(tag, np.nan)

    distinct_tags = {piece.strip('\"').lower() for piece in tag_string.split('|')}
    found_factors = pd.Series(list(distinct_tags)).apply(lookup)
    return found_factors.mean(skipna=True)
    
# Mean tag factor per video. calculate_tag_factor returns NaN when none of a
# video's tags are present in tag_df; those videos get a factor of 0.
# fillna(0) does this replacement column-wise instead of building a row object
# for every record with DataFrame.apply(axis=1).
df['tag_factors'] = df['tags'].apply(lambda x: calculate_tag_factor(x, tag_df)).fillna(0)

### Remove unnecessary columns

In [13]:
N = len(df)
# Columns that are always removed, regardless of their content.
dropColumns = ['video_id', 'title', 'tags', 'thumbnail_link', 'description']
for column in df.columns:
    if column in dropColumns:
        # Already scheduled for removal; skipping avoids duplicate labels in the list.
        continue
    numberOfUniqueValues = df[column].nunique()
    if (
        numberOfUniqueValues < 2  # at most one distinct non-null value
        or (df[column].dtype == 'object' and numberOfUniqueValues > N * 0.9)  # object column, nearly all values distinct
        or df[column].isna().sum() / N > 0.95  # more than 95% missing
    ):
        dropColumns.append(column)

# Drop only labels that are still present, so re-running this cell after the
# columns have already been removed does not raise a KeyError.
df.drop(columns=[column for column in dropColumns if column in df.columns], inplace=True)

## Encode features

In [14]:
# Build the numeric feature frame, one column per selected feature, converting
# non-numeric dtypes into numbers on the way.
x_df = DataFrame(index=df.index)
features = ['views', 'publish_hour', 'tag_factors']
for feature in features:
    feature_data = df[feature]
    dtype_name = feature_data.dtype.name
    if dtype_name == 'category':
        x_label_encoder = preprocessing.LabelEncoder()
        # Fit and transform on the same string view of the column; fitting on
        # strings but transforming the raw categorical fails for values whose
        # string form differs from the value itself (e.g. missing entries).
        x_df[feature] = x_label_encoder.fit_transform(feature_data.astype(str))
    elif dtype_name == 'datetime64[ns]':
        # Seconds elapsed since 1970-01-01 (Series has no to_seconds() method).
        x_df[feature] = (feature_data - pd.Timestamp('1970-01-01')).dt.total_seconds()
    elif dtype_name == 'bool':
        # Element-wise cast; int(series) raises for a Series with more than one element.
        x_df[feature] = feature_data.astype(int)
    else:
        x_df[feature] = feature_data

# 2-D array of shape (number of rows, number of features) for the estimators.
x = x_df.to_numpy()

## Encode prediction target

In [15]:
# Encode the target buckets (cast to strings) as consecutive integer class labels.
y_label_encoder = preprocessing.LabelEncoder()
target = df['target_category'].astype(str)
y = y_label_encoder.fit_transform(target)

## Calculate best prediction parameters

### XGBoost
estimated >24h

*Partially executed*

In [34]:
%%time
xgb_model = XGBClassifier(use_label_encoder=False, verbosity=0)

# Grid for the tree-based boosters (5,668,704 combinations).
# NOTE(review): this is still far too large to search exhaustively; narrow the
# lists or sample the grid before attempting a full run.
optimization_dict = {'n_estimators': [10, 50, 100, 200],
                     'max_depth': [3, 6, 9],
                     'learning_rate': [0.1, 0.3, 0.5],
                     'booster': ['gbtree', 'dart'],
                     'tree_method': ['exact', 'approx', 'hist'],
                     'gamma': [0, 2, 4],
                     'min_child_weight': [1, 2, 4],
                     'max_delta_step': [0, 2, 4, 8],
                     'subsample': [0.5, 0.75, 1],
                     'colsample_bytree': [0.5, 0.75, 1],
                     'colsample_bylevel': [0.5, 0.75, 1],
                     'colsample_bynode': [0.5, 0.75, 1],
                     'reg_alpha': [0, 1, 2],
                     'reg_lambda': [0, 1, 2]}

# The linear booster does not use the tree-construction settings above, so
# crossing it with them only repeats identical fits. It gets its own small
# grid (108 combinations) built from the settings it does use.
linear_optimization_dict = {key: optimization_dict[key]
                            for key in ('n_estimators', 'learning_rate', 'reg_alpha', 'reg_lambda')}
linear_optimization_dict['booster'] = ['gblinear']

# A list of grids is searched as the union of the individual grids.
model = GridSearchCV(xgb_model, [optimization_dict, linear_optimization_dict], scoring='accuracy')

model.fit(x,y)
print(model.best_score_)
print(model.best_params_)

0.6656953966699314
{'booster': 'gbtree', 'colsample_bylevel': 0.75, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 2, 'learning_rate': 0.5, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 0, 'subsample': 1, 'tree_method': 'hist'}
Wall time:       


### K Nearest Neighbors
~5min

In [7]:
%%time
knn_model = KNeighborsClassifier()

# Candidate values per hyper-parameter; every combination (162 in total) is tried.
optimization_dict = {
    'n_neighbors': [2, 5, 8],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2, 4],
}

# Exhaustive search scored by accuracy, using all available cores.
model = GridSearchCV(
    estimator=knn_model,
    param_grid=optimization_dict,
    scoring='accuracy',
    n_jobs=-1,
).fit(x, y)

print(model.best_score_, model.best_params_, sep='\n')

0.6014691478942213
{'algorithm': 'kd_tree', 'leaf_size': 20, 'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}
Wall time: 5min 47s


### Random Forest
~1.5h

In [8]:
%%time
# Fixed seed so that repeated runs of the search produce the same scores.
rf_model = RandomForestClassifier(random_state=42)
optimization_dict = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 6, 9],
    # 1 is not an accepted integer value for min_samples_split (the minimum is 2),
    # so every candidate using it only produced a failed fit.
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'min_weight_fraction_leaf': [0.0, 0.2],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 50],
    'min_impurity_decrease': [0.0, 0.2],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

# Exhaustive search over the grid, scored by accuracy, using all available cores.
model = GridSearchCV(rf_model, optimization_dict, scoring='accuracy', n_jobs=-1)

model.fit(x,y)
print(model.best_score_)
print(model.best_params_)

0.6588393731635651
{'class_weight': None, 'criterion': 'gini', 'max_depth': 9, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200}
Wall time: 1h 37min 53s


### Decision tree
~30s

In [9]:
%%time
# Stored under its own name so the random-forest estimator in rf_model is not
# overwritten by a decision tree. The fixed seed makes the random splitter and
# the feature sub-sampling repeatable between runs.
dt_model = DecisionTreeClassifier(random_state=42)
optimization_dict = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 6, 9],
    # 1 is not an accepted integer value for min_samples_split (the minimum is 2),
    # so every candidate using it only produced a failed fit.
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'min_weight_fraction_leaf': [0.0, 0.2],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 50],
    'min_impurity_decrease': [0.0, 0.2],
    'class_weight': [None, 'balanced'],
}

# Exhaustive search over the grid, scored by accuracy, using all available cores.
model = GridSearchCV(dt_model, optimization_dict, scoring='accuracy', n_jobs=-1)

model.fit(x,y)
print(model.best_score_)
print(model.best_params_)

0.6481880509304603
{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}
Wall time: 33.7 s
