# Prediction model optimization

## Imports

In [1]:
from pandas import DataFrame
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Data preprocessing

In [2]:
# Load the raw video export and derive the timing / engagement columns used
# further down. All derived columns are computed column-wise (vectorized)
# instead of with row-by-row `apply`, which is far slower on large frames.
df = pd.read_csv('/data/DEvideos.csv',
    low_memory=False)

# trending_date is stored as two-digit-year.day.month; publish_time is an
# ISO-like stamp with a literal ".000Z" suffix. A malformed value raises
# ValueError, as strptime would.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S.000Z')
# Whole days between publication and trending, shifted by one.
df['days_until_trending'] = (df['trending_date'] - df['publish_time']).dt.days + 1

# Number of '|'-separated entries in the tags string.
df['tags_count'] = df['tags'].str.split('|').str.len()
df['publish_hour'] = df['publish_time'].dt.hour
df['publish_month'] = df['publish_time'].dt.month
df['publish_year'] = df['publish_time'].dt.year
df['publish_day_of_month'] = df['publish_time'].dt.day
df['publish_weekday'] = df['publish_time'].dt.weekday  # 0: Monday, 6: Sunday
df['trending_weekday'] = df['trending_date'].dt.weekday  # 0: Monday, 6: Sunday

# The +1 in the denominators avoids division by zero.
df['like_dislike_ratio'] = df['likes'] / (df['dislikes'] + 1)
df['like_view_ratio'] = df['likes'] / (df['views'] + 1)

df['ratings'] = df['likes'] + df['dislikes']
# NOTE(review): these three ratios have no +1 smoothing, so rows with zero
# ratings or zero views yield NaN/inf here - confirm that is acceptable.
df['likes_per_rating'] = df['likes'] / df['ratings']
df['ratings_per_view'] = df['ratings'] / df['views']
df['comments_per_view'] = df['comment_count'] / df['views']

# Using int instead of cat
def assign_target_category(row):
    """Map a row's ``days_until_trending`` value to an integer class label.

    Values equal to 0, 1 or 2 map to that same label; any other value that
    is <= 5 maps to 3; everything else maps to 6.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the key
            ``'days_until_trending'``.

    Returns:
        One of the ints 0, 1, 2, 3 or 6.
    """
    days = row['days_until_trending']
    # Exact matches first, so the returned label is always a plain int.
    for label in (0, 1, 2):
        if days == label:
            return label
    return 3 if days <= 5 else 6

# Label every row with its target bucket.
df['target_category'] = df.apply(assign_target_category, axis=1)

N = len(df)


def _is_uninformative(series):
    """Return True for a column that should be dropped.

    A column qualifies when it has fewer than two distinct values, when it
    is an object column whose distinct-value count exceeds 90% of the row
    count, or when more than 95% of its entries are missing.
    """
    distinct = series.nunique()
    return (
        distinct < 2
        or (series.dtype == 'object' and distinct > N * 0.9)
        or series.isna().sum() / N > 0.95
    )


# These columns are dropped unconditionally; the rest are added by the rule above.
dropColumns = ['video_id', 'title', 'tags', 'thumbnail_link', 'description']
dropColumns.extend(name for name in df.columns if _is_uninformative(df[name]))

df.drop(columns=dropColumns, inplace=True)

df['channel_title'] = df['channel_title'].astype('category')

## Encode features

In [3]:
# Build the model input matrix `x`: one numeric column per selected feature.
# Each feature is encoded according to its dtype:
#   category -> integer codes from a LabelEncoder
#   datetime -> seconds since the Unix epoch
#   bool     -> 0 / 1
#   other    -> passed through unchanged
x_df = DataFrame(index=df.index)
features = ['publish_weekday', 'publish_month']
for feature in features:
    feature_data = df[feature]
    feature_dtype = feature_data.dtype
    if feature_dtype.name == 'category':
        # Fit and transform on the SAME string view of the data; fitting on
        # strings but transforming the raw values fails for non-string
        # categories because the encoder never saw those labels.
        x_label_encoder = preprocessing.LabelEncoder()
        x_df[feature] = x_label_encoder.fit_transform(feature_data.astype(str))
    elif pd.api.types.is_datetime64_dtype(feature_dtype):
        # Series has no `to_seconds()`; express naive timestamps as elapsed
        # seconds since 1970-01-01 instead.
        # NOTE(review): epoch seconds is the assumed intent - confirm.
        x_df[feature] = (feature_data - pd.Timestamp(0)).dt.total_seconds()
    elif feature_dtype.name == 'bool':
        # int() on a whole Series raises; cast element-wise.
        x_df[feature] = feature_data.astype(int)
    else:
        x_df[feature] = feature_data

# x_df is already (n_rows, n_features); take its array form directly
# rather than calling np.reshape on a DataFrame.
x = x_df.to_numpy()

## Encode prediction target

In [4]:
# Encode the stringified target buckets as the class-index array `y`.
y_label_encoder = preprocessing.LabelEncoder()
target = df['target_category'].astype(str)
y = y_label_encoder.fit_transform(target)

## Calculate best prediction parameters

### XGBoost

In [5]:
%%time
# Cross-validated grid search over tree depth and ensemble size for XGBoost,
# ranked by accuracy; prints the best score and the parameters that produced it.
xgb_model = XGBClassifier(use_label_encoder=False)
optimization_dict = dict(
    max_depth=[2 ** k for k in range(1, 7)] + [None],  # 2, 4, ..., 64, None
    n_estimators=[10 * 2 ** k for k in range(6)],      # 10, 20, ..., 320
)

model = GridSearchCV(
    estimator=xgb_model,
    param_grid=optimization_dict,
    scoring='accuracy',
)
model.fit(x, y)

print(model.best_score_, model.best_params_, sep='\n')

### K Nearest Neighbors

In [6]:
%%time
# Cross-validated grid search over the neighbourhood size for k-nearest
# neighbours, ranked by accuracy; prints the best score and parameters.
knn_model = KNeighborsClassifier()
optimization_dict = dict(
    n_neighbors=[2 ** k for k in range(1, 12)],  # 2, 4, ..., 2048
)

model = GridSearchCV(
    estimator=knn_model,
    param_grid=optimization_dict,
    scoring='accuracy',
)
model.fit(x, y)

print(model.best_score_, model.best_params_, sep='\n')

0.6237512242899118
{'n_neighbors': 2048}
CPU times: user 1min 41s, sys: 1.61 s, total: 1min 43s
Wall time: 1min 43s


### Random Forest

In [7]:
%%time
# Cross-validated grid search over depth, ensemble size and minimum split
# size for a random forest, ranked by accuracy; prints the best score and
# parameters.
rf_model = RandomForestClassifier()
optimization_dict = dict(
    max_depth=[2 ** k for k in range(1, 7)] + [None],  # 2, 4, ..., 64, None
    n_estimators=[10 * 2 ** k for k in range(6)],      # 10, 20, ..., 320
    min_samples_split=[5, 10],
)

model = GridSearchCV(
    estimator=rf_model,
    param_grid=optimization_dict,
    scoring='accuracy',
)
model.fit(x, y)

print(model.best_score_, model.best_params_, sep='\n')

0.5701028403525955
{'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 10}
CPU times: user 5min 46s, sys: 667 ms, total: 5min 47s
Wall time: 5min 47s
