# Data prediction

## Imports

In [183]:
from pandas import DataFrame
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Data preprocessing

In [184]:
df = pd.read_csv('../0_data/DEvideos.csv',
    low_memory=False)

df['trending_date'] = df.apply(lambda row: datetime.strptime(row['trending_date'], '%y.%d.%m'), axis=1)
df['publish_time'] = df.apply(lambda row: datetime.strptime(row['publish_time'], '%Y-%m-%dT%H:%M:%S.000Z'), axis=1)
df['days_until_trending'] = df.apply(lambda row: ((row['trending_date'] - row['publish_time']).days + 1), axis=1)

df['tags_count'] = df.apply(lambda row: len(row['tags'].split('|')), axis=1)
df['publish_hour'] = df['publish_time'].map(lambda x: x.hour)
df['publish_month'] = df['publish_time'].map(lambda x: x.month)
df['publish_year'] = df['publish_time'].map(lambda x: x.year)
df['publish_day_of_month'] = df['publish_time'].map(lambda x: x.day)
df['publish_weekday'] = df['publish_time'].map(lambda x: x.weekday()) # 0: Monday, 6: Sunday

df['like_dislike_ratio'] = df.apply(lambda row: row['likes'] / (row['dislikes'] + 1), axis=1)
df['like_view_ratio'] = df.apply(lambda row: row['likes'] / (row['views'] + 1), axis=1)

df['ratings'] = df['likes'] + df['dislikes']
df['likes_per_rating'] = df.apply(lambda row: 0 if row['ratings'] == 0 else row['likes'] / row['ratings'], axis=1)
df['ratings_per_view'] = df['ratings'] / df['views']
df['comments_per_view'] = df['comment_count'] / df['views']

# Using int instead of cat
def assign_target_category(row):
    if row['days_until_trending'] == 0: 
        return 0
    elif row['days_until_trending'] == 1:
        return 1
    elif row['days_until_trending'] == 2:
        return 2
    elif row['days_until_trending'] <= 5:
        return 3
    else:
        return 6

df['target_category'] = df.apply(assign_target_category, axis=1)
df['channel_title'] = df['channel_title'].astype('category')

### Map tag factor

In [185]:
tag_df = pd.read_csv('../0_data/tags.csv')
tag_df = tag_df.set_index('tag')
def calculate_tag_factor(tag_string, tag_data):
    tag_list = pd.Series(list(set(map(lambda x: x.strip('\"').lower(), tag_string.split('|')))))
    return tag_list.apply(lambda tag: tag_data['factor'].get(tag, np.nan)).mean(skipna=True)
    
df['tag_factors'] = df['tags'].apply(lambda x: calculate_tag_factor(x, tag_df))
df['tag_factors'] = df.apply(lambda row: 0 if np.isnan(row['tag_factors']) else row['tag_factors'], axis=1)

### Remove unused columns

In [186]:
N = len(df)
dropColumns = ['video_id', 'title', 'tags', 'thumbnail_link', 'description']
for column in df.columns:
    numberOfUniqueValues = df[column].nunique()
    if numberOfUniqueValues < 2:
        dropColumns.append(column)
    elif df[column].dtype == 'object' and numberOfUniqueValues > N * 0.9:
        dropColumns.append(column)
    elif df[column].isna().sum() / N > 0.95:
        dropColumns.append(column)
        
df.drop(columns=dropColumns, inplace=True)

## Encode features

In [187]:
x_df = DataFrame(index=df.index)
features = ['views', 'publish_hour', 'ratings_per_view', 'comments_per_view', 'tag_factors']
for feature in features:
    feature_data = df[feature]
    if df[feature].dtype.name == 'category':
        x_label_encoder = preprocessing.LabelEncoder()
        x_label_encoder.fit(feature_data.astype(str))
        x_df[feature] = x_label_encoder.transform(feature_data)
    elif df[feature].dtype.name == 'datetime64[ns]':
        x_df[feature] = feature_data.to_seconds()
    elif df[feature].dtype.name == 'bool':
        x_df[feature] = int(feature_data)
    else:
        x_df[feature] = feature_data

x = np.reshape(x_df, (-1, len(x_df.columns)))

## Encode prediction target

In [188]:
target = df['target_category'].astype(str)
y_label_encoder = preprocessing.LabelEncoder()
y_label_encoder.fit(target)
y = y_label_encoder.transform(target)

## Create train and test datasubset

In [189]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=0)

## Declare classifiers

```
classifiers[:][0]: Name
classifiers[:][1]: Classifier object
classifiers[:][2]: Prediction
```

In [190]:
classifiers = []

classifiers.append(['Decision Tree', DecisionTreeClassifier()])
classifiers.append(['K Nearest Neighbor', KNeighborsClassifier(n_jobs=-1)])
classifiers.append(['Random Forest', RandomForestClassifier(n_jobs=-1)])
classifiers.append(['XG Boost', XGBClassifier(use_label_encoder=False, verbosity=0)])

classifiers.append(['Decision Tree (optimized)', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=50, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, splitter='best')])
classifiers.append(['K Nearest Neighbor (optimized)', KNeighborsClassifier(n_jobs=-1, algorithm='kd_tree', leaf_size=20, n_neighbors=8, p=1, weights='uniform')])
classifiers.append(['Random Forest (optimized)', RandomForestClassifier(n_jobs=-1, class_weight=None, criterion='gini', max_depth=9, max_features='log2', max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=4, min_weight_fraction_leaf=0.0, n_estimators=200)])
classifiers.append(['XG Boost (optimized)', XGBClassifier(use_label_encoder=False, verbosity=0, booster='gbtree', colsample_bylevel=0.75, colsample_bynode=1, colsample_bytree=1, gamma=2, learning_rate=0.5, max_delta_step=0, max_depth=6, min_child_weight=1, n_estimators=100, reg_alpha=1, reg_lambda=0, subsample=1, tree_method='hist')])

## Train model

In [191]:
for classifier in classifiers:
    print('\nTraining - ' + classifier[0])
    %time classifier[1].fit(x_train, y_train)


Training - Decision Tree
Wall time: 142 ms

Training - K Nearest Neighbor
Wall time: 44 ms

Training - Random Forest
Wall time: 467 ms

Training - XG Boost
Wall time: 3.57 s

Training - Decision Tree (optimized)
Wall time: 70 ms

Training - K Nearest Neighbor (optimized)
Wall time: 41 ms

Training - Random Forest (optimized)
Wall time: 603 ms

Training - XG Boost (optimized)
Wall time: 864 ms


## Predict test data

In [192]:
for classifier in classifiers:
    print('\nPredicting - ' + classifier[0])
    %time classifier.append(classifier[1].predict(x_test))


Predicting - Decision Tree
Wall time: 5 ms

Predicting - K Nearest Neighbor
Wall time: 302 ms

Predicting - Random Forest
Wall time: 46 ms

Predicting - XG Boost
Wall time: 22 ms

Predicting - Decision Tree (optimized)
Wall time: 2 ms

Predicting - K Nearest Neighbor (optimized)
Wall time: 320 ms

Predicting - Random Forest (optimized)
Wall time: 77 ms

Predicting - XG Boost (optimized)
Wall time: 14 ms


## Calculate accuracy

In [193]:
for classifier in classifiers:
    print('\nAccuracy - ' + classifier[0])
    print(accuracy_score(y_test, classifier[2]))


Accuracy - Decision Tree
0.6003305582761999

Accuracy - K Nearest Neighbor
0.5890058765915769

Accuracy - Random Forest
0.682664054848188

Accuracy - XG Boost
0.6765426052889324

Accuracy - Decision Tree (optimized)
0.6575048971596474

Accuracy - K Nearest Neighbor (optimized)
0.6013099902056807

Accuracy - Random Forest (optimized)
0.6771547502448579

Accuracy - XG Boost (optimized)
0.6783178256611165


## Accuracy of baseline classification

In [194]:
len(df[df['target_category'] == 1]) / len(df)

0.6237512242899118