# Data prediction

## Imports

In [1]:
from pandas import DataFrame
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Data preprocessing

In [2]:
# Load the raw trending-video export. low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk.
df = pd.read_csv('../data/DEvideos.csv',
    low_memory=False)

# Parse both timestamp columns column-wise instead of calling strptime once per
# row. trending_date is written as yy.dd.mm (date only, so it parses to
# midnight); publish_time carries a literal ".000Z" suffix that is matched as
# plain text, so the parsed values are timezone-naive.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
df['publish_time'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%dT%H:%M:%S.000Z')

# Whole days from publishing to trending, plus one. `.dt.days` floors, so a
# video that trends on its publish date (midnight minus a later time of day
# gives -1 day) ends up with 0.
df['days_until_trending'] = (df['trending_date'] - df['publish_time']).dt.days + 1

# Number of '|'-separated entries in the raw tag string.
df['tags_count'] = df['tags'].str.split('|').str.len()

# Calendar components of the publish timestamp, taken through the vectorised
# .dt accessor instead of one Python lambda call per row.
df['publish_hour'] = df['publish_time'].dt.hour
df['publish_month'] = df['publish_time'].dt.month
df['publish_year'] = df['publish_time'].dt.year
df['publish_day_of_month'] = df['publish_time'].dt.day
df['publish_weekday'] = df['publish_time'].dt.weekday # 0: Monday, 6: Sunday

# Smoothed ratios: the +1 in the denominator avoids division by zero.
df['like_dislike_ratio'] = df['likes'] / (df['dislikes'] + 1)
df['like_view_ratio'] = df['likes'] / (df['views'] + 1)

# "ratings" is the total number of like/dislike votes.
df['ratings'] = df['likes'] + df['dislikes']

# Unsmoothed ratios: a zero denominator yields 0 instead of inf/NaN. The
# division is computed for every row and the zero-denominator rows are then
# overwritten via `where`.
df['likes_per_rating'] = (df['likes'] / df['ratings']).where(df['ratings'] != 0, 0)
df['ratings_per_view'] = (df['ratings'] / df['views']).where(df['views'] != 0, 0)
df['comments_per_view'] = (df['comment_count'] / df['views']).where(df['views'] != 0, 0)

# Target classes are encoded as plain integers rather than as a categorical dtype
def assign_target_category(row):
    """Bucket a row's ``days_until_trending`` into an integer class label.

    Values equal to 0, 1 or 2 map to themselves, any other value that is
    ``<= 5`` maps to 3, and everything else (including values for which the
    comparisons are all false, such as NaN) maps to 6.
    """
    days = row['days_until_trending']

    # Exact matches come first; each has its own class.
    for exact in (0, 1, 2):
        if days == exact:
            return exact

    # Remaining values split at 5.
    return 3 if days <= 5 else 6

# Attach the class label computed per row and store the channel name as a
# pandas categorical.
df = df.assign(
    target_category=df.apply(assign_target_category, axis=1),
    channel_title=df['channel_title'].astype('category'),
)

### Map tag factor

In [3]:
# Tag lookup table, indexed by the tag text so values can be fetched by label.
tag_df = pd.read_csv('../data/tags.csv').set_index('tag')
def calculate_tag_factor(tag_string, tag_data):
    """Return the mean ``factor`` of the distinct tags in ``tag_string``.

    ``tag_string`` is split on ``|``; each piece has surrounding double quotes
    stripped and is lower-cased, and duplicates are collapsed. Every distinct
    tag is looked up in ``tag_data['factor']`` by index label. Tags without an
    entry contribute NaN and are skipped by the mean, so the result is NaN when
    no tag is found.
    """
    factors = tag_data['factor']
    distinct_tags = {piece.strip('\"').lower() for piece in tag_string.split('|')}
    looked_up = pd.Series([factors.get(tag, np.nan) for tag in distinct_tags])
    return looked_up.mean(skipna=True)
    
# Mean tag factor per video; videos whose tags are all missing from the lookup
# table yield NaN, which is replaced by 0.
df['tag_factors'] = df['tags'].apply(lambda x: calculate_tag_factor(x, tag_df)).fillna(0)

### Remove unused columns

In [4]:
N = len(df)

# Columns removed unconditionally, followed by any column the loop flags.
dropColumns = ['video_id', 'title', 'tags', 'thumbnail_link', 'description']
for column in df.columns:
    if column in dropColumns:
        # Already scheduled for removal; avoid listing it twice.
        continue
    numberOfUniqueValues = df[column].nunique()
    if numberOfUniqueValues < 2:
        # Constant column: carries no information.
        dropColumns.append(column)
    elif df[column].dtype == 'object' and numberOfUniqueValues > N * 0.9:
        # Object column that is (almost) unique per row.
        dropColumns.append(column)
    elif df[column].isna().sum() / N > 0.95:
        # More than 95% missing values.
        dropColumns.append(column)

# errors='ignore' keeps the cell re-runnable: on a second execution the
# hard-coded columns are already gone and would otherwise raise KeyError.
df = df.drop(columns=dropColumns, errors='ignore')

## Encode features

In [5]:
# Build the numeric feature matrix. Each selected column is converted according
# to its dtype; anything not handled explicitly is copied through unchanged.
x_df = DataFrame(index=df.index)
features = ['views', 'publish_hour']
for feature in features:
    feature_data = df[feature]
    dtype_name = feature_data.dtype.name
    if dtype_name == 'category':
        # Fit and transform on the same string representation; fitting on
        # astype(str) but transforming the raw values fails on missing entries.
        x_label_encoder = preprocessing.LabelEncoder()
        x_df[feature] = x_label_encoder.fit_transform(feature_data.astype(str))
    elif dtype_name == 'datetime64[ns]':
        # Seconds since the Unix epoch (Series has no to_seconds() method).
        x_df[feature] = (feature_data - pd.Timestamp('1970-01-01')).dt.total_seconds()
    elif dtype_name == 'bool':
        # Element-wise cast; int(Series) raises for more than one element.
        x_df[feature] = feature_data.astype(int)
    else:
        x_df[feature] = feature_data

# 2-D array of shape (n_rows, n_features) for scikit-learn.
x = x_df.to_numpy()

## Encode prediction target

In [6]:
# The class labels are turned into strings and then mapped to consecutive
# integer codes by the label encoder.
y_label_encoder = preprocessing.LabelEncoder()
target = df['target_category'].astype(str)
y = y_label_encoder.fit_transform(target)

## Create train and test subsets

In [7]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.40,
    random_state=0,
)

## Declare classifiers

```
classifiers[i][0]: Name
classifiers[i][1]: Classifier object
classifiers[i][2]: Prediction (added by the "Predict test data" step)
```

In [8]:
# Each entry is a mutable list [name, estimator]; the prediction step later
# stores its result at index 2, so the entries must stay lists (not tuples).
# The tree-based scikit-learn models are seeded with the same value as the
# train/test split so that repeated runs give the same scores.
classifiers = [
    # Library defaults
    ['Decision Tree', DecisionTreeClassifier(random_state=0)],
    ['K Nearest Neighbor', KNeighborsClassifier(n_jobs=-1)],
    ['Random Forest', RandomForestClassifier(n_jobs=-1, random_state=0)],
    ['XG Boost', XGBClassifier(use_label_encoder=False, verbosity=0)],

    # Hand-picked hyperparameters
    ['K Nearest Neighbor (optimised)', KNeighborsClassifier(n_jobs=-1, algorithm='ball_tree', leaf_size=20, n_neighbors=8, p=1, weights='uniform')],
    ['Random Forest (optimised)', RandomForestClassifier(n_jobs=-1, n_estimators=10, max_depth=2, min_samples_split=5, random_state=0)],
    ['XG Boost (optimised)', XGBClassifier(use_label_encoder=False, verbosity=0, n_estimators=50, max_depth=2)],
]

## Train model

In [9]:
for classifier in classifiers:
    print('\nTraining - ' + classifier[0])
    %time classifier[1].fit(x_train, y_train)


Training - Decision Tree
Wall time: 105 ms

Training - K Nearest Neighbor
Wall time: 150 ms

Training - Random Forest
Wall time: 879 ms

Training - XG Boost
Wall time: 8.03 s

Training - K Nearest Neighbor (optimised)
Wall time: 38.3 ms

Training - Random Forest (optimised)
Wall time: 85.7 ms

Training - XG Boost (optimised)
Wall time: 1.56 s


## Predict test data

In [10]:
for classifier in classifiers:
    print('\nPredicting - ' + classifier[0])
    %time classifier.append(classifier[1].predict(x_test))


Predicting - Decision Tree
Wall time: 11.6 ms

Predicting - K Nearest Neighbor
Wall time: 471 ms

Predicting - Random Forest
Wall time: 115 ms

Predicting - XG Boost
Wall time: 34 ms

Predicting - K Nearest Neighbor (optimised)
Wall time: 491 ms

Predicting - Random Forest (optimised)
Wall time: 17 ms

Predicting - XG Boost (optimised)
Wall time: 10 ms


## Calculate accuracy

In [11]:
# Share of test rows each estimator labelled correctly.
for classifier in classifiers:
    name, prediction = classifier[0], classifier[2]
    print(f'\nAccuracy - {name}')
    print(accuracy_score(y_test, prediction))


Accuracy - Decision Tree
0.5325661116552399

Accuracy - K Nearest Neighbor
0.589128305582762

Accuracy - Random Forest
0.5377081292850147

Accuracy - XG Boost
0.6394466209598433

Accuracy - K Nearest Neighbor (optimised)
0.6012487757100882

Accuracy - Random Forest (optimised)
0.6275097943192948

Accuracy - XG Boost (optimised)
0.6480166503428012


## Accuracy of baseline classification

In [12]:
# Fraction of all rows whose target class is 1, i.e. the accuracy a classifier
# would reach on the full data set by always predicting class 1.
class_one_rows = df.loc[df['target_category'] == 1]
len(class_one_rows) / len(df)

0.6237512242899118