# Data prediction

## Imports

In [1]:
from pandas import DataFrame
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

## Data preprocessing

In [2]:
df = pd.read_csv('/data/DEvideos.csv',
    low_memory=False)

df['trending_date'] = df.apply(lambda row: datetime.strptime(row['trending_date'], '%y.%d.%m'), axis=1)
df['publish_time'] = df.apply(lambda row: datetime.strptime(row['publish_time'], '%Y-%m-%dT%H:%M:%S.000Z'), axis=1)
df['days_until_trending'] = df.apply(lambda row: ((row['trending_date'] - row['publish_time']).days + 1), axis=1)

df['tags_count'] = df.apply(lambda row: len(row['tags'].split('|')), axis=1)
df['publish_hour'] = df['publish_time'].map(lambda x: x.hour)
df['publish_month'] = df['publish_time'].map(lambda x: x.month)
df['publish_year'] = df['publish_time'].map(lambda x: x.year)
df['publish_day_of_month'] = df['publish_time'].map(lambda x: x.day)
df['publish_weekday'] = df['publish_time'].map(lambda x: x.weekday()) # 0: Monday, 6: Sunday
df['trending_weekday'] = df['trending_date'].map(lambda x: x.weekday()) # 0: Monday, 6: Sunday

df['like_dislike_ratio'] = df.apply(lambda row: row['likes'] / (row['dislikes'] + 1), axis=1)
df['like_view_ratio'] = df.apply(lambda row: row['likes'] / (row['views'] + 1), axis=1)

df['ratings'] = df['likes'] + df['dislikes']
df['likes_per_rating'] = df['likes'] / df['ratings']
df['ratings_per_view'] = df['ratings'] / df['views']
df['comments_per_view'] = df['comment_count'] / df['views']

# Using int instead of cat
def assign_target_category(row):
    if row['days_until_trending'] == 0: 
        return 0
    elif row['days_until_trending'] == 1:
        return 1
    elif row['days_until_trending'] == 2:
        return 2
    elif row['days_until_trending'] <= 5:
        return 3
    else:
        return 6

df['target_category'] = df.apply(assign_target_category, axis=1)

N = len(df)
dropColumns = ['video_id', 'title', 'tags', 'thumbnail_link', 'description']
for column in df.columns:
    numberOfUniqueValues = df[column].nunique()
    if numberOfUniqueValues < 2:
        dropColumns.append(column)
    elif df[column].dtype == 'object' and numberOfUniqueValues > N * 0.9:
        dropColumns.append(column)
    elif df[column].isna().sum() / N > 0.95:
        dropColumns.append(column)
        
df.drop(columns=dropColumns, inplace=True)

df['channel_title'] = df['channel_title'].astype('category')

## Encode features

In [34]:
x_df = DataFrame(index=df.index)
features = ['views', 'comment_count', 'ratings']
for feature in features:
    feature_data = df[feature]
    if df[feature].dtype.name == 'category':
        x_label_encoder = preprocessing.LabelEncoder()
        x_label_encoder.fit(feature_data.astype(str))
        x_df[feature] = x_label_encoder.transform(feature_data)
    elif df[feature].dtype.name == 'datetime64[ns]':
        x_df[feature] = feature_data.to_seconds()
    elif df[feature].dtype.name == 'bool':
        x_df[feature] = int(feature_data)
    else:
        x_df[feature] = feature_data

x = np.reshape(x_df, (-1, len(x_df.columns)))

## Encode prediction target

In [35]:
target = df['target_category'].astype(str)
y_label_encoder = preprocessing.LabelEncoder()
y_label_encoder.fit(target)
y = y_label_encoder.transform(target)

## Create train and test datasubset

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.40)

## Declare classifiers

```
classifiers[:][0]: Name
classifiers[:][1]: Classifier object
classifiers[:][2]: Prediction
```

In [37]:
classifiers = []

classifiers.append(['Decision Tree', DecisionTreeClassifier()])
classifiers.append(['K Nearest Neighbor', KNeighborsClassifier()])
classifiers.append(['Random Forest', RandomForestClassifier()])
classifiers.append(['XG Boost', XGBClassifier(use_label_encoder=False, verbosity=0)])

classifiers.append(['K Nearest Neighbor (optimised)', KNeighborsClassifier(n_neighbors = 1000)])
classifiers.append(['Random Forest (optimised)', RandomForestClassifier(n_estimators=10, max_depth=2, min_samples_split=5)])
classifiers.append(['XG Boost (optimised)', XGBClassifier(n_estimators=50, max_depth=2, use_label_encoder=False, verbosity=0)])

## Train model

In [38]:
for classifier in classifiers:
    print('\nTraining - ' + classifier[0])
    %time classifier[1].fit(x_train, y_train)


Training - Decision Tree
CPU times: user 107 ms, sys: 9.28 ms, total: 116 ms
Wall time: 115 ms

Training - K Nearest Neighbor
CPU times: user 18 ms, sys: 0 ns, total: 18 ms
Wall time: 17.6 ms

Training - Random Forest
CPU times: user 2.91 s, sys: 0 ns, total: 2.91 s
Wall time: 2.91 s

Training - XG Boost
CPU times: user 1min 8s, sys: 3.35 s, total: 1min 12s
Wall time: 4.52 s

Training - K Nearest Neighbor (optimised)
CPU times: user 360 ms, sys: 47.5 ms, total: 407 ms
Wall time: 25.4 ms

Training - Random Forest (optimised)
CPU times: user 750 ms, sys: 27.3 ms, total: 777 ms
Wall time: 73.3 ms

Training - XG Boost (optimised)
CPU times: user 11.2 s, sys: 542 ms, total: 11.7 s
Wall time: 735 ms


## Predict test data

In [39]:
for classifier in classifiers:
    print('\nPredicting - ' + classifier[0])
    %time classifier.append(classifier[1].predict(x_test))


Predicting - Decision Tree
CPU times: user 70.6 ms, sys: 3.83 ms, total: 74.4 ms
Wall time: 4.73 ms

Predicting - K Nearest Neighbor
CPU times: user 2.15 s, sys: 128 ms, total: 2.27 s
Wall time: 353 ms

Predicting - Random Forest
CPU times: user 292 ms, sys: 75 µs, total: 292 ms
Wall time: 291 ms

Predicting - XG Boost
CPU times: user 408 ms, sys: 0 ns, total: 408 ms
Wall time: 27.1 ms

Predicting - K Nearest Neighbor (optimised)
CPU times: user 5.32 s, sys: 284 ms, total: 5.61 s
Wall time: 2.67 s

Predicting - Random Forest (optimised)
CPU times: user 9.32 ms, sys: 0 ns, total: 9.32 ms
Wall time: 8.89 ms

Predicting - XG Boost (optimised)
CPU times: user 80.8 ms, sys: 0 ns, total: 80.8 ms
Wall time: 5.83 ms


## Calculate accuracy

In [40]:
for classifier in classifiers:
    print('\nAccuracy - ' + classifier[0])
    print(accuracy_score(y_test, classifier[2]))


Accuracy - Decision Tree
0.5305460333006856

Accuracy - K Nearest Neighbor
0.6028403525954946

Accuracy - Random Forest
0.6184500489715965

Accuracy - XG Boost
0.6393241919686582

Accuracy - K Nearest Neighbor (optimised)
0.632162095984329

Accuracy - Random Forest (optimised)
0.6264691478942214

Accuracy - XG Boost (optimised)
0.6382223310479922


## Accuracy of baseline classification

In [41]:
len(df[df['target_category'] == 1]) / len(df)

0.6237512242899118