In [1]:
import re

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#Machine Learning imports 
from scipy.special import inv_boxcox
from scipy.stats import boxcox
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split



In [2]:
# Load the training and test tables from CSV files in the working directory,
# then preview the first rows of the training table.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("ad_org_train.csv", "ad_org_test.csv")
)
train.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,category
0,VID_18655,40,1031602,8523,363,1095,2016-09-14,PT7M37S,F
1,VID_14135,2,1707,56,2,6,2016-10-01,PT9M30S,D
2,VID_2187,1,2023,25,0,2,2016-07-02,PT2M16S,C
3,VID_23096,6,620860,777,161,153,2016-07-27,PT4M22S,H
4,VID_10175,1,666,1,0,0,2016-06-29,PT31S,D


In [3]:
#Coercing the count columns to numeric and removing rows with null values
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
# pd.to_numeric is called on the whole column at once instead of once per
# element through Series.apply, which is the vectorised form of the same conversion.
columns = ['views', 'likes', 'dislikes', 'comment']
for column in columns:
    train[column] = pd.to_numeric(train[column], errors='coerce')
    test[column] = pd.to_numeric(test[column], errors='coerce')
    # Report how many entries per column could not be parsed (train first, then test).
    print(column, train[column].isnull().sum())
    print(column, test[column].isnull().sum())

# Drop every row that has a NaN in ANY column, not only the four coerced above.
# NOTE(review): rows dropped from `test` will have no prediction in the final
# submission - confirm this is acceptable.
train = train.dropna()
test = test.dropna()

views 2
views 1
likes 155
likes 88
dislikes 155
dislikes 88
comment 233
comment 151


In [4]:
#Changing the format of duration into seconds
# ISO 8601 duration, e.g. "PT7M37S", "PT31S", "PT1H", "P1DT2H3M4S".
# Every component is optional; at least one must be present (checked below).
_ISO_DURATION = re.compile(
    r'P(?:(?P<weeks>\d+)W)?(?:(?P<days>\d+)D)?'
    r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?'
)

# Number of seconds represented by one unit of each named regex group.
_SECONDS_PER_UNIT = {
    'weeks': 7 * 24 * 3600,
    'days': 24 * 3600,
    'hours': 3600,
    'minutes': 60,
    'seconds': 1,
}


def time_transform(data):
    """Replace the ISO 8601 duration string in ``data['duration']`` with seconds.

    Intended to be used row-wise, e.g. ``frame.apply(time_transform, axis=1)``.

    Parameters
    ----------
    data : mapping-like (e.g. a pandas Series holding one row)
        Must contain a ``'duration'`` entry such as ``'PT7M37S'``.

    Returns
    -------
    The same ``data`` object, with ``data['duration']`` overwritten by the
    total duration as an ``int`` number of seconds.

    Raises
    ------
    ValueError
        If the duration string is not a recognisable ISO 8601 duration.
    """
    duration = data['duration']
    match = _ISO_DURATION.fullmatch(duration)
    # A bare "P" / "PT" matches the pattern but carries no component at all.
    if match is None or not any(match.groupdict().values()):
        raise ValueError('Unrecognised ISO 8601 duration: %r' % (duration,))

    # Missing components are None in the match and count as zero.
    data['duration'] = sum(
        int(value) * _SECONDS_PER_UNIT[unit]
        for unit, value in match.groupdict().items()
        if value is not None
    )
    return data

# Run the row-wise duration conversion over both tables.
train, test = (frame.apply(time_transform, axis=1) for frame in (train, test))

In [5]:
#Creating dummy variables for category column
# Each table is one-hot encoded from its OWN `category` values. Joining the
# train dummies onto test (as was done before) matches rows by index label and
# therefore gives a test row the category of an unrelated training row.
dummies = pd.get_dummies(train['category'])

# Align the test dummy columns to the training ones: categories missing from
# test become all-zero columns, and categories unseen in train are dropped so
# that both tables end up with the same dummy columns in the same order.
test_dummies = pd.get_dummies(test['category']).reindex(columns=dummies.columns, fill_value=0)

train = train.join(dummies)
test = test.join(test_dummies)

In [6]:
# Remove the raw `category` column from both tables, then preview train.
train = train.drop('category', axis=1)
test = test.drop('category', axis=1)
train.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,A,B,C,D,E,F,G,H
0,VID_18655,40,1031602.0,8523.0,363.0,1095.0,2016-09-14,457,0,0,0,0,0,1,0,0
1,VID_14135,2,1707.0,56.0,2.0,6.0,2016-10-01,570,0,0,0,1,0,0,0,0
2,VID_2187,1,2023.0,25.0,0.0,2.0,2016-07-02,136,0,0,1,0,0,0,0,0
3,VID_23096,6,620860.0,777.0,161.0,153.0,2016-07-27,262,0,0,0,0,0,0,0,1
4,VID_10175,1,666.0,1.0,0.0,0.0,2016-06-29,31,0,0,0,1,0,0,0,0


In [7]:
# Discard every test row that still holds a NaN in any column.
test = test.dropna()

In [8]:
#After analysing model and exploring visual, outliers were detected. Removing these outliers
# Keep rows whose adview is below 2,000,000 and whose index label is not one
# of the three individually identified outliers.
outlier_labels = [1742, 1884, 1198]
keep_mask = (train['adview'] < 2000000) & ~train.index.isin(outlier_labels)
train = train.loc[keep_mask]

In [9]:
#Changing the data of video published into datetime dtype and
#extracting new features from the publication date
# dayofweek, year and month are kept; the original author notes that
# quarter and weekofyear "didn't work".
for frame in (train, test):
    frame['published'] = pd.to_datetime(frame['published'])
    for date_part in ('dayofweek', 'year', 'month'):
        frame[date_part] = getattr(frame['published'].dt, date_part)
    # The raw date is no longer needed once its parts are extracted.
    frame.drop(['published'], axis=1, inplace=True)

In [10]:
#The data was very skewed, therefore boxcox transformation was applied. log1p transformation was also tested
# Target: keep the fitted lambda_ so predictions can be mapped back to the
# original adview scale later.
train['adview'], lambda_ = boxcox(train['adview'])

# Features: boxcox requires strictly positive input, hence the +1 shift for
# columns that contain zeros. The lambda is fitted on the TRAINING column only
# and then reused for the test column; fitting a separate lambda on test would
# put the two tables on different scales, so the model's learned thresholds
# would not apply to the test features.
skewed_features = ['views', 'likes', 'dislikes', 'comment', 'duration']
for feature in skewed_features:
    train[feature], feature_lambda = boxcox(train[feature] + 1)
    # With lmbda given, boxcox returns only the transformed array.
    test[feature] = boxcox(test[feature] + 1, lmbda=feature_lambda)

In [11]:
#Changing the range of year variable for manipulating new features from it
# Subtract a fixed offset so the year becomes a small integer.
YEAR_OFFSET = 2005
for frame in (train, test):
    frame['year'] = frame['year'] - YEAR_OFFSET

In [12]:
#FEATURE ENGINEERING

# The same ratio (_r), product (_p / _permon) and squared features are derived
# for both tables; columns are created in a fixed order so each table gains
# them in the same sequence.
for frame in (train, test):
    frame['v_l_r'] = frame['views'] / frame['likes']
    frame['v_dur_r'] = frame['views'] / frame['duration']
    frame['l_dis_r'] = frame['likes'] / frame['dislikes']
    frame['l_dur_r'] = frame['likes'] / frame['duration']
    frame['v_dur_p'] = frame['views'] * frame['duration']
    frame['v_year_p'] = frame['views'] * frame['year']
    frame['l_year_p'] = frame['likes'] * frame['year']
    # Squared gap between likes and dislikes.
    diff = frame['likes'] - frame['dislikes']
    frame['diff_square'] = diff.pow(2)
    frame['c_dur_r'] = frame['comment'] / frame['duration']
    frame['views_square'] = frame['views'].pow(2)
    frame['views_permon'] = frame['views'] * frame['month']
    frame['comment_permon'] = frame['comment'] * frame['month']
    frame['likes_permon'] = frame['likes'] * frame['month']


In [13]:
#Replacing inf values with NaN for easier manipulation
# Divisions by zero in the ratio features produce +/-inf; turning them into NaN
# lets them be handled together with other missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0+.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [14]:
# Fill missing values in the ratio features with the column mean.
# The result is assigned back to the frame explicitly: calling
# fillna(..., inplace=True) on a column obtained by attribute access operates
# on an intermediate Series and is not guaranteed to update the DataFrame.
# The mean is taken from the TRAINING column and reused for test, so the
# imputed value is the one the model was fitted with and a test prediction
# does not depend on the other test rows.
ratio_features = ['v_l_r', 'v_dur_r', 'l_dis_r', 'l_dur_r', 'c_dur_r']
for feature in ratio_features:
    train_mean = train[feature].mean()  # NaN entries are skipped by mean()
    train[feature] = train[feature].fillna(train_mean)
    test[feature] = test[feature].fillna(train_mean)

In [15]:
# Build the prediction matrix from test without the id column; `test` itself
# keeps `vidid` because it is needed again for the submission file.
X_test = test.drop('vidid', axis=1).copy()
# The id column is not a training feature, so remove it from train in place.
del train['vidid']

In [16]:
# Separate the target from the features: pop() returns the `adview` column
# and removes it from `train` in one step.
train_y = train.pop('adview')

In [None]:
#Random Forest model with GridSearch for hyperparameter tuning. 5-fold cross validation was also carried out by using 
#GridSearchCV. The parameters have already been figured out.
# max_features=1.0 uses all features at each split, which is what 'auto' meant
# for regressors; the 'auto' spelling was removed in scikit-learn 1.3.
# random_state is fixed so that re-running the notebook reproduces the same
# forest and therefore the same scores and predictions.
rf = RandomForestRegressor(max_features=1.0, random_state=42)
# Single-value grid: the search only cross-validates the already chosen settings.
params = {
    "min_samples_leaf": [3],
    "min_samples_split": [5],
    "n_estimators": [300],
}
GS = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
GS = GS.fit(train, train_y)
print(GS.best_score_)
print(GS.best_params_)

In [18]:
# Score the fitted search object on the same data it was fitted on, so this is
# a training score rather than a held-out estimate (compare with GS.best_score_).
GS.score(train, train_y)

0.79993626393096684

In [19]:
# Predict the target for the test feature matrix. The model was fitted on the
# Box-Cox transformed adview, so these predictions are on that transformed scale.
Y_pred = GS.predict(X_test)

In [20]:
# Since the target was transformed using the boxcox transformation, the
# predictions have to be mapped back to the original adview scale using the
# lambda_ that was fitted on the training target.
# scipy.special.inv_boxcox implements this inverse, (y * lambda + 1) ** (1 / lambda),
# and also covers lambda_ == 0 (where the inverse is exp(y)), a case in which
# the hand-written formula would divide by zero.
Y_pred = inv_boxcox(Y_pred, lambda_)

In [22]:
# Pair each test video id with its predicted ad views and write the result
# to CSV without the index column.
submission_columns = dict(vid_id=test["vidid"], ad_view=Y_pred)
submission = pd.DataFrame(submission_columns)
submission.to_csv('adview.csv', index=False)