In [1]:
import re

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#Machine Learning imports 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# model_selection location and only fall back to the legacy module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics


from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

from sklearn import preprocessing
from sklearn.decomposition import PCA
from scipy.stats import skew



In [21]:
# Read the training and test CSVs from the working directory, then preview the training rows.
train, test = (pd.read_csv(csv_path) for csv_path in ("ad_org_train.csv", "ad_org_test.csv"))
train.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,category
0,VID_18655,40,1031602,8523,363,1095,2016-09-14,PT7M37S,F
1,VID_14135,2,1707,56,2,6,2016-10-01,PT9M30S,D
2,VID_2187,1,2023,25,0,2,2016-07-02,PT2M16S,C
3,VID_23096,6,620860,777,161,153,2016-07-27,PT4M22S,H
4,VID_10175,1,666,1,0,0,2016-06-29,PT31S,D


In [22]:
# These count columns contain some non-numeric entries. Coerce them to numbers,
# turning anything unparseable into NaN, and report how many NaNs each column
# ends up with (train count first, then test count).
columns = ['views', 'likes', 'dislikes', 'comment']
for column in columns:
    # pd.to_numeric on the whole Series is vectorised; going through
    # Series.apply would call it once per element for the same result.
    train[column] = pd.to_numeric(train[column], errors='coerce')
    test[column] = pd.to_numeric(test[column], errors='coerce')
    print (column, train[column].isnull().sum())
    print (column, test[column].isnull().sum())

views 2
views 1
likes 155
likes 88
dislikes 155
dislikes 88
comment 233
comment 151


In [23]:
# Discard every row that has a missing value in any column, in both frames.
train = train.dropna(axis=0, how='any')
test = test.dropna(axis=0, how='any')

In [24]:
# ISO 8601 duration as it appears in the `duration` column, e.g. "PT7M37S",
# "PT31S", "PT1H" or "P1DT2H3M4S". Every component is optional.
_DURATION_RE = re.compile(
    r'P(?:(?P<weeks>\d+)W)?(?:(?P<days>\d+)D)?'
    r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?'
)

# Number of seconds represented by one unit of each named regex group.
_SECONDS_PER_UNIT = {
    'weeks': 7 * 24 * 3600,
    'days': 24 * 3600,
    'hours': 3600,
    'minutes': 60,
    'seconds': 1,
}


def time_transform(data):
    """Replace a record's ISO 8601 ``duration`` string with its length in seconds.

    Intended for row-wise use, e.g. ``frame.apply(time_transform, axis=1)``.

    Parameters
    ----------
    data : mapping-like (a DataFrame row, a dict, ...)
        Must hold an ISO 8601 duration string such as ``"PT7M37S"`` under the
        key ``'duration'``. Weeks, days, hours, minutes and seconds are all
        optional, so ``"PT1H"``, ``"PT1H30S"`` and ``"P1DT2H"`` are accepted.

    Returns
    -------
    The same ``data`` object, with ``data['duration']`` overwritten by the
    total number of seconds as an ``int``.

    Raises
    ------
    ValueError
        If the string is not a duration or carries no component at all.
    """
    duration = data['duration']
    match = _DURATION_RE.fullmatch(duration)
    # Keep only the components that are actually present in the string.
    parts = {} if match is None else {
        unit: int(value)
        for unit, value in match.groupdict().items()
        if value is not None
    }
    if not parts:
        # A bare "P"/"PT" matches the pattern but carries no information.
        raise ValueError("unparseable ISO 8601 duration: %r" % (duration,))

    data['duration'] = sum(amount * _SECONDS_PER_UNIT[unit] for unit, amount in parts.items())
    return data

# Run the duration conversion row by row (axis='columns' hands each row to the function).
train = train.apply(func=time_transform, axis='columns')
test = test.apply(func=time_transform, axis='columns')

In [25]:
# Integer-encode the category labels. The encoder learns its classes from the
# training frame only and is then reused unchanged on the test frame.
lbl = preprocessing.LabelEncoder()
train['category'] = lbl.fit_transform(train['category'].values.tolist())
test['category'] = lbl.transform(test['category'].values.tolist())

In [26]:
# Break the publication date into day-of-week, year and month features for both
# frames, then drop the raw date column.
# (Author's note: quarter and weekofyear didn't work.)
for frame in (train, test):
    frame['published'] = pd.to_datetime(frame['published'])
    published_parts = frame['published'].dt
    frame['dayofweek'] = published_parts.dayofweek
    frame['year'] = published_parts.year
    frame['month'] = published_parts.month
    frame.drop(['published'], axis=1, inplace=True)

In [27]:
# Remove the video id column from both frames.
for frame in (test, train):
    frame.drop('vidid', axis=1, inplace=True)

In [28]:
# Preview the first five rows after feature engineering.
train.head(n=5)

Unnamed: 0,adview,views,likes,dislikes,comment,duration,category,dayofweek,year,month
0,40,1031602.0,8523.0,363.0,1095.0,457,5,2,2016,9
1,2,1707.0,56.0,2.0,6.0,570,3,5,2016,10
2,1,2023.0,25.0,0.0,2.0,136,2,5,2016,7
3,6,620860.0,777.0,161.0,153.0,262,7,2,2016,7
4,1,666.0,1.0,0.0,0.0,31,3,2,2016,6


In [18]:
# Columns that get standardised below, grouped by where they come from.
columns = (
    ['views', 'likes', 'dislikes', 'comment']   # engagement counts
    + ['duration', 'category']                  # video attributes
    + ['dayofweek', 'year', 'month']            # publication-date parts
)

In [19]:
# Standardise the feature columns. The scaler's mean/variance are learned from
# the training frame only.
# NOTE: this cell is not idempotent - re-running it scales already-scaled data.
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(train[columns])
train[columns] = std_scaler.transform(train[columns])
# Apply the same train-fitted transform to the test frame. Every other
# preprocessing step is mirrored on `test`; leaving it unscaled would feed raw
# values to models trained on standardised features.
test[columns] = std_scaler.transform(test[columns])

In [20]:
# Preview the first five rows after standardisation.
train.head(n=5)

Unnamed: 0,adview,views,likes,dislikes,comment,duration,category,dayofweek,year,month
0,40,0.117294,0.641871,0.105435,0.453877,-0.251724,0.883593,-0.408331,0.882527,0.715776
1,2,-0.259766,-0.305413,-0.245139,-0.266778,-0.202949,-0.385234,1.16629,0.882527,1.019567
2,1,-0.259651,-0.308882,-0.247081,-0.269425,-0.390281,-1.019648,1.16629,0.882527,0.108196
3,6,-0.033085,-0.224748,-0.090731,-0.1695,-0.335894,2.152421,-0.408331,0.882527,0.108196
4,1,-0.260147,-0.311567,-0.247081,-0.270749,-0.435603,-0.385234,-0.408331,0.882527,-0.195595


In [None]:
def display_metrics(y_test, y_pred):
    """Print MAE, RMSE and R2 of `y_pred` against `y_test`, followed by a blank line.

    Each metric is computed and printed in turn, one per line.
    """
    scorers = (
        ("MAE : %f", metrics.mean_absolute_error),
        ("RMSE : %f", lambda truth, guess: np.sqrt(metrics.mean_squared_error(truth, guess))),
        ("R2 score: %f", metrics.r2_score),
    )
    for template, scorer in scorers:
        print (template % scorer(y_test, y_pred))
    print ("\n")
           
def model_train(model, X, y, figsize=(50, 10)):
    """Fit `model` on an 80/20 split of (X, y), print metrics and plot residuals.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
        Fitted in place on the training part of the split.
    X, y : features and target, split with ``test_size=0.2, random_state=1``.
    figsize : tuple, optional
        Size of the residual plot in inches. The default keeps the size the
        original code requested.

    Prints the estimator's score on the *training* split, then the hold-out
    metrics via ``display_metrics``, and finally shows a scatter plot of
    residuals (true - predicted) against the predictions. Returns nothing.
    """
    # Split test, train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print ('model_score', model.score(X_train, y_train))
    display_metrics(y_test, y_pred)

    # Reuse y_pred rather than predicting on X_test a second time.
    preds = pd.DataFrame({"Predicted": y_pred, "true": y_test})
    preds["Difference"] = preds["true"] - preds["Predicted"]

    # Draw on an explicit axes: DataFrame.plot opens its own figure when no
    # `ax` is given, which left the figure created for `figsize` empty.
    fig, ax = plt.subplots(figsize=figsize)
    preds.plot(x="Predicted", y="Difference", kind="scatter", title="Residual Features", ax=ax)
    plt.show()
    
def training(X,y):
    """Train and evaluate a random forest and an MLP regressor on (X, y).

    Each model is passed to ``model_train``, which fits it on a train/test
    split and prints its metrics and residual plot.

    Returns
    -------
    tuple
        ``(random_forest_regressor, mlp_regressor)``, both fitted.
    """
    #Random Forests
    # Seeded like the split and the MLP so repeated runs give the same forest.
    forest_reg = RandomForestRegressor(n_estimators=300, max_features='sqrt', random_state=1)
    print ("Random Forests")
    model_train(forest_reg, X, y)

    #MLP Regression
    # MLPRegressor must be imported from sklearn.neural_network at the top of the notebook.
    mlp_reg = MLPRegressor(solver='lbfgs', alpha=0.01, hidden_layer_sizes=(10, 10), random_state=1)
    print ("MLP (NN)")
    model_train(mlp_reg, X, y)

    return forest_reg, mlp_reg

# x_train / y_train were not defined anywhere in the notebook, so build them
# here: the feature columns as inputs and the `adview` column as the target.
x_train = train[columns]
y_train = train['adview']

rand_reg, mlp_reg = training(x_train, y_train)