In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4
import json
import glob
import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score

%matplotlib inline

In [2]:
# Load the labelled dataset; row 0 of the CSV is used as the header.
df = pd.read_csv('data_with_all_labels.csv', header=0)

# The cleaned frame starts with the raw titles only and shares df's index,
# so columns added by later cells stay row-aligned with the raw data.
df_clean = df[['title']].copy()

In [3]:
# Parse ISO-8601 style durations such as "PT4M13S", "PT45S" or "PT1H2M3S".
# Hours, minutes and seconds are each optional; a raw string keeps the regex
# escapes (\d) out of Python's own escape processing.
duration = df['duration_code'].str.extract(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')

# A row where no component was captured is not a duration we understand:
# fail loudly instead of silently turning it into 0 minutes.
unparsed = duration.isna().all(axis=1)
if unparsed.any():
    raise ValueError(
        'Unrecognised duration_code values: '
        + repr(df.loc[unparsed, 'duration_code'].unique()[:5].tolist())
    )

# Components that are absent from a valid code count as zero.
duration = duration.fillna('0').astype(int)

hours = duration[0]
minute = duration[1]
seconds = duration[2]

# Duration expressed in minutes (this time), rounded to two decimals.
df_clean['duration'] = ((hours*3600 + minute*60 + seconds) / 60).round(2)

In [4]:
# Replace every comma with a dot so the numeric part can later be parsed by
# float() (presumably the source uses a decimal comma -- confirm against data).
subscribers = df['subscribers'].str.replace(',','.')

def convert_unity(num):
    """Convert an abbreviated count string into a plain number.

    Parameters
    ----------
    num : str
        Text whose first whitespace-separated token is a number written with
        a dot as decimal separator, optionally followed by a unit:
        ``'mil'`` scales the number by 10**3 and ``'mi'`` by 10**6.
        A string containing neither unit is parsed as a number as a whole.

    Returns
    -------
    float
        The scaled value. Scaled results are rounded to the nearest whole
        number so that binary floating-point error (for example
        ``1.005 * 10**3 == 1004.9999999999999``) cannot make a later integer
        cast drop one unit.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed by ``float``.
    """
    # 'mil' has to be tested first because 'mi' is a substring of it.
    if 'mil' in num:
        scale = 10**3
    elif 'mi' in num:
        scale = 10**6
    else:
        return float(num)

    # Only the first token is numeric; the unit (and anything after it) is dropped.
    return float(round(float(num.split()[0]) * scale))
        
# Convert each abbreviated count to a number, cast to int, and keep the result
# both in the `subscribers` variable and as a column of the cleaned frame.
df_clean['subscribers'] = subscribers = subscribers.map(convert_unity).astype(int)

In [5]:
# Strip the dots from the view counts before casting to int.
# regex=False makes the '.' a literal dot on every pandas version; as a regex
# it would be a wildcard matching any character.
df_clean['views'] = df['views'].str.replace('.', '', regex=False).astype(int)

In [6]:
# These two columns are carried over from the raw frame unchanged.
for passthrough_col in ('likes', 'dislikes'):
    df_clean[passthrough_col] = df[passthrough_col]

In [7]:
# Split dates written like "23 de abr. de 2020" into day / month / year.
# The pattern is a raw string so the regex escapes are not treated as (invalid)
# Python string escapes, and the dot after the month abbreviation is optional.
date = df['date'].str.extract(r'(\d+) de (\w+)\.? de (\d+)')

# Left-pad single-digit days with a zero ("5" -> "05").
date[0] = date[0].str.zfill(2)

# Month abbreviation -> two-digit month number.
months_map = {'jan': '01',
              'fev': '02',
              'mar': '03',
              'abr': '04',
              'mai': '05',
              'jun': '06',
              'jul': '07',
              'ago': '08',
              'set': '09',
              'out': '10',
              'nov': '11',
              'dez': '12'}
date[1] = date[1].map(months_map)

# Re-assemble as "dd-mm-YYYY" and parse. A row that failed to match, or whose
# month is not in months_map, holds NaN here and makes the join raise instead
# of silently producing a missing date.
date = date.apply(lambda parts: '-'.join(parts), axis=1)
df_clean['date'] = pd.to_datetime(date, format='%d-%m-%Y')

In [8]:
# Target vector, copied so the raw frame is never modified through it.
y = df['y'].copy()

# Empty feature frame sharing the raw frame's index.
features = pd.DataFrame(index=df.index)

In [9]:
features = pd.DataFrame(index=df.index)
y = df['y'].copy()

# Age of each video in days, counted back from the fixed reference date.
days_online = (pd.to_datetime('2020-04-23') - df_clean['date']) / np.timedelta64(1, 'D')

# For every engagement metric keep the raw count followed by its per-day rate
# (two decimals). Column order matters for the downstream feature matrix.
for metric in ('views', 'likes', 'dislikes'):
    features[metric] = df_clean[metric]
    features[metric + '_for_day'] = round(features[metric] / days_online, 2)

features['subscribers'] = df_clean['subscribers']
features['duration_min'] = df_clean['duration']

In [10]:
# Time-based split: rows dated before the cut-off are used for training,
# rows dated on or after it for validation.
split_date = '2019-07-31'
train_mask = df_clean['date'] < split_date
valid_mask = df_clean['date'] >= split_date

In [11]:
# Numeric features and labels for each side of the split.
xtrain = features.loc[train_mask]
xvalid = features.loc[valid_mask]
ytrain = y.loc[train_mask]
yvalid = y.loc[valid_mask]

# Raw titles for each side of the split (vectorised in a later cell).
title_train = df_clean.loc[train_mask, 'title']
title_valid = df_clean.loc[valid_mask, 'title']

In [12]:
# TF-IDF representation of the video titles.
title_vect = TfidfVectorizer(min_df=1) #,  ngram_range=(1,1)

# The vectorizer is fitted on the training titles only; the validation titles
# are transformed with that already-fitted vectorizer.
title_bow_train = title_vect.fit_transform(title_train)
title_bow_valid = title_vect.transform(title_valid)

In [13]:
# Column-wise concatenation: the numeric feature columns come first, followed
# by the TF-IDF title columns. scipy's hstack returns a sparse matrix.
xtrain_wtitle = hstack([xtrain, title_bow_train])
xvalid_wtitle = hstack([xvalid, title_bow_valid])

In [14]:
# Display the validation matrix summary (shape, dtype, stored elements).
xvalid_wtitle

<464x1358 sparse matrix of type '<class 'numpy.float64'>'
	with 6669 stored elements in COOrdinate format>

In [15]:
# Random-forest hyper-parameters gathered in one place; random_state is fixed
# so repeated runs build the same forest.
rf_params = dict(
    n_estimators=1000,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
    min_samples_split=2,
)
mdl = RandomForestClassifier(**rf_params)

# Train on the stacked numeric + title features; as the last expression of the
# cell, the fitted estimator is also displayed.
mdl.fit(xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
# Keep column 1 of the predicted class probabilities for the validation rows
# (presumably the positive class, y == 1 -- confirm with mdl.classes_).
p = mdl.predict_proba(xvalid_wtitle)[:,1]

In [17]:
# Validation metrics shown as a tuple: (average precision, ROC AUC).
average_precision_score(yvalid, p), roc_auc_score(yvalid, p)

(0.7710727306881153, 0.9921541872761386)