In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4
import json
import glob
import tqdm

%matplotlib inline

In [2]:
df = pd.read_json('data_from_mdl.json')
df['duration'] = (df['duration']/1000).map(int)
#ao salvar o DF data_from_mdl do arquivo mdl1, o coluna 'duration' que era timedelta64 virou int64
#ex:   1:30 viratam 90000

In [3]:
features = pd.DataFrame(index=df.index)
y = df['y'].copy()
features['time_pub'] = (pd.to_datetime('2020-04-23') - df['date']) / np.timedelta64(1, 'D')

features['views'] = df['views']
features['views_for_day'] = round(features['views'] / features['time_pub'], 2)

features['likes'] = df['likes']
features['likes_for_day'] = round(features['likes'] / features['time_pub'], 2)

features['dislikes'] = df['dislikes']
features['dislikes_for_day'] = round(features['dislikes'] / features['time_pub'], 2)

features = features.drop(['time_pub'], axis=1)

features['subscribers'] = df['subscribers']

In [4]:
#duração dos videos em segundos
features['duration_seconds'] = df['duration']

In [5]:
features.head()

Unnamed: 0,views,views_for_day,likes,likes_for_day,dislikes,dislikes_for_day,subscribers,duration_seconds
0,175311,281.85,5131,8.25,65,0.1,173000,29758
1,34724,58.07,782,1.31,20,0.03,484000,584
2,18021,462.08,680,17.44,8,0.21,1620000,33776
3,4193,11.75,94,0.26,3,0.01,2840,4513
4,103756,165.48,3167,5.05,155,0.25,702000,501


In [6]:
train_mask = df['date'] < '2019-07-31'
valid_mask = df['date'] >= '2019-07-31'

xtrain, xvalid = features[train_mask], features[valid_mask]
ytrain, yvalid = y[train_mask], y[valid_mask]
xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape

((293, 8), (228, 8), (293,), (228,))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
#transforma text features em uma matriz Bag of Words (bow)
#https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

title_train = df[train_mask]['title']
title_valid = df[valid_mask]['title']

title_vect = TfidfVectorizer(min_df=2)
#min_df, se int, é o num min de exemplos que uma palavra precisa aparecer pra ser considerada, 
#se float, é o percentual min

title_bow_train = title_vect.fit_transform(title_train)
title_bow_valid = title_vect.transform(title_valid) 
#só transform pois valid simula os dados que não vão ser conhecidos pelo modelo

#TfidfVectorizer retorna sparse matrices (matriz que não salva os 0s na mémoria, economizando muito espaço)

title_bow_train

<293x217 sparse matrix of type '<class 'numpy.float64'>'
	with 1813 stored elements in Compressed Sparse Row format>

In [8]:
1813/(293*217) #num de elementos salvos na matriz (num diferentes de 0s)

0.028514807882858085

In [9]:
from scipy.sparse import hstack, vstack

hstack - [1, 2]  [3, 4] -> [1, 2, 3, 4]

vstack - [1, 2]  [3, 4] -> [1, 2]
                           [3, 4]


numpy também tem hstack e vstack, mas as funções nele não são otimizadas para sparse matrices, levaria muito mais tempo 

In [10]:
xtrain_wtitle = hstack([xtrain, title_bow_train])
xvalid_wtitle = hstack([xvalid, title_bow_valid])
xtrain_wtitle.shape #217 colunas dos title 2 de views e views_for_day

(293, 225)

# 1 Novo Model (mdl2)

In [11]:
from sklearn.ensemble import RandomForestClassifier
mdl= RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1, class_weight='balanced')
mdl.fit(xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
p = mdl.predict_proba(xvalid_wtitle)[:,1]

In [17]:
type(yvalid)

pandas.core.series.Series

In [13]:
from sklearn.metrics import average_precision_score, roc_auc_score
average_precision_score(yvalid, p), roc_auc_score(yvalid, p)

(0.7931746031746032, 0.9674311926605504)

# 2 Active Learning

In [14]:
all_data = pd.read_csv('data_with_label.csv', index_col=0)
df_unlabeled = all_data[all_data['y'].isnull()]
df_unlabeled.head()

Unnamed: 0,title,duration_code,y,link,subscribers,views,likes,dislikes,date,thumbnailUrl
515,Learn Python Basics for Data Science from IBM,PT2M29S,,https://www.youtube.com/watch?v=JC3urnvKanI.html,245 mil,9.332,74,1,Publicado em 4 de abr. de 2019,https://i.ytimg.com/vi/JC3urnvKanI/maxresdefau...
516,What is Machine Learning ? (Tamil),PT19M20S,,https://www.youtube.com/watch?v=jdNDhCq5s0M.html,495,70.0,3,1,Publicado em 20 de abr. de 2020,https://i.ytimg.com/vi/jdNDhCq5s0M/maxresdefau...
517,Интенсив Data Science. Подведение итогов,PT87M16S,,https://www.youtube.com/watch?v=jDntIbdSCks.html,"41,7 mil",1.705,29,2,Transmitido ao vivo em 18 de abr. de 2020,https://i.ytimg.com/vi/jDntIbdSCks/maxresdefau...
518,Module 1: Introduction to Data Science for Soc...,PT52M19S,,https://www.youtube.com/watch?v=jG6pdz4DQu8.html,305,4.257,92,1,Publicado em 3 de jan. de 2018,https://i.ytimg.com/vi/jG6pdz4DQu8/maxresdefau...
519,Python Basics for Data Science - Functions,PT13M29S,,https://www.youtube.com/watch?v=jG73BfTEfvs.html,245 mil,1.344,10,1,Publicado em 4 de abr. de 2019,https://i.ytimg.com/vi/jG73BfTEfvs/maxresdefau...


### 2.1 Cleaning df_unlabeled

In [15]:
date = df_unlabeled['date'].str.extract('(\d+) de (\w+)\. de (\d+)')

date[0] = date[0].map(lambda x: '0'+x if len(x)<2 else x) #add a 0 in days between 1 and 9

months_map = {'jan':'01',
             'fev': '02',
             'mar': '03',
             'abr':'04',
             'mai':'05',
             'jun':'06',
             'jul':'07',
             'ago':'08',
             'set':'09',
             'out':'10',
             'nov':'11',
             'dez':'12',}
date[1] = date[1].map(months_map) #convert the months for numbers

date = date.apply(lambda x: '-'.join(x), axis=1) #merges columns, joining str with '-'

df_unlabeled['date']= pd.to_datetime(date, format='%d-%m-%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
duration = df_unlabeled['duration_code'].str.extract('PT(\d+)M(\d+)')

minute = duration[0].map(int)
seconds = duration[1].map(int)

df_unlabeled['duration_code'] = minute*60 + seconds

df_unlabeled.columns = ['title', 'duration', 'y', 'link', 'subscribers', 'views', 'likes',
       'dislikes', 'date', 'thumbnailUrl']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
subscribers = df_unlabeled['subscribers'].str.replace(',','.')

def convert_unity(num):
    
    if 'mil' in num:
        num = num.split()[0]
        return float(num)*10**3
    
    elif 'mi' in num:
        num = num.split()[0]
        return float(num)*10**6
    
    else:
        return float(num)
        
subscribers = subscribers.map(convert_unity).astype(int)

df_unlabeled['subscribers'] = subscribers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
df_unlabeled['views'] = df_unlabeled['views'].str.replace('.', '').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### 2.2 Criando xu (x das features do df unlabeled)

In [19]:
features_u = pd.DataFrame(index=df_unlabeled.index)

features_u['time_pub'] = (pd.to_datetime('2020-04-23') - df_unlabeled['date']) / np.timedelta64(1, 'D')

features_u['views'] = df_unlabeled['views']
features_u['views_for_day'] = round(features_u['views'] / features_u['time_pub'], 2)

features_u['likes'] = df_unlabeled['likes']
features_u['likes_for_day'] = round(features_u['likes'] / features_u['time_pub'], 2)

features_u['dislikes'] = df_unlabeled['dislikes']
features_u['dislikes_for_day'] = round(features_u['dislikes'] / features_u['time_pub'], 2)

features_u = features_u.drop(['time_pub'], axis=1)

features_u['subscribers'] = df_unlabeled['subscribers']
features_u['duration_seconds'] = df_unlabeled['duration']

title_u = df_unlabeled['title']
title_bow_u = title_vect.transform(title_u)
xu_wtitle = hstack([features_u, title_bow_u])

### 2.3 Resultados do mdl no xu

In [20]:
pu = mdl.predict_proba(xu_wtitle)[:,1]
df_unlabeled['p'] = pu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
df_unlabeled.head(1)

Unnamed: 0,title,duration,y,link,subscribers,views,likes,dislikes,date,thumbnailUrl,p
515,Learn Python Basics for Data Science from IBM,149,,https://www.youtube.com/watch?v=JC3urnvKanI.html,245000,9332,74,1,2019-04-04,https://i.ytimg.com/vi/JC3urnvKanI/maxresdefau...,0.004


### 2.4 Separando os exemplos que o modelo teve mais dificuldade (70) e aleatórios (30)

In [22]:
mask_h = (df_unlabeled['p'] >= 0.067) & (df_unlabeled['p'] <= 0.99)
mask_h.sum() #num de exemplos no intervalo

73

In [23]:
difficult = df_unlabeled[mask_h]
random = df_unlabeled[~mask_h].sample(27, random_state=42) 
# ~ negação, inverte os Treu e False da serie, para separar os exemplos que não estão nos difficult
#.sample(n) pega um amostra aleatoria de n exemplos

In [24]:
active_label1 = pd.concat([difficult, random])
active_label1.to_csv('active_label1.csv')

In [25]:
df2 = pd.read_csv('active_label_wy.csv', index_col=0)
df2['p'] = active_label1['p'].to_list()
#nos converter o active_label1.csv para anotar os y, a coluna 'p' foi alterada
df2['date'] = df2['date'].astype(np.datetime64)
#e as culuna 'date' mudou de tipo para str
df2['novo'] = 1

# 3 Novo Model (mdl3)

In [26]:
df3 = pd.concat([df, df2.drop('p', axis=1)])
df3 = df3.fillna(0)
#df3 tem os 621 exemplos com label
#coluna 'novo' tem 1 para os 100 novos exemplos e 0 para os 521 antigos

In [27]:
features = pd.DataFrame(index=df3.index)
y = df3['y'].copy()
features['time_pub'] = (pd.to_datetime('2020-04-23') - df3['date']) / np.timedelta64(1, 'D')

features['views'] = df3['views']
features['views_for_day'] = round(features['views'] / features['time_pub'], 2)

features['likes'] = df3['likes']
features['likes_for_day'] = round(features['likes'] / features['time_pub'], 2)

features['dislikes'] = df3['dislikes']
features['dislikes_for_day'] = round(features['dislikes'] / features['time_pub'], 2)

features = features.drop(['time_pub'], axis=1)

features['subscribers'] = df3['subscribers']

features['duration_seconds'] = df3['duration']

### 3.1 Aumentando a validação

In [28]:
#usando os mesmos dados de treino e mais dados para valid
train_mask = (df3['date'] < '2019-07-31') & (df3['novo']==0)
valid_mask = df3['date'] >= '2019-07-31' 

In [29]:
xtrain, xvalid = features[train_mask], features[valid_mask]
ytrain, yvalid = y[train_mask], y[valid_mask]

title_train = df3[train_mask]['title']
title_valid = df3[valid_mask]['title']

title_vect = TfidfVectorizer(min_df=2)

title_bow_train = title_vect.fit_transform(title_train)
title_bow_valid = title_vect.transform(title_valid) 

xtrain_wtitle = hstack([xtrain, title_bow_train])
xvalid_wtitle = hstack([xvalid, title_bow_valid])

In [30]:
mdl= RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1, class_weight='balanced')
mdl.fit(xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [31]:
p = mdl.predict_proba(xvalid_wtitle)[:,1]
average_precision_score(yvalid, p), roc_auc_score(yvalid, p)

(0.6486506329927382, 0.9616246498599439)

### 3.2 Aumentando o treino

In [32]:
#usando os mesmos dados de treino e mais dados para valid
train_mask = (df3['date'] < '2019-07-31') 
valid_mask = (df3['date'] >= '2019-07-31' ) & (df3['novo']==0)

In [33]:
xtrain, xvalid = features[train_mask], features[valid_mask]
ytrain, yvalid = y[train_mask], y[valid_mask]

title_train = df3[train_mask]['title']
title_valid = df3[valid_mask]['title']

title_vect = TfidfVectorizer(min_df=2)

title_bow_train = title_vect.fit_transform(title_train)
title_bow_valid = title_vect.transform(title_valid) 

xtrain_wtitle = hstack([xtrain, title_bow_train])
xvalid_wtitle = hstack([xvalid, title_bow_valid])

In [34]:
mdl= RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1, class_weight='balanced')
mdl.fit(xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [35]:
p = mdl.predict_proba(xvalid_wtitle)[:,1]
average_precision_score(yvalid, p), roc_auc_score(yvalid, p)

(0.8378618113912232, 0.9880733944954129)

### 3.3 Aumentando treino e validação

In [36]:
train_mask = (df3['date'] < '2019-07-31') 
valid_mask = (df3['date'] >= '2019-07-31' )

In [37]:
xtrain, xvalid = features[train_mask], features[valid_mask]
ytrain, yvalid = y[train_mask], y[valid_mask]

title_train = df3[train_mask]['title']
title_valid = df3[valid_mask]['title']

title_vect = TfidfVectorizer(min_df=2)

title_bow_train = title_vect.fit_transform(title_train)
title_bow_valid = title_vect.transform(title_valid) 

xtrain_wtitle = hstack([xtrain, title_bow_train])
xvalid_wtitle = hstack([xvalid, title_bow_valid])

In [38]:
mdl= RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1, class_weight='balanced')
mdl.fit(xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [39]:
p = mdl.predict_proba(xvalid_wtitle)[:,1]
average_precision_score(yvalid, p), roc_auc_score(yvalid, p)

(0.801072325936417, 0.985434173669468)