In [1]:
import numpy as np
import pandas as pd
import sklearn
import json
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics

from sklearn.model_selection import KFold 
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.metrics import f1_score

## Importando DataSet

In [2]:
filename = 'USvideos.csv'
category_filename = 'US_category_id.json'

videos = pd.read_csv(filename)
videos = videos.dropna(axis=0)
print("DataSet null inputs: \n"+str(videos.isnull().sum())+"\n")
print(filename+" shape: "+str(videos.shape))
videos.head()

DataSet null inputs: 
video_id                  0
trending_date             0
title                     0
channel_title             0
category_id               0
publish_time              0
tags                      0
views                     0
likes                     0
dislikes                  0
comment_count             0
thumbnail_link            0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
description               0
dtype: int64

USvideos.csv shape: (40379, 16)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


# Tratamento do DataSet

## Correção de formato de data-hora para padrão Unix

In [3]:
videos['trending_date'] = pd.to_datetime(videos['trending_date'], format='%y.%d.%m')
videos['publish_time'] = pd.to_datetime(videos['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

# separates date and time into two columns from 'publish_time' column

videos.insert(4, 'publish_date', videos['publish_time'].dt.date)
videos['publish_time'] = videos['publish_time'].dt.time
videos['publish_date']=pd.to_datetime(videos['publish_date'])

videos.head()

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,2017-11-14,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,2017-11-13,22,17:13:01,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,2017-11-13,24,07:30:00,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,2017-11-14,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,2017-11-12,23,19:05:24,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,2017-11-14,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,2017-11-13,24,11:00:04,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,2017-11-14,I Dare You: GOING BALD!?,nigahiga,2017-11-12,24,18:01:41,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


## Tratando para pegar somente a ultima entrada para cada video (a entrada mais atualizada)

In [4]:
videos_lastentry = videos.sort_values('trending_date', ascending=False).drop_duplicates(['video_id'])
print(filename+" shape: "+str(videos_lastentry.shape))
videos_lastentry.head()

USvideos.csv shape: (6254, 17)


Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
40948,ooyjaVdt-jA,2018-06-14,Official Call of Duty®: Black Ops 4 — Multipla...,Call of Duty,2018-05-17,20,17:09:38,"call of duty|""cod""|""activision""|""Black Ops 4""",10306119,357079,212976,144795,https://i.ytimg.com/vi/ooyjaVdt-jA/default.jpg,False,False,False,Call of Duty: Black Ops 4 Multiplayer raises t...
40811,Nd3zqXro_P0,2018-06-14,Why 350°F is the magic number for baking,Vox,2018-06-07,25,12:00:02,"baking|""maillard reaction""|""s pen""|""Vox.com""|""...",799411,16092,652,1551,https://i.ytimg.com/vi/Nd3zqXro_P0/default.jpg,False,False,False,Turns out there’s a lot of chemistry in cookin...
40821,jWkOwC7vMb4,2018-06-14,“Bonding” with Grandma,itsAlexClark,2018-06-06,1,21:40:34,"its alex clark|""itsalexclark""|""alex clark""|""it...",1004181,48129,889,5483,https://i.ytimg.com/vi/jWkOwC7vMb4/default.jpg,False,False,False,"You know my family, now you need to meet my gr..."
40820,T9UHD30MSGk,2018-06-14,Knife Expert Guesses Cheap vs. Expensive Knive...,Epicurious,2018-06-06,26,18:54:38,"best knife|""boning knife""|""cheap""|""cheap vs ex...",903651,11427,2305,2504,https://i.ytimg.com/vi/T9UHD30MSGk/default.jpg,False,False,False,Epicurious challenges knife expert Geoff Feder...
40819,NB3gWkhLkxM,2018-06-14,The Internet - Come Over (Official Video),TheInternetVEVO,2018-06-06,10,14:00:02,"Columbia|""Come Over""|""R&B/Soul""|""The Internet""",1615678,92403,1856,4572,https://i.ytimg.com/vi/NB3gWkhLkxM/default.jpg,False,False,False,The Internet's Come Over is taken from the new...


## Buscando o nome da categoria pelo identificador e armazenando em nova coluna

In [5]:
with open(category_filename) as f:
    category = json.load(f)

In [6]:
def category_replace(c_id):
    for i in category["items"]:
        if int(c_id) == int(i["id"]):
             return i["snippet"]["title"]
    return "None"

In [7]:
videos_lastentry['category_name'] = videos_lastentry.apply(lambda row: category_replace(row['category_id']), axis=1)
print(filename+" shape: "+str(videos_lastentry.shape))
videos_lastentry.head()

USvideos.csv shape: (6254, 18)


Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name
40948,ooyjaVdt-jA,2018-06-14,Official Call of Duty®: Black Ops 4 — Multipla...,Call of Duty,2018-05-17,20,17:09:38,"call of duty|""cod""|""activision""|""Black Ops 4""",10306119,357079,212976,144795,https://i.ytimg.com/vi/ooyjaVdt-jA/default.jpg,False,False,False,Call of Duty: Black Ops 4 Multiplayer raises t...,Gaming
40811,Nd3zqXro_P0,2018-06-14,Why 350°F is the magic number for baking,Vox,2018-06-07,25,12:00:02,"baking|""maillard reaction""|""s pen""|""Vox.com""|""...",799411,16092,652,1551,https://i.ytimg.com/vi/Nd3zqXro_P0/default.jpg,False,False,False,Turns out there’s a lot of chemistry in cookin...,News & Politics
40821,jWkOwC7vMb4,2018-06-14,“Bonding” with Grandma,itsAlexClark,2018-06-06,1,21:40:34,"its alex clark|""itsalexclark""|""alex clark""|""it...",1004181,48129,889,5483,https://i.ytimg.com/vi/jWkOwC7vMb4/default.jpg,False,False,False,"You know my family, now you need to meet my gr...",Film & Animation
40820,T9UHD30MSGk,2018-06-14,Knife Expert Guesses Cheap vs. Expensive Knive...,Epicurious,2018-06-06,26,18:54:38,"best knife|""boning knife""|""cheap""|""cheap vs ex...",903651,11427,2305,2504,https://i.ytimg.com/vi/T9UHD30MSGk/default.jpg,False,False,False,Epicurious challenges knife expert Geoff Feder...,Howto & Style
40819,NB3gWkhLkxM,2018-06-14,The Internet - Come Over (Official Video),TheInternetVEVO,2018-06-06,10,14:00:02,"Columbia|""Come Over""|""R&B/Soul""|""The Internet""",1615678,92403,1856,4572,https://i.ytimg.com/vi/NB3gWkhLkxM/default.jpg,False,False,False,The Internet's Come Over is taken from the new...,Music


## Transformando categorias em OneHotEncoder

In [8]:
# DESNECESSARIO - UTILIZADO ONEHOTENCODER() NO MAPPER
#videos_lastentry = videos_lastentry.join(pd.get_dummies(videos_lastentry['category_name']))
#videos_lastentry.head()

## Definindo sentimento por video

In [9]:
#positive sentiment = 60% da quantidade de likes+dislikes
videos_lastentry['sentiment'] = videos_lastentry['likes'] >= 0.6*(videos_lastentry['likes']+videos_lastentry['dislikes']) 
videos_lastentry.head()

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name,sentiment
40948,ooyjaVdt-jA,2018-06-14,Official Call of Duty®: Black Ops 4 — Multipla...,Call of Duty,2018-05-17,20,17:09:38,"call of duty|""cod""|""activision""|""Black Ops 4""",10306119,357079,212976,144795,https://i.ytimg.com/vi/ooyjaVdt-jA/default.jpg,False,False,False,Call of Duty: Black Ops 4 Multiplayer raises t...,Gaming,True
40811,Nd3zqXro_P0,2018-06-14,Why 350°F is the magic number for baking,Vox,2018-06-07,25,12:00:02,"baking|""maillard reaction""|""s pen""|""Vox.com""|""...",799411,16092,652,1551,https://i.ytimg.com/vi/Nd3zqXro_P0/default.jpg,False,False,False,Turns out there’s a lot of chemistry in cookin...,News & Politics,True
40821,jWkOwC7vMb4,2018-06-14,“Bonding” with Grandma,itsAlexClark,2018-06-06,1,21:40:34,"its alex clark|""itsalexclark""|""alex clark""|""it...",1004181,48129,889,5483,https://i.ytimg.com/vi/jWkOwC7vMb4/default.jpg,False,False,False,"You know my family, now you need to meet my gr...",Film & Animation,True
40820,T9UHD30MSGk,2018-06-14,Knife Expert Guesses Cheap vs. Expensive Knive...,Epicurious,2018-06-06,26,18:54:38,"best knife|""boning knife""|""cheap""|""cheap vs ex...",903651,11427,2305,2504,https://i.ytimg.com/vi/T9UHD30MSGk/default.jpg,False,False,False,Epicurious challenges knife expert Geoff Feder...,Howto & Style,True
40819,NB3gWkhLkxM,2018-06-14,The Internet - Come Over (Official Video),TheInternetVEVO,2018-06-06,10,14:00:02,"Columbia|""Come Over""|""R&B/Soul""|""The Internet""",1615678,92403,1856,4572,https://i.ytimg.com/vi/NB3gWkhLkxM/default.jpg,False,False,False,The Internet's Come Over is taken from the new...,Music,True


## Normalização das features numéricas

In [10]:
videos_lastentry.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,6254.0,6254.0,6254.0,6254.0,6254.0
mean,20.408539,1975222.0,55924.69,2828.081,6289.861
std,7.256811,7092548.0,194231.5,24332.77,29437.13
min,1.0,559.0,0.0,0.0,0.0
25%,17.0,164887.5,2931.0,130.0,387.0
50%,24.0,527037.5,12224.0,448.5,1287.5
75%,25.0,1490734.0,38985.0,1516.0,4110.75
max,43.0,225211900.0,5613827.0,1643059.0,1228655.0


In [11]:
# Normalização
videos_lastentry['views_nonnormal'] = videos_lastentry['views']
videos_lastentry['views'] = (videos_lastentry['views']-videos_lastentry['views'].min())/(videos_lastentry['views'].max()-videos_lastentry['views'].min())
#videos_lastentry['likes'] = (videos_lastentry['likes']-videos_lastentry['likes'].min())/(videos_lastentry['likes'].max()-videos_lastentry['likes'].min())
#videos_lastentry['dislikes'] = (videos_lastentry['dislikes']-videos_lastentry['dislikes'].min())/(videos_lastentry['dislikes'].max()-videos_lastentry['dislikes'].min())
videos_lastentry['comment_count'] = (videos_lastentry['comment_count']-videos_lastentry['comment_count'].min())/(videos_lastentry['comment_count'].max()-videos_lastentry['comment_count'].min())
videos_lastentry.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count,views_nonnormal
count,6254.0,6254.0,6254.0,6254.0,6254.0,6254.0
mean,20.408539,0.008768,55924.69,2828.081,0.005119,1975222.0
std,7.256811,0.031493,194231.5,24332.77,0.023959,7092548.0
min,1.0,0.0,0.0,0.0,0.0,559.0
25%,17.0,0.00073,2931.0,130.0,0.000315,164887.5
50%,24.0,0.002338,12224.0,448.5,0.001048,527037.5
75%,25.0,0.006617,38985.0,1516.0,0.003346,1490734.0
max,43.0,1.0,5613827.0,1643059.0,1.0,225211900.0


## Separando conjuntos de treino, validação e teste

In [12]:
train_data = videos_lastentry.sample(frac=0.8, random_state=200)
test_data = videos_lastentry.drop(train_data.index)
print(train_data.shape, test_data.shape)

(5003, 20) (1251, 20)


# Classificador

## União de features

In [13]:
mapper = DataFrameMapper([
    ('title', CountVectorizer()),
    ('description', CountVectorizer(stop_words='english', max_features = 5000, strip_accents='ascii')),
    ('tags', CountVectorizer()),
    ('category_name', LabelBinarizer()),
    ('comments_disabled', None),
    ('views', None),
    ('comment_count', None)
])
x_train = mapper.fit_transform(train_data)
y_train = train_data['sentiment'].values

In [14]:
x_train.shape

(5003, 34951)

In [15]:
classifier = LogisticRegression(penalty = 'l1')
classifier.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
x_test = mapper.transform(test_data)
y_test = test_data['sentiment'].values
predicted = classifier.predict(x_test)

In [17]:
probs = classifier.predict_proba(x_test)
print(probs)

[[9.53872917e-05 9.99904613e-01]
 [6.15477870e-03 9.93845221e-01]
 [4.59240509e-04 9.99540759e-01]
 ...
 [1.94636165e-04 9.99805364e-01]
 [4.59324382e-03 9.95406756e-01]
 [1.34099290e-03 9.98659007e-01]]


In [18]:
print("Acuracia: "+str(metrics.accuracy_score(y_test, predicted)))
print("ROC AUC score: "+str(metrics.roc_auc_score(y_test, probs[:, 1])))

Acuracia: 0.9664268585131894
ROC AUC score: 0.8311710851737752


In [19]:
predicted = classifier.predict(mapper.transform(videos_lastentry))

MemoryError: 

In [None]:
videos_output = videos_lastentry
videos_output['predicted_sentiment'] = predicted

videos_output = videos_output.sort_values('predicted_sentiment', ascending=False)
videos_output.head()

In [None]:
f1_score(videos_output['sentiment'].astype(int), videos_output['predicted_sentiment'].astype(int), average='macro')

In [None]:
get_n_features = 10
print("The "+str(get_n_features)+" most positive-weighted words are: ")
for feature_id in sorted(range(len(classifier.coef_[0])), key=lambda i: classifier.coef_[0][i], reverse=True)[:get_n_features]:
    print("%s (%f)" % (mapper.transformed_names_[feature_id], classifier.coef_[0][feature_id]))
print()
print("The "+str(get_n_features)+" most negative-weighted words are: ")
for feature_id in sorted(range(len(classifier.coef_[0])), key=lambda i: classifier.coef_[0][i], reverse=False)[:get_n_features]:
    print("%s (%f)" % (mapper.transformed_names_[feature_id], classifier.coef_[0][feature_id]))

# K-Fold

In [None]:
def run_kfold(clf,X,Y):
    kf = KFold(n_splits=10)
    outcomes = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

In [None]:
run_kfold(classifier,x_train,y_train)

In [None]:
videos_lastentry['like_ratio'] = videos_lastentry['likes']/(videos_lastentry['likes']+videos_lastentry['dislikes'])

In [None]:
from IPython.display import HTML, display

# We choose the 10 most trending videos
selected_columns = ['title', 'channel_title', 'thumbnail_link', 'publish_date', 'category_name', 'likes', 'dislikes', 'views_nonnormal', 'like_ratio']

most_frequent = videos_lastentry.groupby(selected_columns)['video_id'].agg(
    {"code_count": len}).sort_values(by=['like_ratio']
).head(10).reset_index()

# Construction of HTML table with miniature photos assigned to the most popular movies
table_content = ''
max_title_length = 50

for date, row in most_frequent.T.iteritems():
    HTML_row = '<tr>'
    HTML_row += '<td><img src="' + str(row[2]) + '"style="width:100px;height:100px;"></td>'
    HTML_row += '<td>' + str(row[1]) + '</td>'
    HTML_row += '<td>' + str(row[0])  + '</td>'
    HTML_row += '<td>' + str(row[4]) + '</td>'
    HTML_row += '<td>' + str(row[3]) + '</td>'
    HTML_row += '<td>' + str(row[5]) + '</td>'
    HTML_row += '<td>' + str(row[6]) + '</td>'
    HTML_row += '<td>' + str(row[7]) + '</td>'
    
    table_content += HTML_row + '</tr>'

display(HTML(
    '<table><tr><th>Photo</th><th>Channel Name</th><th style="width:250px;">Title</th><th>Category</th><th>Publish Date</th><th>Likes</th><th>Dislikes</th><th>Views</th></tr>{}</table>'.format(table_content))
)

In [None]:
videos_lastentry.loc[videos_lastentry['description'].str.contains("andretti")]