In [1]:
import numpy as np
import pandas as pd
import sklearn
import json
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics

from sklearn.model_selection import KFold 
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.metrics import f1_score

## Importando DataSet

In [2]:
filename = 'DEvideos.csv'
category_filename = 'DE_category_id.json'

videos = pd.read_csv(filename)
videos = videos.dropna(axis=0)
print("DataSet null inputs: \n"+str(videos.isnull().sum())+"\n")
print(filename+" shape: "+str(videos.shape))
videos.head()

DataSet null inputs: 
video_id                  0
trending_date             0
title                     0
channel_title             0
category_id               0
publish_time              0
tags                      0
views                     0
likes                     0
dislikes                  0
comment_count             0
thumbnail_link            0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
description               0
dtype: int64

DEvideos.csv shape: (39288, 16)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,LgVi6y5QIjM,17.14.11,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,24,2017-11-13T17:08:49.000Z,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Heute gibt es mal wieder ein neues Format... w...
1,Bayt7uQith4,17.14.11,Kinder ferngesteuert im Kiosk! Erwachsene abzo...,LUKE! Die Woche und ich,23,2017-11-12T22:30:01.000Z,"Kinder|""ferngesteuert""|""Kinder ferngesteuert""|...",797196,53576,302,1278,https://i.ytimg.com/vi/Bayt7uQith4/default.jpg,False,False,False,Kinder ferngesteuert! Kinder lassen sich sooo ...
2,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
3,AHtypnRk7JE,17.14.11,Das Fermi-Paradoxon,100SekundenPhysik,27,2017-11-12T15:00:01.000Z,"Physik|""Wissenschaft""|""Technik""|""Science-Ficti...",380247,31821,458,1955,https://i.ytimg.com/vi/AHtypnRk7JE/default.jpg,False,False,False,‚ñ∫Alle Videos: http://bit.ly/1fa7Tw3\n\n\n‚úöSnap...
4,ZJ9We4bjcg0,17.14.11,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,24,2017-11-12T13:10:36.000Z,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,18 Song Mashup √ºber den (ver√§nderten) Beat von...


# Tratamento do DataSet

## Corre√ß√£o de formato de data-hora para padr√£o Unix

In [3]:
videos['trending_date'] = pd.to_datetime(videos['trending_date'], format='%y.%d.%m')
videos['publish_time'] = pd.to_datetime(videos['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')

# separates date and time into two columns from 'publish_time' column

videos.insert(4, 'publish_date', videos['publish_time'].dt.date)
videos['publish_time'] = videos['publish_time'].dt.time
videos['publish_date']=pd.to_datetime(videos['publish_date'])

videos.head()

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,LgVi6y5QIjM,2017-11-14,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,2017-11-13,24,17:08:49,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Heute gibt es mal wieder ein neues Format... w...
1,Bayt7uQith4,2017-11-14,Kinder ferngesteuert im Kiosk! Erwachsene abzo...,LUKE! Die Woche und ich,2017-11-12,23,22:30:01,"Kinder|""ferngesteuert""|""Kinder ferngesteuert""|...",797196,53576,302,1278,https://i.ytimg.com/vi/Bayt7uQith4/default.jpg,False,False,False,Kinder ferngesteuert! Kinder lassen sich sooo ...
2,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,2017-11-13,24,07:30:00,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
3,AHtypnRk7JE,2017-11-14,Das Fermi-Paradoxon,100SekundenPhysik,2017-11-12,27,15:00:01,"Physik|""Wissenschaft""|""Technik""|""Science-Ficti...",380247,31821,458,1955,https://i.ytimg.com/vi/AHtypnRk7JE/default.jpg,False,False,False,‚ñ∫Alle Videos: http://bit.ly/1fa7Tw3\n\n\n‚úöSnap...
4,ZJ9We4bjcg0,2017-11-14,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,2017-11-12,24,13:10:36,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,18 Song Mashup √ºber den (ver√§nderten) Beat von...


## Tratando para pegar somente a ultima entrada para cada video (a entrada mais atualizada)

In [4]:
videos_lastentry = videos.sort_values('trending_date', ascending=False).drop_duplicates(['video_id'])
print(filename+" shape: "+str(videos_lastentry.shape))
videos_lastentry.head()

DEvideos.csv shape: (28331, 17)


Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
40839,go-F6xvezAM,2018-06-14,–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä - –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏—Ö –ü–µ–ª—å–º–µ–Ω–µ–π –ë - –£—Ä...,–£—Ä–∞–ª—å—Å–∫–∏–µ –ü–µ–ª—å–º–µ–Ω–∏,2018-06-13,23,15:02:15,"–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä|""—É—Ä–∞–ª—å—Å–∫–∏–µ –ø–µ–ª—å–º–µ–Ω–∏ –≥–∏—Ä–æ—Å–∫—É—Ç–µ—Ä""|""–º—è...",316328,11394,352,550,https://i.ytimg.com/vi/go-F6xvezAM/default.jpg,False,False,False,–ü–æ–ø—É–ª—è—Ä–Ω—ã–π –Ω–æ–º–µ—Ä –∏–∑ –Ω–æ–≤–æ–≥–æ —à–æ—É –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏...
40739,uFbB5VROe80,2018-06-14,Product Placement: So tricksen die Sender | Wa...,WALULIS,2018-06-11,1,14:10:38,"walulis|""product placement""|""RTL""|""Dschungelca...",167352,8666,173,776,https://i.ytimg.com/vi/uFbB5VROe80/default.jpg,False,False,False,"Wir sollten uns l√§ngst daran gew√∂hnt haben, we..."
40711,s2EkoyYN5vs,2018-06-14,"M.O.030 √ºber Bushido, den EGJ-Deal, Fake Frien...",Hiphop.de,2018-06-13,24,18:52:53,"M.O.030|""Bushido""|""EGJ""|""ersguterjunge""|""Fake ...",41447,1183,91,434,https://i.ytimg.com/vi/s2EkoyYN5vs/default.jpg,False,False,False,Als Rooz und M.O.030 sich zum ersten Mal getro...
40710,qthbOQIUybY,2018-06-14,Lumi B - Makiato,Lumi B,2018-06-12,24,15:11:00,"lumi b|""lumi b - makiato""|""lumi""|""makiato""|""ma...",378417,7807,349,481,https://i.ytimg.com/vi/qthbOQIUybY/default.jpg,False,False,False,AVD Digital - http://smarturl.it/AVDdigitalPro...
40709,i63jWjoAWHE,2018-06-14,–°–µ–º—ë–Ω –°–ª–µ–ø–∞–∫–æ–≤: –û–ª√©-–û–ª√©-–û–ª√©!,–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤,2018-06-11,23,17:13:08,"–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤|""–≥–∏—Ç–∞—Ä–∞""|""–±–∞—Ä–¥-–¥–µ—Å—è—Ç–Ω–∏–∫""|""Comed...",6597033,301185,14448,16505,https://i.ytimg.com/vi/i63jWjoAWHE/default.jpg,False,False,False,–°–µ–º–µ–Ω –≤ —Å–æ—Ü–∏–∞–ª—å–Ω—ã—Ö —Å–µ—Ç—è—Ö:FACEBOOK https://www....


## Buscando o nome da categoria pelo identificador e armazenando em nova coluna

In [5]:
with open(category_filename) as f:
    category = json.load(f)

In [6]:
def category_replace(c_id):
    for i in category["items"]:
        if int(c_id) == int(i["id"]):
             return i["snippet"]["title"]
    return "None"

In [7]:
videos_lastentry['category_name'] = videos_lastentry.apply(lambda row: category_replace(row['category_id']), axis=1)
print(filename+" shape: "+str(videos_lastentry.shape))
videos_lastentry.head()

DEvideos.csv shape: (28331, 18)


Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name
40839,go-F6xvezAM,2018-06-14,–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä - –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏—Ö –ü–µ–ª—å–º–µ–Ω–µ–π –ë - –£—Ä...,–£—Ä–∞–ª—å—Å–∫–∏–µ –ü–µ–ª—å–º–µ–Ω–∏,2018-06-13,23,15:02:15,"–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä|""—É—Ä–∞–ª—å—Å–∫–∏–µ –ø–µ–ª—å–º–µ–Ω–∏ –≥–∏—Ä–æ—Å–∫—É—Ç–µ—Ä""|""–º—è...",316328,11394,352,550,https://i.ytimg.com/vi/go-F6xvezAM/default.jpg,False,False,False,–ü–æ–ø—É–ª—è—Ä–Ω—ã–π –Ω–æ–º–µ—Ä –∏–∑ –Ω–æ–≤–æ–≥–æ —à–æ—É –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏...,Comedy
40739,uFbB5VROe80,2018-06-14,Product Placement: So tricksen die Sender | Wa...,WALULIS,2018-06-11,1,14:10:38,"walulis|""product placement""|""RTL""|""Dschungelca...",167352,8666,173,776,https://i.ytimg.com/vi/uFbB5VROe80/default.jpg,False,False,False,"Wir sollten uns l√§ngst daran gew√∂hnt haben, we...",Film & Animation
40711,s2EkoyYN5vs,2018-06-14,"M.O.030 √ºber Bushido, den EGJ-Deal, Fake Frien...",Hiphop.de,2018-06-13,24,18:52:53,"M.O.030|""Bushido""|""EGJ""|""ersguterjunge""|""Fake ...",41447,1183,91,434,https://i.ytimg.com/vi/s2EkoyYN5vs/default.jpg,False,False,False,Als Rooz und M.O.030 sich zum ersten Mal getro...,Entertainment
40710,qthbOQIUybY,2018-06-14,Lumi B - Makiato,Lumi B,2018-06-12,24,15:11:00,"lumi b|""lumi b - makiato""|""lumi""|""makiato""|""ma...",378417,7807,349,481,https://i.ytimg.com/vi/qthbOQIUybY/default.jpg,False,False,False,AVD Digital - http://smarturl.it/AVDdigitalPro...,Entertainment
40709,i63jWjoAWHE,2018-06-14,–°–µ–º—ë–Ω –°–ª–µ–ø–∞–∫–æ–≤: –û–ª√©-–û–ª√©-–û–ª√©!,–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤,2018-06-11,23,17:13:08,"–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤|""–≥–∏—Ç–∞—Ä–∞""|""–±–∞—Ä–¥-–¥–µ—Å—è—Ç–Ω–∏–∫""|""Comed...",6597033,301185,14448,16505,https://i.ytimg.com/vi/i63jWjoAWHE/default.jpg,False,False,False,–°–µ–º–µ–Ω –≤ —Å–æ—Ü–∏–∞–ª—å–Ω—ã—Ö —Å–µ—Ç—è—Ö:FACEBOOK https://www....,Comedy


## Transformando categorias em OneHotEncoder

In [8]:
# DESNECESSARIO - UTILIZADO ONEHOTENCODER() NO MAPPER
#videos_lastentry = videos_lastentry.join(pd.get_dummies(videos_lastentry['category_name']))
#videos_lastentry.head()

## Definindo sentimento por video

In [9]:
#positive sentiment = 60% da quantidade de likes+dislikes
videos_lastentry['sentiment'] = videos_lastentry['likes'] >= 0.6*(videos_lastentry['likes']+videos_lastentry['dislikes']) 
videos_lastentry.head()

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name,sentiment
40839,go-F6xvezAM,2018-06-14,–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä - –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏—Ö –ü–µ–ª—å–º–µ–Ω–µ–π –ë - –£—Ä...,–£—Ä–∞–ª—å—Å–∫–∏–µ –ü–µ–ª—å–º–µ–Ω–∏,2018-06-13,23,15:02:15,"–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä|""—É—Ä–∞–ª—å—Å–∫–∏–µ –ø–µ–ª—å–º–µ–Ω–∏ –≥–∏—Ä–æ—Å–∫—É—Ç–µ—Ä""|""–º—è...",316328,11394,352,550,https://i.ytimg.com/vi/go-F6xvezAM/default.jpg,False,False,False,–ü–æ–ø—É–ª—è—Ä–Ω—ã–π –Ω–æ–º–µ—Ä –∏–∑ –Ω–æ–≤–æ–≥–æ —à–æ—É –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏...,Comedy,True
40739,uFbB5VROe80,2018-06-14,Product Placement: So tricksen die Sender | Wa...,WALULIS,2018-06-11,1,14:10:38,"walulis|""product placement""|""RTL""|""Dschungelca...",167352,8666,173,776,https://i.ytimg.com/vi/uFbB5VROe80/default.jpg,False,False,False,"Wir sollten uns l√§ngst daran gew√∂hnt haben, we...",Film & Animation,True
40711,s2EkoyYN5vs,2018-06-14,"M.O.030 √ºber Bushido, den EGJ-Deal, Fake Frien...",Hiphop.de,2018-06-13,24,18:52:53,"M.O.030|""Bushido""|""EGJ""|""ersguterjunge""|""Fake ...",41447,1183,91,434,https://i.ytimg.com/vi/s2EkoyYN5vs/default.jpg,False,False,False,Als Rooz und M.O.030 sich zum ersten Mal getro...,Entertainment,True
40710,qthbOQIUybY,2018-06-14,Lumi B - Makiato,Lumi B,2018-06-12,24,15:11:00,"lumi b|""lumi b - makiato""|""lumi""|""makiato""|""ma...",378417,7807,349,481,https://i.ytimg.com/vi/qthbOQIUybY/default.jpg,False,False,False,AVD Digital - http://smarturl.it/AVDdigitalPro...,Entertainment,True
40709,i63jWjoAWHE,2018-06-14,–°–µ–º—ë–Ω –°–ª–µ–ø–∞–∫–æ–≤: –û–ª√©-–û–ª√©-–û–ª√©!,–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤,2018-06-11,23,17:13:08,"–°–µ–º–µ–Ω –°–ª–µ–ø–∞–∫–æ–≤|""–≥–∏—Ç–∞—Ä–∞""|""–±–∞—Ä–¥-–¥–µ—Å—è—Ç–Ω–∏–∫""|""Comed...",6597033,301185,14448,16505,https://i.ytimg.com/vi/i63jWjoAWHE/default.jpg,False,False,False,–°–µ–º–µ–Ω –≤ —Å–æ—Ü–∏–∞–ª—å–Ω—ã—Ö —Å–µ—Ç—è—Ö:FACEBOOK https://www....,Comedy,True


## Normaliza√ß√£o das features num√©ricas

In [10]:
videos_lastentry.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,28331.0,28331.0,28331.0,28331.0,28331.0
mean,20.979528,490819.9,16124.91,1047.442,2107.585
std,6.861959,1912382.0,78378.85,10997.8,13076.84
min,1.0,518.0,0.0,0.0,0.0
25%,22.0,19786.5,404.0,22.0,60.0
50%,24.0,90182.0,1978.0,99.0,284.0
75%,24.0,392235.5,8651.0,416.0,1094.0
max,44.0,113876200.0,4924056.0,1470386.0,1084435.0


In [11]:
# Normaliza√ß√£o
videos_lastentry['views_nonnormal'] = videos_lastentry['views']
videos_lastentry['views'] = (videos_lastentry['views']-videos_lastentry['views'].min())/(videos_lastentry['views'].max()-videos_lastentry['views'].min())
#videos_lastentry['likes'] = (videos_lastentry['likes']-videos_lastentry['likes'].min())/(videos_lastentry['likes'].max()-videos_lastentry['likes'].min())
#videos_lastentry['dislikes'] = (videos_lastentry['dislikes']-videos_lastentry['dislikes'].min())/(videos_lastentry['dislikes'].max()-videos_lastentry['dislikes'].min())
videos_lastentry['comment_count'] = (videos_lastentry['comment_count']-videos_lastentry['comment_count'].min())/(videos_lastentry['comment_count'].max()-videos_lastentry['comment_count'].min())
videos_lastentry.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count,views_nonnormal
count,28331.0,28331.0,28331.0,28331.0,28331.0,28331.0
mean,20.979528,0.004306,16124.91,1047.442,0.001943,490819.9
std,6.861959,0.016794,78378.85,10997.8,0.012059,1912382.0
min,1.0,0.0,0.0,0.0,0.0,518.0
25%,22.0,0.000169,404.0,22.0,5.5e-05,19786.5
50%,24.0,0.000787,1978.0,99.0,0.000262,90182.0
75%,24.0,0.00344,8651.0,416.0,0.001009,392235.5
max,44.0,1.0,4924056.0,1470386.0,1.0,113876200.0


## Separando conjuntos de treino, valida√ß√£o e teste

In [12]:
train_data = videos_lastentry.sample(frac=0.8, random_state=200)
test_data = videos_lastentry.drop(train_data.index)
print(train_data.shape, test_data.shape)

(22665, 20) (5666, 20)


# Classificador

## Uni√£o de features

In [13]:
mapper = DataFrameMapper([
    ('title', CountVectorizer(max_features = 2000)),
    ('description', CountVectorizer(max_features = 2000)),
    ('tags', CountVectorizer(max_features = 2000)),
    ('category_name', LabelBinarizer()),
    ('comments_disabled', None),
    ('views', None),
    ('comment_count', None)
])
x_train = mapper.fit_transform(train_data)
y_train = train_data['sentiment'].values

In [14]:
x_train.shape

(22665, 6021)

In [15]:
classifier = LogisticRegression(penalty = 'l1')
classifier.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
x_test = mapper.transform(test_data)
y_test = test_data['sentiment'].values
predicted = classifier.predict(x_test)

In [17]:
probs = classifier.predict_proba(x_test)
print(probs)

[[4.08541978e-03 9.95914580e-01]
 [4.39578796e-03 9.95604212e-01]
 [1.60067515e-11 1.00000000e+00]
 ...
 [1.99485657e-02 9.80051434e-01]
 [2.42894891e-02 9.75710511e-01]
 [1.17800576e-02 9.88219942e-01]]


In [18]:
print("Acuracia: "+str(metrics.accuracy_score(y_test, predicted)))
print("ROC AUC score: "+str(metrics.roc_auc_score(y_test, probs[:, 1])))

Acuracia: 0.9745852453229792
ROC AUC score: 0.8441815089374539


In [19]:
predicted = classifier.predict(mapper.transform(videos_lastentry))

In [20]:
videos_output = videos_lastentry
videos_output['predicted_sentiment'] = predicted

videos_output = videos_output.sort_values('predicted_sentiment', ascending=False)
videos_output.head()

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,...,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name,sentiment,views_nonnormal,predicted_sentiment
40839,go-F6xvezAM,2018-06-14,–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä - –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏—Ö –ü–µ–ª—å–º–µ–Ω–µ–π –ë - –£—Ä...,–£—Ä–∞–ª—å—Å–∫–∏–µ –ü–µ–ª—å–º–µ–Ω–∏,2018-06-13,23,15:02:15,"–ì–∏—Ä–æ—Å–∫—É—Ç–µ—Ä|""—É—Ä–∞–ª—å—Å–∫–∏–µ –ø–µ–ª—å–º–µ–Ω–∏ –≥–∏—Ä–æ—Å–∫—É—Ç–µ—Ä""|""–º—è...",0.002773,11394,...,0.000507,https://i.ytimg.com/vi/go-F6xvezAM/default.jpg,False,False,False,–ü–æ–ø—É–ª—è—Ä–Ω—ã–π –Ω–æ–º–µ—Ä –∏–∑ –Ω–æ–≤–æ–≥–æ —à–æ—É –ê–∑–±—É–∫–∞ –£—Ä–∞–ª—å—Å–∫–∏...,Comedy,True,316328,True
13675,9VfUrlaC-gA,2018-01-23,–°–≤–µ–∂–∞–π—à–∞—è –ø—Ä–µ–º—å–µ—Ä–∞ 2018 [ –®–ö–û–õ–¨–ù–ê–Ø –ñ–ï–ù–ê ] –†—É—Å—Å...,–õ–£–ß–®–ï–ï 2017,2018-01-22,22,07:42:00,"–ù–û–í–ò–ù–ö–ò 2018|""–º–µ–ª–æ–¥—Ä–∞–º—ã –Ω–æ–≤–∏–Ω–∫–∏ 2018""|""–º–µ–ª–æ–¥—Ä–∞...",0.001072,271,...,2.6e-05,https://i.ytimg.com/vi/9VfUrlaC-gA/default.jpg,False,False,False,#–°–≤–µ–∂–∞–π—à–∞—è –ø—Ä–µ–º—å–µ—Ä–∞ 2018 [ –®–ö–û–õ–¨–ù–ê–Ø –ñ–ï–ù–ê ] –†—É—Å...,People & Blogs,True,122546,True
13639,DyXvOwcE8fs,2018-01-23,"Au√üerordentlicher Bundesparteitag 2018, Bonn",SPD,2018-01-21,25,16:39:26,"SPD|""Martin Schulz""|""Parteitag""|""Bundesparteit...",0.000308,272,...,4.1e-05,https://i.ytimg.com/vi/DyXvOwcE8fs/default.jpg,False,False,False,Macht der SPD-Bundesparteitag den Weg frei f√ºr...,News & Politics,False,35581,True
13641,90XVTC9uuBU,2018-01-23,"Next Level | DEUTSCHRAP MASHUP 2018 | (Azet, R...",KsFreakWhatElse,2018-01-20,24,18:04:46,"ks|""danergy""|""ksfreak""|""olexesh mob""|""bausa fe...",0.008721,97910,...,0.008335,https://i.ytimg.com/vi/90XVTC9uuBU/default.jpg,False,False,False,Wir sind nun endlich im Jahr 2018 angekommen! ...,Entertainment,True,993653,True
13632,QWQi7csx83w,2018-01-23,Mutfaƒüƒ±m - Mutfak D√ºzenim - Mutfaƒüƒ±mda Neler V...,Leyla ile Yemek Saati,2018-01-22,22,13:06:57,"Tarifler|""ana yemekler""|""yemekler""|""bug√ºn ne p...",0.000199,685,...,0.000157,https://i.ytimg.com/vi/QWQi7csx83w/default.jpg,False,False,False,SAYFALARIMIZ\n\nƒ∞nternet sitemiz : www.leylail...,People & Blogs,True,23157,True


In [21]:
f1_score(videos_output['sentiment'].astype(int), videos_output['predicted_sentiment'].astype(int), average='macro')

0.764668529643479

In [22]:
get_n_features = 10
print("The "+str(get_n_features)+" most positive-weighted words are: ")
for feature_id in sorted(range(len(classifier.coef_[0])), key=lambda i: classifier.coef_[0][i], reverse=True)[:get_n_features]:
    print("%s (%f)" % (mapper.transformed_names_[feature_id], classifier.coef_[0][feature_id]))
print()
print("The "+str(get_n_features)+" most negative-weighted words are: ")
for feature_id in sorted(range(len(classifier.coef_[0])), key=lambda i: classifier.coef_[0][i], reverse=False)[:get_n_features]:
    print("%s (%f)" % (mapper.transformed_names_[feature_id], classifier.coef_[0][feature_id]))

The 10 most positive-weighted words are: 
description_out (2.448390)
title_26 (2.014492)
tags_liebe (2.007110)
description_jeden (1.884737)
description_olmak (1.863164)
description_35 (1.614808)
description_dort (1.474410)
title_ÿ®€å (1.451132)
description_web (1.449048)
description_ÿ®ÿ±ŸÜÿßŸÖŸá (1.436796)

The 10 most negative-weighted words are: 
title_israel (-3.536757)
title_anne (-3.479515)
title_schumacher (-2.770111)
title_h√∂cke (-2.761517)
title_bieber (-2.400074)
title_ÿÆÿ®ÿ±€å (-2.331821)
title_–¥–µ–ª–µ (-2.329725)
title_helene (-2.117793)
title_eigentlich (-2.073846)
title_fortnite (-2.029657)


# K-Fold

In [23]:
def run_kfold(clf,X,Y):
    kf = KFold(n_splits=10)
    outcomes = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

In [24]:
run_kfold(classifier,x_train,y_train)



Mean Accuracy: 0.975071510303051


In [25]:
videos_lastentry['like_ratio'] = videos_lastentry['likes']/(videos_lastentry['likes']+videos_lastentry['dislikes'])

In [26]:
from IPython.display import HTML, display

# We choose the 10 most trending videos
selected_columns = ['title', 'channel_title', 'thumbnail_link', 'publish_date', 'category_name', 'likes', 'dislikes', 'views_nonnormal', 'like_ratio']

most_frequent = videos_lastentry.groupby(selected_columns)['video_id'].agg(
    {"code_count": len}).sort_values(by=['like_ratio']
).head(10).reset_index()

# Construction of HTML table with miniature photos assigned to the most popular movies
table_content = ''
max_title_length = 50

for date, row in most_frequent.T.iteritems():
    HTML_row = '<tr>'
    HTML_row += '<td><img src="' + str(row[2]) + '"style="width:100px;height:100px;"></td>'
    HTML_row += '<td>' + str(row[1]) + '</td>'
    HTML_row += '<td>' + str(row[0])  + '</td>'
    HTML_row += '<td>' + str(row[4]) + '</td>'
    HTML_row += '<td>' + str(row[3]) + '</td>'
    HTML_row += '<td>' + str(row[5]) + '</td>'
    HTML_row += '<td>' + str(row[6]) + '</td>'
    HTML_row += '<td>' + str(row[7]) + '</td>'
    
    table_content += HTML_row + '</tr>'

display(HTML(
    '<table><tr><th>Photo</th><th>Channel Name</th><th style="width:250px;">Title</th><th>Category</th><th>Publish Date</th><th>Likes</th><th>Dislikes</th><th>Views</th></tr>{}</table>'.format(table_content))
)

is deprecated and will be removed in a future version
  import sys


Photo,Channel Name,Title,Category,Publish Date,Likes,Dislikes,Views
,Cool Mobile,WOW üò± Neueste Smartphone Case Erfindung Samsung / iPhone,People & Blogs,2017-11-15 00:00:00,0,3,19121
,HINSTORES channel,Download and Fix ISDone.dll Error,Gaming,2017-07-03 00:00:00,0,1,2408
,Vinist News,Michael Schumacher: Dramatisches Statement von seiner Frau!,Entertainment,2017-11-12 00:00:00,51,1246,32090
,Washington Post,The FCC repeals its net neutrality rules,News & Politics,2017-12-14 00:00:00,4870,110709,985179
,promiflash news,Michael Schumacher: Diese Nachricht h√§lt die Welt in Atem!,Entertainment,2017-12-18 00:00:00,46,943,39063
,MasterChef Greece,MasterChefGR ‚Äì 2ŒøœÇ ŒöœçŒ∫ŒªŒøœÇ ‚Äì ŒïœÄŒµŒπœÉœåŒ¥ŒπŒø 33,People & Blogs,2018-02-21 00:00:00,259,4685,59065
,focus nachrichten,Florian Silbereisen: Drogen-Skandal!,News & Politics,2017-12-21 00:00:00,32,528,23941
,Promi News,Michael Schumacher: Das wunder der weihnacht - endlich zur√ºck!,Entertainment,2017-12-20 00:00:00,1515,21958,588667
,focus nachrichten,Herzogin Kate: Traurige Trennung in der Schwangerschaft,News & Politics,2018-01-03 00:00:00,20,271,35905
,BILD,Traut sich Kollegah nicht mehr alleine raus? | FIBO 2018,News & Politics,2018-04-16 00:00:00,2216,28975,437653


In [27]:
videos_lastentry.loc[videos_lastentry['description'].str.contains("andretti")]

Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,...,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category_name,sentiment,views_nonnormal,predicted_sentiment,like_ratio
