In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import nltk
import seaborn as sns
from sklearn import ensemble
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
videos = pd.read_csv('USvideos.csv')

In [3]:
# creating new features: ratio of likes, total votes, very well received or not
# and title + description (text)
videos['ratio'] = videos['likes'] / (videos['likes'] + videos['dislikes'])
videos['votes'] = videos['likes'] + videos['dislikes']
videos['vwr'] = np.where(videos['ratio']>=.9,1,0)
videos['text'] = videos['title'] + [' '] + videos['description']

In [4]:
# removing videos with lower amounts of votes
videos['likes'] = videos[videos['votes']>99]

In [5]:
videos.isnull().sum()

video_id                    0
trending_date               0
title                       0
channel_title               0
category_id                 0
publish_time                0
tags                        0
views                       0
likes                     989
dislikes                    0
comment_count               0
thumbnail_link              0
comments_disabled           0
ratings_disabled            0
video_error_or_removed      0
description               523
ratio                     170
votes                       0
vwr                         0
text                      523
dtype: int64

In [6]:
# not a very significant amount of missing data, we'll drop it
videos = videos.dropna()

In [7]:
videos.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,ratio,votes,vwr,text
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,2kyS6SvSYSE,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,0.95097,60493,1,WE WANT TO TALK ABOUT OUR MARRIAGE SHANTELL'S ...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,1ZAPwfrtAFY,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",0.940521,103331,1,The Trump Presidency: Last Week Tonight with J...
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,5qpjK5DgCt4,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,0.964729,151372,1,"Racist Superman | Rudy Mancuso, King Bach & Le..."
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,puqaWrEC7tY,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,0.93855,10838,1,Nickelback Lyrics: Real or Fake? Today we find...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,d380meD0W0M,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,0.985181,134224,1,I Dare You: GOING BALD!? I know it's been a wh...


In [8]:
videos.describe()

Unnamed: 0,category_id,views,dislikes,comment_count,ratio,votes,vwr
count,35131.0,35131.0,35131.0,35131.0,35131.0,35131.0,35131.0
mean,20.073696,2040962.0,3045.738,7350.666,0.935176,68993.27,0.841508
std,7.532392,6315747.0,23108.92,29035.79,0.099495,210419.3,0.365207
min,1.0,2413.0,0.0,0.0,0.039923,100.0,0.0
25%,17.0,232647.0,198.0,611.0,0.933144,5917.0,1.0
50%,24.0,616077.0,587.0,1740.0,0.967418,17788.0,1.0
75%,25.0,1612152.0,1730.0,5040.5,0.983209,51221.5,1.0
max,43.0,173478100.0,1643059.0,1114809.0,1.0,5219182.0,1.0


In [9]:
# define X and y
X = videos['description']
y = videos['vwr']
print ((y.value_counts()))

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

1    29563
0     5568
Name: vwr, dtype: int64


In [10]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [11]:
# rows are documents, columns are terms (phrases) (aka "tokens" or "features")
print (X_train_dtm.shape)
print (X_test_dtm.shape)

(26348, 62353)
(8783, 62353)


In [13]:
logreg = LogisticRegression()

print ('Features: ', X_train_dtm.shape[1])
print ('Accuracy: ', cross_val_score(logreg, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Features:  62353
Accuracy:  0.9681188703267436


In [14]:
rfc = ensemble.RandomForestClassifier()

print ('Accuracy: ', cross_val_score(rfc, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Accuracy:  0.9780248149385408


In [12]:
clf = ensemble.GradientBoostingClassifier()

print ('Accuracy: ', cross_val_score(clf, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Accuracy:  0.8697814882800714


In [16]:
vect_tfidf = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half of texts
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #adds 1 to all document frequencies
                            )
X_train_dtm = vect_tfidf.fit_transform(X_train)

In [21]:
print ('Features: ', X_train_dtm.shape[1])
print ('Accuracy: ', cross_val_score(logreg, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Features:  57933
Accuracy:  0.9196901560740663


In [22]:
print ('Accuracy: ', cross_val_score(rfc, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Accuracy:  0.9775313914756174


In [23]:
print ('Accuracy: ', cross_val_score(clf, X_train_dtm, y_train, cv=5, scoring='accuracy').mean())

Accuracy:  0.8736906228388704
