## EDA with NLP method

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from pymongo import MongoClient 

### Make a dataframe from mongodb

**Collections**
* video_detail : contains title, url, published date, video_id of video
* view_count : contains # of views, comments, likes
* comments : contatins comments for video


In [2]:
# connect mongodb through ssh tunnel
connection = MongoClient(port=47017)
db = connection['youtube_scrap']

### Load video detail to dataframe

In [3]:
# load from mongodb
video_coll = db['video_detail']
video_cur = video_coll.find({})
videos = [video for video in video_cur]

In [4]:
# make a dataframe
video_df = pd.DataFrame(videos)
video_df.head()

Unnamed: 0,_id,pubished,title,url,video_id
0,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98
1,5d13aedee70a248460e58340,"Premiered Apr 4, 2019",BLACKPINK - 'Kill This Love' M/V,https://www.youtube.com/watch?v=2S24-y0Ij3Y,2S24-y0Ij3Y
2,5d13aee5e70a248460e58341,"Published on Apr 12, 2019",BTS (방탄소년단) '작은 것들을 위한 시 (Boy With Luv) feat. ...,https://www.youtube.com/watch?v=XsX3ATc3FbA,XsX3ATc3FbA
3,5d13aeebe70a248460e58342,"Published on Jun 19, 2019","Stray Kids ""부작용(Side Effects)"" M/V",https://www.youtube.com/watch?v=5rPluw_-Eb4,5rPluw_-Eb4
4,5d13aef1e70a248460e58343,"Published on Jun 13, 2019",SOMI (전소미) - 'BIRTHDAY' M/V,https://www.youtube.com/watch?v=oDJ4ct59NC4,oDJ4ct59NC4


### Load comments to dataframe

In [5]:
# load from mongodb
comments_coll = db['comments']
comments_cur = comments_coll.find({})
comments = [comment for comment in comments_cur]

In [6]:
# make a dataframe
comments_df = pd.DataFrame(comments)
comments_df.head()

Unnamed: 0,_id,comment,video_id
0,5d13bbade70a248460e583ee,How many international fans are here ?\n\n\n\n...,YBnGBb1wg98
1,5d13bbaee70a248460e583ef,When the kpop world is influenzed by western s...,YBnGBb1wg98
2,5d13bbaee70a248460e583f0,🎢Zimzalabim🎢\nFull Daily Views (6pm KST)\n\nDa...,YBnGBb1wg98
3,5d13bbaee70a248460e583f1,26M yayyy..!! The only one kpop i STAN RED VEL...,YBnGBb1wg98
4,5d13bbaee70a248460e583f2,Some k-company: make their artist lipsinc to f...,YBnGBb1wg98


In [7]:
temp = pd.merge(video_df, comments_df, on='video_id')

In [8]:
temp.head()

Unnamed: 0,_id_x,pubished,title,url,video_id,_id_y,comment
0,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbade70a248460e583ee,How many international fans are here ?\n\n\n\n...
1,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583ef,When the kpop world is influenzed by western s...
2,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f0,🎢Zimzalabim🎢\nFull Daily Views (6pm KST)\n\nDa...
3,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f1,26M yayyy..!! The only one kpop i STAN RED VEL...
4,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f2,Some k-company: make their artist lipsinc to f...


### Load view_count to dataframe

In [9]:
view_count_coll = db['view_count']
view_count_cur = view_count_coll.find({})
view_count_cur.next()

{'_id': ObjectId('5d13b6a99ec6103a9817d745'),
 'title': '모모랜드(MOMOLAND) - 바나나차차(뽀로로 삽입곡) M/V',
 'view_count': 502364,
 'comment_count': 1530,
 'like_count': 25277,
 'timestamp': datetime.datetime(2019, 6, 26, 18, 17, 9, 330000)}

In [10]:
# single video has a lot of view counts by time
# need to get max value by title (need to change to video_id)
view_max_count =[row for row in view_count_coll.aggregate([
    {"$group" : {'_id':"$title", 'view_count':{"$max":"$view_count"}}}
])]

In [11]:
view_count_df = pd.DataFrame(view_max_count)
view_count_df.columns = ['title', 'view_count']

In [12]:
# merge with temp dataframe
df = pd.merge(temp, view_count_df, on='title')
df.head()

Unnamed: 0,_id_x,pubished,title,url,video_id,_id_y,comment,view_count
0,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbade70a248460e583ee,How many international fans are here ?\n\n\n\n...,26850555
1,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583ef,When the kpop world is influenzed by western s...,26850555
2,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f0,🎢Zimzalabim🎢\nFull Daily Views (6pm KST)\n\nDa...,26850555
3,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f1,26M yayyy..!! The only one kpop i STAN RED VEL...,26850555
4,5d13aed7e70a248460e5833f,"Premiered Jun 19, 2019",Red Velvet 레드벨벳 '짐살라빔 (Zimzalabim)' MV,https://www.youtube.com/watch?v=YBnGBb1wg98,YBnGBb1wg98,5d13bbaee70a248460e583f2,Some k-company: make their artist lipsinc to f...,26850555


## First spaghetti model

### Make a simple model with basic TFIDF & Linear Regression

In [13]:
# import prerequisties
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline


In [14]:
X = df['comment']

In [15]:
y = df['view_count']

In [16]:
# Parameters for TF-IDF vectorize
ngrams_1 = {
    'ngram_range': (1, 1),
    'stop_words': 'english',
    'dtype': 'int32',
    'strip_accents': 'unicode',
    'decode_error': 'replace',
    'analyzer': 'word',
    'min_df': 2
}

In [17]:
# Pipeline TFIDF & LinearRegression
pp_first = Pipeline([
    ('tfidf', TfidfVectorizer(ngrams_1)),
    ('classifier', LinearRegression())
])

In [18]:
# Split train & test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
pp_first.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8',
        input={'ngram_range': (1, 1), 'stop_words': 'english', 'dtype': 'int32', 'strip_accents': 'unicode', 'decode_error': 'replace', 'analyzer': 'word', ...ssifier', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [20]:
pp_first.score(X_train, y_train)

0.8629085093276447

In [21]:
pp_first.score(X_test, y_test)

-141.0434637063376

**Too Bad Score, find another way**

In [31]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
# opinion = TextBlob("EliteDataScience.com is dope!", analyzer=NaiveBayesAnalyzer())
# opinion.sentiment
from datetime import datetime

In [34]:
sentiment = TextBlob(comments_df.loc[0, 'comment'], analyzer=NaiveBayesAnalyzer()).sentiment
sentiment
    

Sentiment(classification='pos', p_pos=0.7865389082121107, p_neg=0.21346109178788916)