List of modules

In [1]:
import json

import pickle

from googleapiclient.discovery import build

from operator import itemgetter

import os
from os import listdir
from os.path import isfile, join

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

Read video file names

In [2]:
def readdata():
    """Return the names of the regular files in the ``videos`` directory.

    The directory is looked up relative to the current working directory.
    Sub-directories are skipped.

    Returns:
        list[str]: File names (not full paths), sorted alphabetically so the
        result is the same on every run and on every platform.

    Raises:
        OSError: If the ``videos`` directory cannot be listed.
    """
    # os.path.join picks the right separator; a hard-coded "\\" only works
    # on Windows.
    datapath = join(os.getcwd(), "videos")

    # os.listdir returns entries in arbitrary order, so sort explicitly.
    return sorted(f for f in listdir(datapath) if isfile(join(datapath, f)))

Read personality labels

In [3]:
def readdatapersonality():
    """Load the personality annotations from ``annotation_training.pkl``.

    The file is looked up in the current working directory.

    Returns:
        The unpickled object. Other cells index it as
        ``labels[trait_name][video_file_name]``.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # os.path.join picks the right separator; a hard-coded "\\" only works
    # on Windows.
    labelpath = join(os.getcwd(), "annotation_training.pkl")

    # SECURITY: unpickling can execute arbitrary code. Only load annotation
    # files obtained from a trusted source.
    with open(labelpath, 'rb') as f:
        # encoding='latin1' decodes 8-bit strings stored in the pickle
        # (presumably because the file was written by Python 2 - confirm).
        labeldata = pickle.load(f, encoding='latin1')

    return labeldata

Build the YouTube API client

In [4]:
def youtube_build():
    """Create an API client through googleapiclient's ``build``.

    The service name, API version and developer key are taken from the
    module-level constants ``YOUTUBE_API_SERVICE_NAME``,
    ``YOUTUBE_API_VERSION`` and ``DEVELOPER_KEY``. They are looked up when
    this function is called, so they must be defined before the first call.

    Returns:
        The client object returned by ``build``.
    """
    return build(
        YOUTUBE_API_SERVICE_NAME,
        YOUTUBE_API_VERSION,
        developerKey=DEVELOPER_KEY,
    )

Extract video's comments

In [5]:
def get_video_comments(service, *, max_comments=300, **kwargs):
    """Collect top-level comments for a video, following result pages.

    Args:
        service: Client object exposing ``commentThreads().list(...).execute()``.
        max_comments: Upper bound on the number of comments returned
            (keyword-only, default 300).
        **kwargs: Passed through to ``commentThreads().list``. ``pageToken``
            is added/overwritten while paging.

    Returns:
        dict | bool: ``False`` when the first request fails or no comment was
        collected. Otherwise a dict with the parallel lists ``'Comment'``,
        ``'Username'``, ``'Date'`` and ``'Like'``, one entry per comment.
    """
    comments = []
    usernames = []
    dates = []
    likes = []

    try:
        results = service.commentThreads().list(**kwargs).execute()
    except Exception:
        # Best effort: a failed first request means "no data for this video".
        return False

    while results and len(comments) < max_comments:
        for item in results.get('items', []):
            # Enforce the cap inside the page as well, so it holds for any
            # page size (an equality check between pages can be stepped over).
            if len(comments) >= max_comments:
                break
            snippet = item['snippet']['topLevelComment']['snippet']
            comments.append(snippet['textDisplay'])
            usernames.append(snippet['authorDisplayName'])
            dates.append(snippet['publishedAt'])
            likes.append(snippet['likeCount'])

        if 'nextPageToken' not in results or len(comments) >= max_comments:
            break

        kwargs['pageToken'] = results['nextPageToken']
        try:
            results = service.commentThreads().list(**kwargs).execute()
        except Exception:
            # Same best-effort policy as the first request: keep what was
            # already collected instead of losing it to a late failure.
            break

    if not comments:
        return False

    return {'Comment': comments, 'Username': usernames, 'Date': dates, 'Like': likes}

Get personality values for each video

In [6]:
def get_personalities(vid, lab):
    """Look up the five personality scores stored for one video.

    Args:
        vid: Key identifying the video inside each trait mapping.
        lab: Mapping from lower-case trait name to a mapping of
            video key -> score.

    Returns:
        dict: Capitalised trait name -> ``lab[trait][vid]``, in the order
        Agreeableness, Conscientiousness, Extraversion, Neuroticism, Openness.

    Raises:
        KeyError: If a trait or the video key is missing from ``lab``.
    """
    # (output key, key used inside ``lab``)
    trait_columns = (
        ('Agreeableness', 'agreeableness'),
        ('Conscientiousness', 'conscientiousness'),
        ('Extraversion', 'extraversion'),
        ('Neuroticism', 'neuroticism'),
        ('Openness', 'openness'),
    )
    return {label: lab[column][vid] for label, column in trait_columns}

Combine comment and personality data

In [7]:
def extractdata(listvideo, listlabel):
    """Fetch comments for each distinct video and pair them with its labels.

    Args:
        listvideo: Iterable of video file names. The video id is taken to be
            the name without its last 8 characters (assumes a suffix such as
            ``.000.mp4`` - TODO confirm against the dataset).
        listlabel: Label mapping passed on to ``get_personalities``.

    Returns:
        list[dict]: One ``{'Video', 'Personality', 'Data'}`` entry per video
        id for which comments could be retrieved. When several files share an
        id, the first file encountered supplies the personality labels.
    """
    cmlist = []
    seen_ids = set()   # ids already handled, regardless of their position in the list
    service = None     # API client, created once on first use and then reused

    for video in listvideo:
        video_id = video[:-8]
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)

        if service is None:
            service = youtube_build()

        comment_data = get_video_comments(
            service, part='snippet', videoId=video_id, textFormat='plainText')
        if not comment_data:
            # No comments (or the request failed): skip this video.
            continue

        cmlist.append({
            'Video': video_id,
            'Personality': get_personalities(video, listlabel),
            'Data': comment_data,
        })

    return cmlist

Write and save combined data in JSON file

In [8]:
def writefile(cmlist):
    """Append ``cmlist`` to the JSON array stored in ``ssp2.json``.

    The file lives in the current working directory (``os.getcwd()``). If it
    already holds a JSON array, the new entries are added to the end of that
    array; otherwise the file is created.

    Args:
        cmlist: List of JSON-serialisable entries to add.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    target = 'ssp2.json'
    scratch = 'ssp2_new.json'

    # Merge at the data level. Editing the serialised text (e.g. replacing
    # "][" with ",") would also alter comments that contain "][" and yields
    # invalid JSON when one of the batches is an empty list.
    combined = []
    if os.path.exists(target) and os.path.getsize(target) > 0:
        with open(target, 'r') as fp:
            combined = json.load(fp)
    combined.extend(cmlist)

    # Write to a scratch file first and swap it in, so a failure while
    # writing never leaves a truncated ssp2.json behind.
    with open(scratch, 'w') as fp:
        json.dump(combined, fp)
    os.replace(scratch, target)

Read JSON file

In [9]:
def readfile():
    """Load ``ssp2.json`` and return its entries ordered by ``'Video'``.

    The file is opened relative to the current working directory.

    Returns:
        list: A new list with the loaded entries sorted ascending by their
        ``'Video'`` value (stable for equal values).
    """
    with open('ssp2.json', 'r') as source:
        records = json.load(source)

    return sorted(records, key=lambda record: record['Video'])

Extraction code

In [10]:
# The API key is read from the environment so it is never stored in the
# notebook. Any key that was previously committed here should be treated as
# leaked and revoked.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
    )
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Inputs: video file names and the personality annotations.
list_of_videonames = readdata()
list_of_labels = readdatapersonality()

# Download comments and attach the labels of each video.
complist = extractdata(list_of_videonames, list_of_labels)

# Persist the result, then load the full stored data set back (sorted by video).
writefile(complist)

complistnew = readfile()

Train and test initialization

In [None]:
# Trait names; a trait's position here is the index of its label list below.
category = ['Agreeableness', 'Conscientiousness', 'Extraversion', 'Neuroticism', 'Openness']

# One text document per video.
comment_data_train = []
comment_data_test = []

# One independent label list per trait.
label_data_train = [[] for _ in category]
label_data_test = [[] for _ in category]

Create train data

In [None]:
# Build the training set: one document per video (all of its comments joined
# with single spaces) and, for every trait, the class of that video.
# NOTE(review): determineclass is not defined in any cell visible in this
# notebook - make sure the cell defining it runs before this one.

# Start from empty containers so re-running this cell does not duplicate data.
comment_data_train = []
label_data_train = [[] for _ in category]

for viddata in complistnew:
    comment_data_train.append(" ".join(viddata['Data']['Comment']))
    # category[i] is the trait whose labels are stored in label_data_train[i].
    for trait_index, trait in enumerate(category):
        label_data_train[trait_index].append(
            determineclass(viddata['Personality'][trait], trait_index))

Create test data

In [None]:
# Build the test set: one document per video (all of its comments joined with
# single spaces) and, for every trait, the class of that video.
# NOTE(review): neither complistnew2 nor determineclass is defined in any
# cell visible in this notebook - make sure they exist before running this.

# Start from empty containers so re-running this cell does not duplicate data.
comment_data_test = []
label_data_test = [[] for _ in category]

for viddata in complistnew2:
    comment_data_test.append(" ".join(viddata['Data']['Comment']))
    # category[i] is the trait whose labels are stored in label_data_test[i].
    for trait_index, trait in enumerate(category):
        label_data_test[trait_index].append(
            determineclass(viddata['Personality'][trait], trait_index))

Pipeline Count

In [None]:
# Count-based pipeline: lower-cased unigram/bigram counts with English stop
# words removed, fed to a one-vs-rest multinomial Naive Bayes classifier.
NB_pipeline_cnt = Pipeline(steps=[
    (
        'count',
        CountVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True),
    ),
    (
        'clf',
        OneVsRestClassifier(MultinomialNB(class_prior=None, fit_prior=True)),
    ),
])

Check Performance Count

In [None]:
# Fit and score the count-based pipeline once per trait.
# enumerate keeps `count` in step with `cat`, so each trait is trained and
# evaluated on its own label list rather than always on index 0.
for count, cat in enumerate(category):
    print('... Processing {}'.format(cat))
    NB_pipeline_cnt.fit(comment_data_train, label_data_train[count])
    # compute the testing accuracy
    prediction = NB_pipeline_cnt.predict(comment_data_test)
    print('Test accuracy is {}'.format(accuracy_score(label_data_test[count], prediction)))

Pipeline TfIdf

In [None]:
# TF-IDF pipeline: lower-cased unigram/bigram TF-IDF weights with English stop
# words removed, fed to a one-vs-rest multinomial Naive Bayes classifier.
# The class is spelled TfidfVectorizer, matching the import at the top.
NB_pipeline_tid = Pipeline([
                # Step name 'count' is kept unchanged in case it is referenced elsewhere.
                ('count', TfidfVectorizer(lowercase=True, ngram_range=(1,2), stop_words='english')),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
                        ])

Check Performance TfIdf

In [None]:
# Fit and score the TF-IDF pipeline once per trait.
# enumerate keeps `count` in step with `cat`, so each trait is trained and
# evaluated on its own label list rather than always on index 0.
for count, cat in enumerate(category):
    print('... Processing {}'.format(cat))
    NB_pipeline_tid.fit(comment_data_train, label_data_train[count])
    # compute the testing accuracy
    prediction = NB_pipeline_tid.predict(comment_data_test)
    print('Test accuracy is {}'.format(accuracy_score(label_data_test[count], prediction)))