In [65]:
import pandas as pd
import numpy as np
import requests
import re
import json
import string 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [66]:
# Load the MBTI posts dataset; the path is relative to this notebook's directory.
data = pd.read_csv('../data/mbti_1.csv')

In [67]:
# Preview the first rows to check the `type` label and the raw `posts` text.
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [52]:
# Individual posts inside a row are delimited by '|||'; split the first row's
# text on that delimiter and count how many posts it contains.
item_0 = data.posts.iloc[0].split('|||')
len(item_0)

50

In [53]:
# Look at the first three posts of that row to see what raw entries look like
# (links, free text, and any stray quote characters).
item_0[:3]

["'http://www.youtube.com/watch?v=qsXHcwe3krw",
 'http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg',
 'enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks']

In [73]:
# clean youtube links
def clean_yt_link(link):
    """Return ``link`` with every single and double quote character removed.

    Quote characters were seen attached to YouTube links in the raw posts,
    so they are stripped before the link is requested. Nothing else in the
    link is altered.
    """
    for quote_char in ("\'", '\"'):
        link = link.replace(quote_char, '')
    return link

In [74]:
# scrape youtube title
def get_yt_title(url, add_description=False, timeout=10):
    """Download a YouTube page and return its title, optionally with the description.

    Parameters
    ----------
    url : str
        Address of the page to download.
    add_description : bool, default False
        When True, the text found between ``"shortDescription":`` and
        ``,"isCrawlable"`` in the page source is appended after the title.
    timeout : float, default 10
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    str
        The page title, followed by a single space and the description when
        one was requested and found. An empty string when the page could not
        be downloaded or contains neither piece of text.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Dead or malformed links are expected in forum posts; treat them as
        # "no title" so a single bad URL does not abort a whole batch run.
        return ''

    pieces = []

    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find('title')
    if title_tag is not None:
        # get_text() always returns a str, whereas .string is None for an
        # empty <title> or one that contains nested markup.
        title = title_tag.get_text().strip()
        if title:
            pieces.append(title)

    if add_description:
        # Non-greedy capture: stop at the first ,"isCrawlable" after the key
        # instead of running on to the last occurrence in the page.
        found = re.search(r'"shortDescription":(.+?),"isCrawlable"', response.text, re.DOTALL)
        if found:
            raw_desc = found.group(1)
            try:
                # The captured value is normally a JSON string literal;
                # decoding it drops the surrounding quotes and resolves
                # escapes such as \n, \" and \uXXXX.
                decoded = json.loads(raw_desc)
            except ValueError:
                decoded = None
            if isinstance(decoded, str):
                desc = decoded
            else:
                # Not a JSON string: fall back to the plain textual clean-up.
                desc = raw_desc.replace('\\n', ' ')
            # Collapse newlines and runs of whitespace into single spaces.
            desc = ' '.join(desc.split())
            if desc:
                pieces.append(desc)

    # Joining with a space keeps the title and description from fusing into
    # one token once punctuation is stripped downstream.
    return ' '.join(pieces)

In [75]:
def replace_youtube(posts, add_description=False):
    """Split a '|||'-delimited string into posts, swapping YouTube links for titles.

    Each post is broken on single spaces. Any token containing a YouTube
    watch URL (http or https) is cleaned with ``clean_yt_link`` and replaced
    by the result of ``get_yt_title``; every other token is kept unchanged.
    The tokens of each post are then re-joined with single spaces.

    Parameters
    ----------
    posts : str
        All posts of one row, separated by '|||'.
    add_description : bool, default False
        Forwarded to ``get_yt_title``.

    Returns
    -------
    list of str
        One string per post, in the original order.
    """
    yt_markers = ('http://www.youtube.com/watch?v=', 'https://www.youtube.com/watch?v=')

    def swap_token(token):
        # Only tokens that embed a watch URL are looked up; the rest pass through.
        if any(marker in token for marker in yt_markers):
            return get_yt_title(clean_yt_link(token), add_description)
        return token

    # First resolve every token of every post...
    token_rows = [
        [swap_token(token) for token in single_post.split(' ')]
        for single_post in posts.split('|||')
    ]

    # ...then glue each post back together, since a title may span several
    # words while the untouched tokens are single words.
    return [' '.join(tokens) for tokens in token_rows]

In [76]:
def remove_links(post_split):
    # split each post into a list of individual words
    post_split_split = [x.split(' ') for x in post_split]
    
    # removes any 'words' that have http:// or https:// in them
    return_list = [[item for item in sentence if ('http://' not in item and 'https://' not in item)] for sentence in post_split_split]
    
    # returns a list of posts if they are not empty after removing the links
    return [' '.join(sentence) for sentence in return_list if sentence]

In [77]:
def preprocess(post, add_description=False):
    """Turn one row's '|||'-delimited posts into lists of lemmatized tokens.

    Steps, in order:
      1. ``replace_youtube`` swaps YouTube links for video titles.
      2. ``remove_links`` drops remaining link tokens (and posts left empty).
      3. Punctuation, the '►' and '•' glyphs and soft hyphens are deleted and
         the text is lower-cased.
      4. Digit characters are removed.
      5. Each post is tokenized and English stop words are filtered out.
      6. Surviving tokens are lemmatized; posts with no tokens are dropped.

    Parameters
    ----------
    post : str
        All posts of one row, separated by '|||'.
    add_description : bool, default False
        Forwarded to ``replace_youtube``.

    Returns
    -------
    list of list of str
        One token list per post that still has content.
    """
    # Steps 1-2: titles in, links out.
    texts = remove_links(replace_youtube(post, add_description))

    # Step 3: a single translation table deletes all unwanted characters at once.
    remove_punc = string.punctuation + '►•'
    deletion_table = str.maketrans('', '', remove_punc + '\xad')
    texts = [text.translate(deletion_table).lower() for text in texts]

    # Step 4: filter character by character so every digit is removed.
    texts = [''.join(char for char in text if not char.isdigit()) for text in texts]

    # Steps 5-6: tokenize, filter stop words, lemmatize what is left.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    processed = []
    for text in texts:
        tokens = [token for token in word_tokenize(text) if token not in stop_words]
        if tokens:
            processed.append([lemmatizer.lemmatize(token) for token in tokens])
    return processed

In [81]:
# Display the DataFrame before running the full preprocessing.
data

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [82]:
# Run the full preprocessing on every row of `posts`.
# NOTE: preprocess -> replace_youtube -> get_yt_title issues one HTTP request
# per YouTube link, so this cell is network-bound; the result is only
# displayed, not stored in a variable.
data['posts'].apply(preprocess)

KeyboardInterrupt: 