In [68]:
import pandas as pd
import numpy as np
import requests
import re
import json
import string 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [69]:
# Load the raw dataset from the data directory next to this notebook's folder.
data = pd.read_csv('../data/mbti_1.csv')

In [70]:
# Preview the first rows of the raw frame (columns: type, posts).
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [71]:
# A single `posts` value packs several posts into one string delimited by '|||';
# split the first row's value and count how many posts it holds.
item_0 = data.posts.iloc[0].split('|||')
len(item_0)

50

In [72]:
# Show the first three entries of item_0 to see what individual posts look like.
item_0[:3]

["'http://www.youtube.com/watch?v=qsXHcwe3krw",
 'http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg',
 'enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks']

In [73]:
def split_post(post):
    """Break one raw `posts` string from the Kaggle data set into its individual posts.

    Parameters
    ----------
    post : str
        The raw string in which individual posts are delimited by '|||'.

    Returns
    -------
    list of str
        The pieces of `post` between the '|||' delimiters, in order.
    """
    delimiter = '|||'
    pieces = post.split(delimiter)
    return pieces

In [75]:
# clean youtube links
def clean_yt_link(link):
    """Return `link` with every single-quote and double-quote character removed.

    Stray quote characters were observed attached to YouTube links in the data,
    so they are stripped before the link is requested.

    Parameters
    ----------
    link : str
        A URL token taken from a post.

    Returns
    -------
    str
        The same text without any ' or " characters.
    """
    # A translation table with only a "delete" set drops both quote characters in one pass.
    quote_remover = str.maketrans('', '', '\'"')
    return link.translate(quote_remover)

In [76]:
# scrape youtube title
def get_yt_title(url, add_description=False, timeout=10):
    """Download a YouTube watch page and return its title, optionally with the description.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    add_description : bool, default False
        When True, also look for the ``"shortDescription"`` JSON string embedded
        in the page source and append it after the title.
    timeout : float, default 10
        Seconds handed to ``requests.get`` so one unresponsive link cannot
        stall a long batch run indefinitely.

    Returns
    -------
    str
        The text of the page's ``<title>`` tag, followed by a space and the
        description when requested and found. An empty string is returned when
        the request fails or the page has no usable title/description.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Scraping is best-effort: an unreachable or malformed link contributes
        # no text instead of aborting the processing of every other post.
        return ''

    soup = BeautifulSoup(response.content, "html.parser")
    parts = []

    title_tag = soup.find('title')
    if title_tag is not None:
        # `.string` is None for an empty <title> or one with nested nodes, which
        # would make string concatenation fail; get_text() always returns a str.
        parts.append(title_tag.get_text())

    if add_description:
        # The description is stored in the page source as a JSON string literal
        # right after the "shortDescription" key. Matching exactly one JSON
        # string (instead of a greedy `.+` up to the last "isCrawlable") keeps
        # the capture from swallowing unrelated page data.
        match = re.search(r'"shortDescription":("(?:[^"\\]|\\.)*")', response.text)
        if match:
            try:
                # Decoding the literal drops the surrounding quotes and turns
                # escapes (\n, \", \uXXXX) into the characters they stand for.
                description = json.loads(match.group(1))
            except ValueError:
                description = ''
            # Collapse newlines/tabs/repeated spaces into single spaces.
            parts.append(' '.join(description.split()))

    # Separate title and description with a space so their boundary words do
    # not fuse together once punctuation is stripped downstream.
    return ' '.join(part for part in parts if part)

In [77]:
def replace_youtube(post_split, add_description=False):
    """Swap every YouTube watch link inside the posts for the scraped video title.

    Parameters
    ----------
    post_split : list of str
        The individual posts of one user.
    add_description : bool, default False
        Forwarded to `get_yt_title`; when True the video description is
        appended to the title.

    Returns
    -------
    list of str
        One string per input post, in the same order. Each post is split on
        single spaces, tokens containing a YouTube watch URL are replaced by
        the result of `get_yt_title(clean_yt_link(token), add_description)`,
        and the tokens are joined back with single spaces.
    """
    watch_markers = (
        'http://www.youtube.com/watch?v=',
        'https://www.youtube.com/watch?v=',
    )

    def _title_or_token(token):
        # Only tokens that contain a watch URL trigger a lookup; everything
        # else passes through untouched.
        if any(marker in token for marker in watch_markers):
            return get_yt_title(clean_yt_link(token), add_description)
        return token

    # Re-joining per post keeps the result a flat list of post strings even
    # though a title may itself consist of several words.
    return [
        ' '.join(_title_or_token(token) for token in post.split(' '))
        for post in post_split
    ]

In [78]:
def remove_links(post_split):
    # split each post into a list of individual words
    post_split_split = [x.split(' ') for x in post_split]
    
    # removes any 'words' that have http:// or https:// in them
    return_list = [[item for item in sentence if ('http://' not in item and 'https://' not in item)] for sentence in post_split_split]
    
    # returns a list of posts if they are not empty after removing the links
    return [' '.join(sentence) for sentence in return_list if sentence]

In [79]:
def combine_text(post_split_split):
    """Flatten tokenised posts back into a single delimited string.

    Parameters
    ----------
    post_split_split : list of list of str
        One inner list of words per post.

    Returns
    -------
    str
        The words of each post joined by single spaces, and the posts joined
        by '|||'.
    """
    posts_as_text = map(' '.join, post_split_split)
    return '|||'.join(posts_as_text)

In [80]:
def preprocess(post_split, get_youtube=False, add_description=False):
    """Clean one user's posts and return them as a single '|||'-delimited string.

    Steps, in order:
      1. optionally replace YouTube links by their scraped titles,
      2. remove all remaining link tokens (posts made only of links vanish),
      3. strip punctuation and soft hyphens, lower-case, remove digit characters,
      4. tokenise, drop English stop words, lemmatise,
      5. drop posts that ended up empty and join everything back together.

    Parameters
    ----------
    post_split : list of str
        The individual posts of one user. The list itself is not modified.
    get_youtube : bool, default False
        When True, YouTube links are resolved through `replace_youtube`, which
        performs one HTTP request per link.
    add_description : bool, default False
        Forwarded to `replace_youtube`; only relevant when `get_youtube` is True.

    Returns
    -------
    str
        Words separated by single spaces, posts separated by '|||'.
    """
    if get_youtube:
        post_split = replace_youtube(post_split, add_description)

    # May return fewer posts than it received.
    post_split = remove_links(post_split)

    # One translation table deletes ASCII punctuation, the two bullet glyphs and
    # the soft hyphen in a single pass per post, instead of re-scanning (and
    # re-lower-casing) every post once per punctuation character.
    strip_table = str.maketrans('', '', string.punctuation + '►•' + '\xad')
    cleaned_posts = []
    for post in post_split:
        lowered = post.translate(strip_table).lower()
        # str.isdigit() also covers non-ASCII digit characters, so filter per
        # character rather than deleting only 0-9.
        cleaned_posts.append(''.join(ch for ch in lowered if not ch.isdigit()))

    # NOTE(review): apostrophes are already gone at this point, so contractions
    # arrive as e.g. "dont"/"youve" and are only filtered if that exact spelling
    # is in the stop-word list - confirm this is the intended behaviour.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lemmatized_posts = []
    for post in cleaned_posts:
        kept_tokens = [token for token in word_tokenize(post) if token not in stop_words]
        # Posts with nothing left after stop-word removal are dropped.
        if kept_tokens:
            lemmatized_posts.append([lemmatizer.lemmatize(token) for token in kept_tokens])

    return combine_text(lemmatized_posts)

In [84]:
# Try the pipeline on the first row only (default arguments, so YouTube links
# are dropped rather than looked up) and eyeball the cleaned text.
preprocess(data.posts.iloc[0].split('|||'))

'enfp intj moment sportscenter top ten play prank|||lifechanging experience life|||repeat today|||may perc experience immerse|||last thing infj friend posted facebook committing suicide next day rest peace|||hello enfj sorry hear distress natural relationship perfection time every moment existence try figure hard time time growth|||welcome stuff|||game set match|||prozac wellbrutin least thirty minute moving leg dont mean moving sitting desk chair weed moderation maybe try edible healthier alternative|||basically come three item youve determined type whichever type want would likely use given type cognitive function whatnot left|||thing moderation sims indeed video game good one note good one somewhat subjective completely promoting death given sim|||dear enfp favorite video game growing current favorite video game cool|||appears late sad|||there someone everyone|||wait thought confidence good thing|||cherish time solitude bc revel within inner world whereas time id workin enjoy time d

In [85]:
# Store each row's posts as a list by splitting the raw string on '|||'.
data['posts_split'] = data.posts.apply(split_post)

In [88]:
# Clean every row with preprocess' defaults (get_youtube=False): links are
# removed without any title lookup, so this cell makes no network requests.
data['linkfree_combined'] = data.posts_split.apply(preprocess)

In [90]:
# Check that the posts_split and linkfree_combined columns were added.
data.head()

Unnamed: 0,type,posts,posts_split,linkfree_combined
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",enfp intj moment sportscenter top ten play pra...
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,im finding lack post alarming|||sex boring pos...
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,good one|||course say know thats blessing curs...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",dear intp enjoyed conversation day esoteric ga...
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",youre fired|||thats another silly misconceptio...


In [91]:
# Keep only the label column and the cleaned text for export.
df = data[['type', 'linkfree_combined']]

In [92]:
# Preview the two-column frame that will be written out.
df.head()

Unnamed: 0,type,linkfree_combined
0,INFJ,enfp intj moment sportscenter top ten play pra...
1,ENTP,im finding lack post alarming|||sex boring pos...
2,INTP,good one|||course say know thats blessing curs...
3,INTJ,dear intp enjoyed conversation day esoteric ga...
4,ENTJ,youre fired|||thats another silly misconceptio...


In [95]:
# Confirm row count, dtypes and that neither column contains nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   type               8675 non-null   object
 1   linkfree_combined  8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [96]:
# Write the cleaned data set next to the raw file; index=False leaves the row index out of the CSV.
df.to_csv('../data/linkfree_combined.csv', index=False)