# Data Wrangling Personalitics Data

## 0.) Load the data

In [2]:
import pandas as pd
import json
import sqlite3
from bs4 import BeautifulSoup
from utils import parse_type_personality_cafe, unpack_topic_user, unpack_subcomments, \
                  unpack_subcomment_user, html_to_text, parse_type_16personality
from utils import TYPES

In [2]:
# Load the raw discussion export; `id` is renamed to `topic_id` in the same step.
df = pd.read_csv('../output/discussion_2.csv').rename(columns={'id': 'topic_id'})

In [3]:
# Quick size check (rows, columns) on the freshly loaded frame.
print('DataFrame Shape:', df.shape)

DataFrame Shape: (65171, 10)


## 1.) Unpack the topic_user column then merge it into the DataFrame

In [4]:
# unpack_topic_user receives each row's `url` and `topic_user`; the per-row
# results are stacked into one frame (presumably one small DataFrame per
# topic, keyed by `url` - verify against utils.unpack_topic_user).
df_users = pd.concat(
    df[['url', 'topic_user']]
    .apply(unpack_topic_user, axis=1)
    .tolist()
)

In [5]:
# Attach the unpacked author fields to each topic row, matching on the topic URL.
# NOTE(review): an inner join on `url` multiplies rows if `url` is not unique
# on either side - confirm uniqueness before relying on the row count.
df = df.merge(df_users, how='inner', on='url')

# Check the modified DF's columns
print('DataFrame Columns:', df.columns)

DataFrame Columns: Index(['comment_list', 'created_at', 'topic_id', 'project', 'server', 'spider',
       'topic_post', 'topic_title', 'topic_user', 'url', 'avatar', 'user_name',
       'user_type', 'posted_time_text', 'posted_datetime'],
      dtype='object')


In [6]:
# Checkpoint: persist the topic-level frame (without the pandas index); the next
# section reloads this file instead of reusing the in-memory frame.
df.to_csv('../output/topic_discussion.csv', index=False)

## 2.) Unpack the comment_list (subcomment) column then merge it into the DataFrame

In [8]:
# Reload the topic-level checkpoint from disk.
df = pd.read_csv('../output/topic_discussion.csv')

# Replace each raw `comment_list` value with whatever unpack_subcomments returns
# for that row (it is given the row's `comment_list` and `url`).
df = df.assign(
    comment_list=df[['comment_list', 'url']].apply(unpack_subcomments, axis=1)
)

In [9]:
# Stack the per-topic results held in `comment_list` into one long frame.
df_subcomment = pd.concat(df['comment_list'].tolist())

In [10]:
# Size of the stacked sub-comment frame (rows, columns).
df_subcomment.shape

(605019, 25)

In [11]:
# List the columns produced by unpacking, before the user field is expanded.
df_subcomment.columns

Index(['id', 'answerBody', 'answerBodyRaw', 'approved', 'approvedAtNice',
       'approvedAtDiff', 'createdAtDiff', 'disapprovalReason',
       'hasDisapprovalReason', 'hasUnreadSubComments', 'own', 'reportCount',
       'reportedByUser', 'reportIgnored', 'reviewed', 'subCommentCount',
       'totalVotingScore', 'unavailable', 'unread', 'updatedByUser',
       'updatedByUserDiff', 'upvotedByUser', 'user', 'states', 'url'],
      dtype='object')

## 2.1.) Unpack the subcomment user column

In [14]:
# Expand the nested `user` field: each row's `user` and `id` are handed to
# unpack_subcomment_user. The result is merged back on `id` in the next cell,
# so the helper presumably returns a row that still carries `id` - TODO confirm
# against utils.unpack_subcomment_user.
df_subcomment_user = df_subcomment[['user', 'id']].apply(unpack_subcomment_user, axis=1)

In [15]:
# Join the expanded user fields back onto their sub-comments via the comment `id`.
df_subcomment = df_subcomment.merge(df_subcomment_user, how='inner', on='id')

In [16]:
# Prefix every column with 'sub-' so sub-comment fields cannot collide with
# topic-level columns. Not idempotent: re-running this cell prefixes again.
df_subcomment.columns = ['sub-' + column for column in df_subcomment.columns]

In [17]:
# Re-check the size after the user merge; the row count should be unchanged
# if `id` is unique on both sides of the join.
df_subcomment.shape

(605019, 36)

In [18]:
# Persist the flattened sub-comments (without the pandas index); this file is
# the input of aggregate_data('16personalities_discussion_comments').
df_subcomment.to_csv('../output/comment_discussion.csv', index=False)

In [None]:
# df_merged = pd.merge(df, df_subcomment, how='inner', left_on='url', right_on='sub-url')
# print('Merged DF Shape:', df_merged.shape)

In [None]:
# # Check the columns
# df_merged.columns

## 3.) Drop unnecessary columns

In [None]:
# df_merged = df_merged.drop(['comment_list', 'topic_user', 'sub-user'], axis=1)

In [None]:
# df_merged.columns

In [None]:
# df_merged.dtypes

In [None]:
# old_mem_usage = df_merged.memory_usage().sum()
# old_mem_usage

In [None]:
# df_merged['sub-id'] = df_merged['sub-id'].astype('int64')
# df_merged['sub-approved'] = df_merged['sub-approved'].astype('int8')
# df_merged['sub-createdAtDiff'] = df_merged['sub-createdAtDiff'].astype('int32')
# df_merged['sub-reportCount'] = df_merged['sub-reportCount'].astype('int32')
# df_merged['sub-reviewed'] = df_merged['sub-reviewed'].astype('int8')
# df_merged['sub-subCommentCount'] = df_merged['sub-subCommentCount'].astype('int32')
# df_merged['sub-totalVotingScore'] = df_merged['sub-totalVotingScore'].astype('int32')
# df_merged['sub-unavailable'] = df_merged['sub-unavailable'].astype('int8')
# df_merged['sub-upvotedByUser'] = df_merged['sub-upvotedByUser'].astype('bool')

# df_merged['sub-updatedByUser'] = df_merged['sub-updatedByUser'].astype('bool')
# df_merged['sub-unread'] = df_merged['sub-unread'].astype('bool')
# df_merged['sub-reportIgnored'] = df_merged['sub-reportIgnored'].astype('bool')
# df_merged['sub-reportedByUser'] = df_merged['sub-reportedByUser'].astype('bool')
# df_merged['sub-own'] = df_merged['sub-own'].astype('bool')
# df_merged['sub-hasUnreadSubComments'] = df_merged['sub-hasUnreadSubComments'].astype('bool')
# df_merged['sub-hasDisapprovalReason'] = df_merged['sub-hasDisapprovalReason'].astype('bool')
# df_merged['sub-disapprovalReason'] = df_merged['sub-disapprovalReason'].fillna('N/A')
# df_merged['sub-states'] = df_merged['sub-states'].apply(lambda x: json.dumps(x))

In [None]:
# new_mem_usage = df_merged.memory_usage().sum()
# new_mem_usage

In [None]:
# memory_saved = (old_mem_usage - new_mem_usage)/1000000

In [None]:
# print('Total Memory Usage saved:', round(memory_saved, 2), 'mb')

In [None]:
# # Save the modified DataFrame
# df_merged.to_csv('../output/modified_discussion.csv', index=False)

## 4.) Aggregating text posts based on ID

In [3]:
def _join_posts_per_user(posts, user_col, type_col, text_col):
    """Concatenate all texts of each (user, type) pair into one space-separated string.

    Returns a frame with exactly two columns, ``[text_col, type_col]``, and one
    row per distinct ``(user_col, type_col)`` combination found in ``posts``.
    """
    return (
        posts[[user_col, text_col, type_col]]
        .groupby([user_col, type_col])
        .agg({text_col: ' '.join})
        .reset_index()[[text_col, type_col]]
    )


def aggregate_data(source):
    """Build one "all text per user" CSV for the requested data source.

    Parameters
    ----------
    source : str
        One of:

        * ``'personalitycafe'`` - combines ``../output/personality_cafe.csv``
          with the ``personalitics`` table of ``../output/project_mbti.db``.
        * ``'16personalities_discussion_comments'`` - reads
          ``../output/comment_discussion.csv``.
        * ``'16personalities_pub_comments'`` - reads
          ``../output/sixteenpersonalities.csv``.

    Returns
    -------
    None
        The aggregated frame (text column + type column) is written to
        ``../output/aggregated/``; that directory is not created here and
        must already exist.

    Raises
    ------
    ValueError
        If ``source`` is not one of the names listed above.
    """
    if source == 'personalitycafe':
        conn = sqlite3.connect('../output/project_mbti.db')
        try:
            df_db = pd.read_sql('SELECT * FROM personalitics', con=conn)
        finally:
            # Close explicitly so the database handle is not leaked.
            conn.close()

        posts = pd.concat([pd.read_csv('../output/personality_cafe.csv'), df_db])
        posts = posts.drop_duplicates()
        posts = posts.dropna(subset=['user_type'])
        posts = posts.assign(
            # A missing text becomes a blank so that ' '.join never sees NaN.
            child_text=posts['child_text'].fillna(' '),
            user_type=posts['user_type'].apply(parse_type_personality_cafe),
        )
        posts = posts[posts['user_type'].isin(TYPES)]
        aggregated = _join_posts_per_user(posts, 'user_id', 'user_type', 'child_text')
        aggregated.to_csv('../output/aggregated/data_personalitycafe.csv', index=False)

    elif source == '16personalities_discussion_comments':
        comments = pd.read_csv('../output/comment_discussion.csv')
        # Only these three columns feed the aggregation below.
        comments = comments[['sub-profileUrl', 'sub-answerBody', 'sub-type']]
        comments = comments.dropna(subset=['sub-type', 'sub-answerBody'])
        # Column names contain '-', so assign() needs the dict-unpacking form.
        comments = comments.assign(
            **{'sub-answerBody': comments['sub-answerBody'].apply(html_to_text)}
        )
        aggregated = _join_posts_per_user(
            comments, 'sub-profileUrl', 'sub-type', 'sub-answerBody'
        )
        aggregated.to_csv('../output/aggregated/data_discussion_16personalities.csv', index=False)

    elif source == '16personalities_pub_comments':
        posts = pd.read_csv('../output/sixteenpersonalities.csv')
        posts = posts.dropna(subset=['child_text', 'user_type'])
        posts = posts.assign(user_type=posts['user_type'].apply(parse_type_16personality))
        posts = posts[posts['user_type'].isin(TYPES)]
        aggregated = _join_posts_per_user(posts, 'user_id', 'user_type', 'child_text')
        aggregated.to_csv('../output/aggregated/data_pub_16personalities.csv', index=False)

    else:
        # Previously an unrecognised name fell through and silently did nothing.
        raise ValueError(
            "Unknown source {!r}; expected 'personalitycafe', "
            "'16personalities_discussion_comments' or "
            "'16personalities_pub_comments'".format(source)
        )

In [4]:
# Writes ../output/aggregated/data_personalitycafe.csv
aggregate_data('personalitycafe')

In [13]:
# Writes ../output/aggregated/data_discussion_16personalities.csv
# NOTE(review): the output below consists of warnings, not errors. The
# "looks like a URL" messages are BeautifulSoup complaining about comment
# bodies that are bare links or emoticons; they appear to originate from the
# BeautifulSoup(html.strip()) call shown in the output - presumably inside
# utils.html_to_text, confirm there.
aggregate_data('16personalities_discussion_comments')

  if self.run_code(code, result):


  soup = BeautifulSoup(html.strip())
https://www.youtube.com/watch?v=6_Y3zbRxZ6Q" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
https://www.youtube.com/watch?v=dQHphloYoPU(breathless)
https://www.youtube.com/watch?v=Vy0roGA3hQ4(aashayein)" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


:3" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [14]:
# Writes ../output/aggregated/data_pub_16personalities.csv
aggregate_data('16personalities_pub_comments')