<a href="https://colab.research.google.com/github/PHMark/project_mbti/blob/master/personalitics/notebooks/Personalitics_Data_Wranggling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Wrangling Personalitics

## 0.) Load the data

In [1]:
# Copy the helper module utils.py from the Drive folder into the working directory.
# NOTE(review): the recorded output of this cell shows cp failing with
# "No such file or directory" - presumably because Drive is only mounted in a
# later cell; confirm the intended cell order.
!cp "drive/My Drive/ML Projects/Personalitics/notebooks/utils.py" .

cp: cannot stat 'drive/My Drive/ML Projects/Personalitics/notebooks/utils.py': No such file or directory


In [0]:
import pandas as pd
import json
import re
from datetime import datetime
from scipy.stats import mode
import sqlite3
from bs4 import BeautifulSoup
from tqdm import tqdm
# from utils import parse_type_personality_cafe, unpack_topic_user, unpack_comments, \
#                   unpack_comment_user, html_to_text, parse_type_16personality
# from utils import TYPES
import os
from dask import dataframe  as dd

# Root folder (relative to the working directory) for every file this notebook
# reads or writes.
OUTPUT_DIR = 'drive/My Drive/ML Projects/Personalitics/output/'
# Folders holding the per-chunk CSV files: one for the users attached to each
# comment, one for the comments themselves.
COMMENT_USER_DIR = '{}df_comment_user_chunks/'.format(OUTPUT_DIR)
COMMENT_DIR = '{}df_comment_chunks/'.format(OUTPUT_DIR)

In [0]:
# Mount Google Drive in the Colab runtime at /content/drive so the
# 'drive/My Drive/...' paths used throughout this notebook can be reached.
from google.colab import drive
drive.mount('/content/drive')

<b> Unpacking json objects </b>
<br>

Here I will preprocess the dataset gathered from the 16Personalities forum. The total number of rows of this dataset is ~900k, and there are 14 columns namely, comment_list, url, id, topic, datetime & 9 more columns which I will not use for now. In total, the dimension of the dataset that I will use is 900k x 3 (comment_list, url, id).

The comment column is in the form of a json object, inside this object is the text comment itself as well as a nested json which contains the information of the user that owns that comment. 

Here is the summary of the json unpacking:

1.) Load the dataset in chunks. I did this because my computer's RAM could not hold such a large dataset all at once.

2.) Unpack the comment by loading it in the JSON reader and traversing through its keys. Then load the comments into a dataframe.

3.) The same process applies for the user json.

4.) Save each chunked DataFrame to a CSV file.



In [0]:
# Stream the raw dump in 4000-row chunks so the full CSV never has to be held
# in memory at once; only the three columns used below are parsed.
df_chunks = pd.read_csv(OUTPUT_DIR + 'discussion_2.csv', chunksize=4000,
                        usecols=['comment_list', 'url', 'id'])
# List of columns of interest
comment_user_cols = ['id', 'profileUrl', 'avatar', 'gender', 'reputation', 'type']
comment_cols = ['id', 'approvedAtNice', 'answerBody', 'url']

# DataFrame.to_csv does not create missing folders, so make sure both chunk
# folders exist before the first write.
os.makedirs(COMMENT_USER_DIR, exist_ok=True)
os.makedirs(COMMENT_DIR, exist_ok=True)

# NOTE(review): unpack_comments / unpack_comment_user are expected to come from
# utils; that import is currently commented out in the imports cell.
for i, chunk in tqdm(enumerate(df_chunks)):
    # Unpack the comment json (the per-row results are concatenated into one
    # frame for the whole chunk)
    chunk_comment = chunk[['comment_list', 'url']].apply(unpack_comments, axis=1)
    chunk_comment = pd.concat(chunk_comment.values)

    # Unpack the comment_user json
    chunk_comment_user = chunk_comment[['user', 'id']].apply(unpack_comment_user, axis=1)

    # Save into csv files; both files of a chunk share the same index i so
    # they can be paired up again later
    chunk_comment_user[comment_user_cols].to_csv(
        COMMENT_USER_DIR + 'df_comment_user{}.csv'.format(i), index=False)
    chunk_comment[comment_cols].to_csv(
        COMMENT_DIR + 'df_comment{}.csv'.format(i), index=False)

## 1.) Merging the user and the comment dataframe chunks

Since the user DataFrame chunks and the comment DataFrame chunks are associated with each other, I merged them into one DataFrame. Finally, I concatenated all of the DF chunks into one big DataFrame, with a shape of 1.2m rows x 9 columns.

In [0]:
def get_comment_df():
    """Merge every user chunk with its comment chunk and stack the results.

    The files in COMMENT_USER_DIR and COMMENT_DIR are paired positionally
    after sorting both listings by name. ``os.listdir`` returns entries in
    arbitrary order, so without sorting a user chunk could be zipped with an
    unrelated comment chunk and the inner merge on ``id`` would silently drop
    rows. Sorting lines the pairs up as long as both folders hold the same
    chunk indices (files named ``df_comment_user<i>.csv`` and
    ``df_comment<i>.csv``).

    Returns:
        A DataFrame with the inner-joined (on ``id``) rows of all chunk
        pairs, concatenated in file order.

    Raises:
        ValueError: Propagated from ``pd.concat`` when no chunk files exist.
    """
    user_files = sorted(os.listdir(COMMENT_USER_DIR))
    comment_files = sorted(os.listdir(COMMENT_DIR))
    n_pairs = min(len(user_files), len(comment_files))

    tmp_df_ls = []
    for file_user, file_comment in tqdm(zip(user_files, comment_files),
                                        total=n_pairs):
        curr_file_user = os.path.join(COMMENT_USER_DIR, file_user)
        curr_file_comment = os.path.join(COMMENT_DIR, file_comment)

        # Load the csv files into a DataFrame
        tmp_df_comments_user = pd.read_csv(curr_file_user)
        tmp_df_comment = pd.read_csv(curr_file_comment)

        # Merge the DataFrames: keep only comments that have a matching user row
        tmp_df = pd.merge(tmp_df_comment, tmp_df_comments_user,
                          how='inner', on='id')
        tmp_df_ls.append(tmp_df)

    return pd.concat(tmp_df_ls)

In [0]:
# Build the merged comment/user frame, then tag every column name with a
# 'sub-' prefix.
df_merged = get_comment_df()
df_merged.columns = ['sub-' + column for column in df_merged.columns]

32it [00:12,  2.56it/s]


In [0]:
# Inspect the (rows, columns) dimensions of the merged frame.
df_merged.shape

(1169475, 9)

In [0]:
# Persist the merged frame (without the index) under OUTPUT_DIR.
# NOTE(review): aggregate_data() reads 'discussion_comments_16personalities.csv',
# not this file - confirm how that file is derived from this one.
df_merged.to_csv(OUTPUT_DIR + r'comment_discussion.csv', index=False)

## 2.) Tweak memory usage

In [0]:
# df_merged['sub-id'] = df_merged['sub-id'].astype('int64')
# df_merged['sub-approved'] = df_merged['sub-approved'].astype('int8')
# df_merged['sub-createdAtDiff'] = df_merged['sub-createdAtDiff'].astype('int32')
# df_merged['sub-reportCount'] = df_merged['sub-reportCount'].astype('int32')
# df_merged['sub-reviewed'] = df_merged['sub-reviewed'].astype('int8')
# df_merged['sub-subCommentCount'] = df_merged['sub-subCommentCount'].astype('int32')
# df_merged['sub-totalVotingScore'] = df_merged['sub-totalVotingScore'].astype('int32')
# df_merged['sub-unavailable'] = df_merged['sub-unavailable'].astype('int8')
# df_merged['sub-upvotedByUser'] = df_merged['sub-upvotedByUser'].astype('bool')

# df_merged['sub-updatedByUser'] = df_merged['sub-updatedByUser'].astype('bool')
# df_merged['sub-unread'] = df_merged['sub-unread'].astype('bool')
# df_merged['sub-reportIgnored'] = df_merged['sub-reportIgnored'].astype('bool')
# df_merged['sub-reportedByUser'] = df_merged['sub-reportedByUser'].astype('bool')
# df_merged['sub-own'] = df_merged['sub-own'].astype('bool')
# df_merged['sub-hasUnreadSubComments'] = df_merged['sub-hasUnreadSubComments'].astype('bool')
# df_merged['sub-hasDisapprovalReason'] = df_merged['sub-hasDisapprovalReason'].astype('bool')
# df_merged['sub-disapprovalReason'] = df_merged['sub-disapprovalReason'].fillna('N/A')
# df_merged['sub-states'] = df_merged['sub-states'].apply(lambda x: json.dumps(x))

In [0]:
# new_mem_usage = df_merged.memory_usage().sum()
# new_mem_usage

In [0]:
# memory_saved = (old_mem_usage - new_mem_usage)/1000000

In [0]:
# print('Total Memory Usage saved:', round(memory_saved, 2), 'mb')

In [0]:
# # Save the modified DataFrame
# df_merged.to_csv('../output/modified_comment.csv', index=False)

## 3.) Aggregating text posts based on User ID

In [0]:
def get_date(x):
    """Drop the fractional-seconds tail from a timestamp string.

    Everything from the first ``.`` up to the end of the line is removed,
    e.g. ``'2019-08-01 14:22:05.123'`` becomes ``'2019-08-01 14:22:05'``.
    A string without a ``.`` is returned unchanged.

    Args:
        x: Timestamp text.

    Returns:
        The trimmed string, or ``None`` when ``x`` is not a string (for
        example NaN or None); the offending value is printed in that case.
    """
    try:
        return re.sub(r'\..+', '', x)
    except TypeError:
        # Non-string input: report it and keep going rather than abort the
        # whole column-wise apply.
        print(x)
        return None

def strip_time(dtime):
    """Combine the calendar day of ``created_at`` with the text in ``date``.

    Meant for row-wise use (``DataFrame.apply(strip_time, axis=1)``): the
    `` HH:MM:SS`` part is removed from ``created_at`` and the remaining day
    is joined with ``date`` by a single space.

    Args:
        dtime: A row-like mapping (pandas Series or dict) with the keys
            ``'created_at'`` (string such as ``'2019-08-01 14:22:05'``) and
            ``'date'``.

    Returns:
        A string such as ``'2019-08-01 10:30 AM'``.

    Raises:
        TypeError: If ``created_at`` is not a string.
    """
    day = re.sub(r' \d+:\d+:\d+', '', dtime['created_at'])
    return day + ' ' + str(dtime['date'])

def _most_frequent(values):
    """Return the most frequent value of a Series (smallest on ties), NaN if none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float('nan')


def _aggregate_user_posts(posts):
    """Collapse post-level rows into one row per (user_id, user_type).

    Expects a datetime ``date`` column. The result has the columns user_id,
    user_type, date (median posting hour), child_text (all posts joined with
    '|||') and dow (most frequent ``dt.dayofweek`` value).
    """
    final_cols = ['user_id', 'child_text', 'date', 'dow', 'user_type']
    posts = posts.assign(dow=posts['date'])
    agg_func = {'date': lambda x: x.dt.hour.median(),
                'child_text': '|||'.join,
                'dow': lambda z: _most_frequent(z.dt.dayofweek)}
    grouped = posts[final_cols].groupby(['user_id', 'user_type'])
    return grouped.agg(agg_func).reset_index()


def _save_aggregated(df, filename):
    """Write ``df`` without its index to OUTPUT_DIR/aggregated/<filename>."""
    target_dir = OUTPUT_DIR + 'aggregated/'
    # to_csv does not create missing folders.
    os.makedirs(target_dir, exist_ok=True)
    df.to_csv(target_dir + filename, index=False)


def aggregate_data(source):
    """Aggregate the posts of one data source per user and save them as CSV.

    Args:
        source: One of ``'personalitycafe'``,
            ``'16personalities_discussion_comments'`` or
            ``'16personalities_pub_comments'``.

    Returns:
        None. The aggregated frame is written below ``OUTPUT_DIR/aggregated/``.

    Raises:
        ValueError: If ``source`` is not one of the supported names
            (previously such a call silently did nothing).
    """
    if source == 'personalitycafe':
        query = '''SELECT user_id, user_type, child_text, date, 
                   created_at FROM personalitics'''
        temp_cols = ['user_id', 'user_type', 'child_text', 'date', 'created_at']

        # Load the DataFrames; always release the SQLite handle afterwards
        conn = sqlite3.connect(OUTPUT_DIR + 'project_mbti.db')
        try:
            df_db = pd.read_sql(query, con=conn)
        finally:
            conn.close()
        temp_df = pd.read_csv(OUTPUT_DIR + 'personality_cafe.csv',
                              usecols=temp_cols)

        # Concatenate each DataFrame and Remove Duplicates
        temp_df = pd.concat([temp_df, df_db]).drop_duplicates()

        # Remove NULL values
        temp_df = temp_df.dropna(subset=['user_type', 'child_text'])

        # Normalize the type column ie. Parse only the first 4 characters from (INTP-A)
        temp_df['user_type'] = temp_df['user_type'].apply(parse_type_personality_cafe)
        temp_df = temp_df[temp_df['user_type'].isin(TYPES)].copy()

        # Trim the datetime column. Rows dated "Yesterday ..."/"Today ..." only
        # carry a clock time, so the calendar day is borrowed from created_at.
        # na=False keeps the mask boolean when a date is missing.
        is_relative = temp_df['date'].str.contains(r'Yesterday|Today', na=False)
        # Remove the word itself (and a separating comma, if any). str.strip()
        # with a word argument would strip a *set of characters*, not a prefix.
        clock = (temp_df.loc[is_relative, 'date']
                 .str.replace(r'\b(?:Yesterday|Today)\b[,\s]*', '', regex=True)
                 .str.strip())
        temp_df['created_at'] = temp_df['created_at'].apply(get_date)
        # Done column-wise rather than through a per-row apply.
        day = (temp_df.loc[is_relative, 'created_at']
               .str.replace(r' \d+:\d+:\d+', '', regex=True))
        # NOTE(review): "Yesterday" rows get the same day as "Today" rows; if
        # created_at is the scrape time they are dated one day late - confirm.
        temp_df.loc[is_relative, 'date'] = day + ' ' + clock
        temp_df['date'] = pd.to_datetime(temp_df['date'])

        # Create a DayofWeek column and Concatenate all comments with |||
        temp_df = _aggregate_user_posts(temp_df)

        # Save DF
        _save_aggregated(temp_df, 'data_personalitycafe.csv')

    elif source == '16personalities_discussion_comments':
        # Read CSV into a DataFrame
        temp_df = pd.read_csv(OUTPUT_DIR + 'discussion_comments_16personalities.csv')

        # Remove all NULL Values
        temp_df = temp_df.dropna(subset=['sub-answerBody'])

        # Concatenate all comments with ||| and reduce the existing
        # sub-approvedAtNice / sub-dow columns to one scalar per user
        agg_func = {'sub-answerBody': '|||'.join,
                    'sub-approvedAtNice': lambda x: x.median(),
                    'sub-dow': lambda z: _most_frequent(z)}
        temp_df = temp_df.groupby(['sub-profileUrl', 'sub-type']).agg(agg_func)
        temp_df = temp_df.reset_index()

        # Save DF
        _save_aggregated(temp_df, 'data_discussion_16personalities.csv')

    elif source == '16personalities_pub_comments':
        # Read CSV into a DataFrame
        temp_df = pd.read_csv(OUTPUT_DIR + 'sixteenpersonalities.csv')

        # Remove all NULL values
        temp_df = temp_df.dropna(subset=['child_text', 'user_type'])

        # Normalize the type column ie. Parse only the first 4 characters from (INTP-A)
        temp_df['user_type'] = temp_df['user_type'].apply(parse_type_16personality)
        temp_df = temp_df[temp_df['user_type'].isin(TYPES)].copy()

        # Convert the date column into a datetime datatype
        temp_df['date'] = pd.to_datetime(temp_df['date'])

        # Create a DayofWeek column and Concatenate all comments with |||
        temp_df = _aggregate_user_posts(temp_df)

        # Save DF
        _save_aggregated(temp_df, 'data_pub_16personalities.csv')

    else:
        raise ValueError('Unknown source: {!r}'.format(source))

In [0]:
# Build and save the per-user aggregate for the PersonalityCafe data
# (written to aggregated/data_personalitycafe.csv under OUTPUT_DIR).
aggregate_data('personalitycafe')

In [0]:
# Build and save the per-user aggregate for the 16Personalities discussion
# comments (written to aggregated/data_discussion_16personalities.csv).
aggregate_data('16personalities_discussion_comments')

In [0]:
# Build and save the per-user aggregate for the 16Personalities public
# comments (written to aggregated/data_pub_16personalities.csv).
aggregate_data('16personalities_pub_comments')