# facebook messages data prep
## Author: Oliver Gladfelter
### Date: June 2020

In [1]:
import pandas as pd
import os
import json
from datetime import datetime

# Access Data & Convert to a DataFrame

All of my Facebook messages were easily downloadable here: https://www.facebook.com/help/930396167085762?ref=shareable

The zip file I downloaded from Facebook was filled with hundreds of sub-folders, one for each Facebook friend I've ever messaged with. Within each sub-folder was a JSON file with 'message' in its name (occasionally more than one), which contained the data of interest.

In [47]:
def scan_folder(parent):
    """
    Collect the paths of every message file stored one level below 'parent'.

    Each sub-folder of 'parent' is listed (not recursed into) and every entry
    whose name contains 'message' is recorded.

    Parameters
    ----------
    parent : str
        Path of the directory that holds the sub-folders to scan.

    Returns
    -------
    list of str
        Paths of the form '<parent>/<folder>/<file>', in the order
        os.listdir yields them.
    """
    filesList = []

    # iterate over everything stored in directory 'parent'
    for folder in os.listdir(parent):
        path_to_folder = "/".join((parent, folder))

        # a stray plain file sitting next to the sub-folders (e.g. .DS_Store)
        # cannot be listed and would abort the whole scan, so skip it
        if not os.path.isdir(path_to_folder):
            continue

        # usually there is a single message file per folder, occasionally a few
        for file in os.listdir(path_to_folder):
            if 'message' in file:
                filesList.append("/".join((path_to_folder, file)))

    return filesList

def newFriendMessage(message):
    """
    Flag messages that Facebook automatically injects when two people
    become friends, so they can be removed from the dataset.

    Parameters
    ----------
    message : str
        Text of a single message. Values that cannot be searched at all
        (for example NaN or None standing in for a missing text) are
        accepted and treated as "not an injected message".

    Returns
    -------
    int
        1 if the text contains "Say hi to your new Facebook friend", else 0.
    """
    try:
        return int("Say hi to your new Facebook friend" in message)
    except TypeError:
        # 'in' is undefined for floats/None; a missing text cannot be the
        # injected greeting
        return 0

In [77]:
files = scan_folder("facebook-olivergladfelter/messages/inbox")

# Gather the message records found under the 'messages' key of every file
data = []

for file in files:
    # explicit encoding so the read does not depend on the platform default
    with open(file, encoding="utf-8") as json_data:
        # extend in place: 'data = data + ...' would copy the whole
        # accumulated list again for every file
        data.extend(json.load(json_data)['messages'])

# convert to DF and filter to the columns of interest
dataDF = pd.DataFrame(data)[['sender_name','timestamp_ms','content','type']]

# remove calls and subscribe/unsubscribe notifications
dataDF = dataDF[~dataDF['type'].isin(['Call', 'Unsubscribe', 'Subscribe'])]
dataDF = dataDF.drop(columns='type')

# find and remove new friend messages
# (a boolean mask avoids assigning a temporary column onto a filtered frame,
# which can trigger pandas' SettingWithCopyWarning; missing content is
# replaced by "" before the check so it can never match)
isNewFriendMessage = dataDF['content'].fillna("").apply(newFriendMessage) == 1
dataDF = dataDF[~isNewFriendMessage]

# remove messages from games
dataDF = dataDF[~dataDF['sender_name'].isin(["8 Ball Pool Game", "Instant Chess"])]

# sort chronologically and renumber the rows from 0
dataDF = dataDF.sort_values('timestamp_ms').reset_index(drop=True)

In [175]:
# report how many messages survived the cleaning steps
print(f"This dataframe contains {len(dataDF)} messages")

This dataframe contains 233622 messages


# Create year, month, day, time columns from timestamp_ms

In [187]:
# convert time stamp to 4 columns - year, month, day, time

def getDateAndTime(value):
    """
    Break a millisecond timestamp into its calendar and clock parts.

    The timestamp is interpreted in the local timezone of the machine
    running the code (datetime.fromtimestamp is called without a tz).

    Parameters
    ----------
    value : int or float
        Timestamp in milliseconds since the Unix epoch.

    Returns
    -------
    list of str
        [year, month, day, time], taken from the ISO 8601 representation
        of the timestamp.
    """
    isoStamp = datetime.fromtimestamp(value / 1000).isoformat()

    # ISO format is '<date>T<time>' with the date written as YYYY-MM-DD
    calendarPart, clockPart = isoStamp.split('T')
    year, month, day = calendarPart.split('-')

    return [year, month, day, clockPart]

def getYear(value):
    """Return the first element of 'value' (the year of a [year, month, day, time] list)."""
    year = value[0]
    return year

def getMonth(value):
    """Return the second element of 'value' (the month of a [year, month, day, time] list)."""
    month = value[1]
    return month

def getDay(value):
    """Return the third element of 'value' (the day of a [year, month, day, time] list)."""
    day = value[2]
    return day

def getTime(value):
    """Return the fourth element of 'value' (the time of a [year, month, day, time] list)."""
    time = value[3]
    return time
    
# split every timestamp once, then fan the pieces out into their own columns
dateAndTimeParts = dataDF['timestamp_ms'].apply(getDateAndTime)

for columnName, pickPart in (('year', getYear), ('month', getMonth), ('day', getDay), ('time', getTime)):
    dataDF[columnName] = dateAndTimeParts.apply(pickPart)

In [192]:
# Save the prepared messages as CSV; the relative path resolves against the
# current working directory. No 'index' argument is passed, so pandas'
# default applies and the row index is written as the first column.
dataDF.to_csv("facebookMessages.csv")