In [None]:
# Required imports
import pandas as pd
import json
import datetime as dt
import re
import emoji


In [None]:
# Read the metadata csv file into a dataframe
csv_file = 'metadata_with_extracted_location.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)

# Engineering Time Features

In [None]:
# Helper that is applied element-wise to the dataframe column 'timestamp': it turns one
# timestamp (seconds since the epoch, stored as a float) into the matching date-time object.
def create_date_time(timestamp, tz=None):
    """Convert a POSIX timestamp into a ``datetime`` object.

    Args:
        timestamp: Value of the 'timestamp' column, i.e. seconds since the
            epoch as an int or float.
        tz: Optional ``tzinfo`` passed on to ``datetime.fromtimestamp``. With
            the default ``None`` the result is a naive datetime expressed in
            the local time zone of the machine running the notebook; pass
            ``dt.timezone.utc`` to get results that do not depend on the machine.

    Returns:
        The ``datetime`` corresponding to ``timestamp``, or ``None`` when the
        value cannot be converted (the offending value and its type are
        printed for inspection).
    """
    try:
        return dt.datetime.fromtimestamp(timestamp, tz)
    # Only conversion failures are handled here (wrong type, NaN, out-of-range
    # value); a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError, OSError):
        print(timestamp)
        print(type(timestamp))
        return None

In [None]:
# Rows whose 'timestamp' value is NaN cannot be converted to a date-time, so they are removed
df = df.dropna(axis=0, how='any', subset=['timestamp'])

In [None]:
# Convert every timestamp with create_date_time and keep the resulting
# date-time objects in a new column called 'Date-Time'
df['Date-Time'] = df['timestamp'].map(create_date_time)


In [None]:
# Split each date-time value into its year, month, day, weekday and hour and
# store every component in a separate column named after the component
for _component in ('year', 'month', 'day', 'weekday', 'hour'):
    df[_component] = getattr(df['Date-Time'].dt, _component)

# Engineering the Caption

In [None]:
# We define three functions that extract relevant information from the caption

# Counts the hashtags (a '#' directly followed by word characters) in a caption
def count_hashtags(caption):
    """Return the number of hashtags in ``caption``; a missing caption counts as 0."""
    if not pd.isna(caption):
        hashtags = re.findall(r'#\w+', caption)
        return len(hashtags)
    return 0


# Takes the caption as an input and returns the number of emojis in the caption
def count_emojis(caption):
    """Return the number of emojis in ``caption``.

    Args:
        caption: The caption text, or a missing value (NaN/None).

    Returns:
        0 for a missing caption, otherwise the emoji count reported by
        ``emoji.emoji_count``.
    """
    if pd.isna(caption):
        return 0
    # Count whole emojis rather than single characters: an emoji can consist of
    # several code points (e.g. ZWJ sequences, skin-tone modifiers, flags), so a
    # per-character lookup in emoji.EMOJI_DATA does not yield the number of emojis.
    return emoji.emoji_count(caption)

# Measures a caption: its number of characters, with a missing caption counting as 0
def compute_caption_length(caption):
    """Return ``len(caption)``, or 0 when the caption is missing (NaN/None)."""
    return 0 if pd.isna(caption) else len(caption)

In [None]:
# Inspect the column names currently present in the dataframe
df.columns

In [None]:
# Shape of the sub-frame holding the rows without a caption (first entry = number of such rows)
df[df['caption_text'].isna()].shape

In [None]:
# Preview the first few captions
df['caption_text'].head()

In [None]:
# For every caption, compute num_hashtags, num_emojis and caption_length with the
# caption functions and store each result in a separate column

df['num_hashtags'] = df['caption_text'].map(count_hashtags)
df['num_emojis'] = df['caption_text'].map(count_emojis)
df['caption_length'] = df['caption_text'].map(compute_caption_length)

# Storing the final Dataframe

In [None]:
# Write the dataframe to the csv file 'feature_engineered_metadata.csv' without the row index
df.to_csv(path_or_buf='feature_engineered_metadata.csv', index=False)