# Data Merge
#### In this notebook we will merge the datasets to produce a single dataset that will be used for the training of the models. 

## Importing Libraries

In [114]:
import pandas as pd
import numpy as np
import os

## Load Datasets

In [115]:
# Load goemotion datasets
path = 'data/before_processing/'
goemotions_1 = pd.read_csv(path + 'goemotions_1.csv')
goemotions_2 = pd.read_csv(path + 'goemotions_2.csv')
goemotions_3 = pd.read_csv(path + 'goemotions_3.csv')

# Load tweet_emotions dataset
tweet_emotions = pd.read_csv(path + 'tweet_emotions.csv')

## Tweet Emotions Dataset Preprocessing
In order to combine Data Sets, we must pre-process the data and put them in the same format.

In [116]:
# Look up to the dataset
tweet_emotions.head(5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [117]:
# Check for nan values 
tweet_emotions.isnull().sum()
# There are no nan values in the dataset then we can proceed to the next step

tweet_id     0
sentiment    0
content      0
dtype: int64

In [118]:
# Collumns for tweet_emotions:
# tweet_id,sentiment,content
# Collumns for goemotions:
# text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral

# Apply one-hot encoding to the sentiment column of tweet_emotions
tweet_emotions = pd.get_dummies(tweet_emotions, columns=['sentiment'], prefix='', prefix_sep='', dtype=int)

# Rename the content column to match the goemotions dataset
tweet_emotions.rename(columns={'content':'text'}, inplace=True)

# Drop the tweet_id column because it is not necessary
tweet_emotions.drop(columns=['tweet_id'], inplace=True)

In [119]:
# Look up to the processed dataset
tweet_emotions.head(5)

Unnamed: 0,text,anger,boredom,empty,enthusiasm,fun,happiness,hate,love,neutral,relief,sadness,surprise,worry
0,@tiffanylue i know i was listenin to bad habi...,0,0,1,0,0,0,0,0,0,0,0,0,0
1,Layin n bed with a headache ughhhh...waitin o...,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Funeral ceremony...gloomy friday...,0,0,0,0,0,0,0,0,0,0,1,0,0
3,wants to hang out with friends SOON!,0,0,0,1,0,0,0,0,0,0,0,0,0
4,@dannycastillo We want to trade with someone w...,0,0,0,0,0,0,0,0,1,0,0,0,0


## Goemotions Dataset Preprocessing

In [120]:
# Print the columns that exist in the goemotions2 but not in the goemotions1
print(set(goemotions_2.columns) - set(goemotions_1.columns))

set()


In [121]:
# Since {'id', 'rater_id', 'link_id', 'author', 'subreddit', 'created_utc', 'parent_id', 'example_very_unclear'} columns are not in geometrics1, 
# we add these columns as 0 to get rid of null values that will occur after the concatenation.
goemotions_1['id'] = 0
goemotions_1['rater_id'] = 0
goemotions_1['link_id'] = 0
goemotions_1['author'] = 0
goemotions_1['subreddit'] = 0
goemotions_1['created_utc'] = 0
goemotions_1['parent_id'] = 0
goemotions_1['example_very_unclear'] = 0

In [122]:
# Concatenate the goemotions datasets
goemotions = pd.concat([goemotions_1, goemotions_2, goemotions_3], ignore_index=True)

In [123]:
print(goemotions_1.shape)
print(goemotions_2.shape)
print(goemotions_3.shape)
print(goemotions.shape)

(70000, 37)
(70000, 37)
(71225, 37)
(211225, 37)


In [124]:
# Look up to the goemotion datasets
goemotions.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,0,0,0,0,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [125]:
# Check for nan or null values
goemotions.isnull().sum()[goemotions.isnull().sum() > 0]
# There are no nan values in the dataset then we can proceed to the next step

Series([], dtype: int64)

In [126]:
# Get rid of the examples that are very unclear because they won't help the model
goemotions = goemotions[goemotions['example_very_unclear'] == 0]

# Drop the columns that are not necessary
# id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear
goemotions.drop(columns=['id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'], inplace=True)

### Check for the emotions that are in one dataset but not in the other

In [127]:
print(set(goemotions.columns) - set(tweet_emotions.columns))

{'desire', 'realization', 'annoyance', 'disgust', 'fear', 'disappointment', 'optimism', 'embarrassment', 'confusion', 'amusement', 'grief', 'pride', 'joy', 'remorse', 'admiration', 'nervousness', 'caring', 'gratitude', 'excitement', 'curiosity', 'disapproval', 'approval'}


In [128]:
print(set(tweet_emotions.columns) - set(goemotions.columns))

{'boredom', 'empty', 'happiness', 'enthusiasm', 'worry', 'fun', 'hate'}


In [129]:
# Rename the sentiments in tweet_emotions that have different names from goemotion ones but the similar meanings
tweet_emotions.rename(columns={'happiness':'joy', 'enthusiasm':'excitement', 'hate':'disgust', 'worry':'nervousness', 'fun':'amusement'}, inplace=True)

# Add the emotions that are in goemotions but not in tweet_emotions as 0
for emotion in set(goemotions.columns) - set(tweet_emotions.columns):
    tweet_emotions[emotion] = 0
    
# Add the emotions that are in tweet_emotions but not in goemotions as 0
for emotion in set(tweet_emotions.columns) - set(goemotions.columns):
    goemotions[emotion] = 0

## Merge Datasets

In [130]:
# Merge the datasets
data = pd.concat([tweet_emotions, goemotions], ignore_index=True)

In [131]:
# Look up to the merged dataset
data.head(5)

Unnamed: 0,text,anger,boredom,empty,excitement,amusement,joy,disgust,love,neutral,...,disappointment,gratitude,grief,pride,curiosity,optimism,annoyance,approval,remorse,admiration
0,@tiffanylue i know i was listenin to bad habi...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Layin n bed with a headache ughhhh...waitin o...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Funeral ceremony...gloomy friday...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,wants to hang out with friends SOON!,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,@dannycastillo We want to trade with someone w...,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [132]:
# Check for nan values
data.isnull().sum()[data.isnull().sum() > 0]

Series([], dtype: int64)

In [134]:
data.shape

(248943, 31)

## Save the Merged Dataset

In [133]:
# Save the merged dataset
data.to_csv('data/merged_data.csv', index=False)