# EDA: Exploratory Data Analysis

## Class Imbalance Investigation

In [1]:
import pandas as pd
TRAIN_DATA = pd.read_csv("../cleaned_data/train.csv")

# Class balance of the binary target in the cleaned training split.
# Each value is the fraction (0..1) of rows carrying that label; the print
# below scales it to a percentage.
p0 = TRAIN_DATA['2_way_label'].eq(0).mean()  # share of rows with label = 0 [Fake News]
p1 = TRAIN_DATA['2_way_label'].eq(1).mean()  # share of rows with label = 1 [Non-Fake News]

print(f"{p0  * 100}% of our dataset has label = 0 and {p1  * 100}% of our dataset has label = 1")

44.664506061697274% of our dataset has label = 0 and 55.335493938302726% of our dataset has label = 1


## Investigate Properties of Full Dataset

In [2]:
import pandas as pd

# Read the three raw splits. The files are tab-separated, which is the
# default delimiter for pd.read_table.
train_df = pd.read_table("../data/multimodal_train.tsv")
test_df = pd.read_table("../data/multimodal_test_public.tsv")
val_df = pd.read_table("../data/multimodal_validate.tsv")

# Stack the splits row-wise (train, then test, then validation) and
# renumber the index so it runs 0..N-1 over the combined frame.
full_df = pd.concat((train_df, test_df, val_df), ignore_index=True)

# Preview the first rows of the combined frame.
full_df.head()

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2
2,prometheus1123,hackers leak emails from uae ambassador to us,1496511000.0,aljazeera.com,True,6f2cy5,https://external-preview.redd.it/6fNhdbc6K1vFA...,,1.0,44,neutralnews,Hackers leak emails from UAE ambassador to US,0.92,1,0,0
3,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,photoshopbattles,PsBattle: Puppy taking in the view,0.95,1,0,0
4,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,pareidolia,I found a face in my sheet music too!,0.84,0,2,2


In [3]:
# Dimensions of the concatenated frame as (rows, columns).
full_df.shape

(682661, 16)

In [4]:
# Absolute row counts per binary label across the combined dataset.
num_zeros = full_df["2_way_label"].eq(0).sum()
num_ones = full_df["2_way_label"].eq(1).sum()

print(f"Number of Ones: {num_ones}, Number of Zeros: {num_zeros}")

Number of Ones: 268908, Number of Zeros: 413753


In [5]:
# Spot-check that a specific cleaned title is present in the combined data.
# Only one title is queried per run. Titles tried earlier with this same check:
#   "three corgis larping at the beach"
#   "volcanic eruption in bali last night"
#   "nascar race stops to wait for family of ducks to pass"
text = "mighty britain getting tied down in south africa during boer war circa"

# True when at least one row's clean_title equals `text` exactly.
(full_df["clean_title"] == text).any()

True

In [6]:
# Show every row whose clean_title matches `text` exactly.
full_df[full_df["clean_title"] == text]


Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
572480,Shamsher1812,mighty britain getting tied down in south afri...,1558380000.0,i.redd.it,True,bqzxd4,https://preview.redd.it/cumxxgop1fz21.jpg?widt...,,36.0,1720,propagandaposters,'Mighty' Britain getting tied down in South Af...,0.99,0,1,5


In [7]:
# Number of rows per 6_way_label value in the combined dataset, most frequent first.
full_df['6_way_label'].value_counts()

6_way_label
0    268908
4    203139
2    129795
1     40516
5     26057
3     14246
Name: count, dtype: int64

## Check 6-Way Label Distribution Across Splits

In [8]:
# Load the cleaned train / validation / test splits in one pass.
# map() reads the files in the listed order and the three resulting frames
# are unpacked into their respective names.
cleaned_train_df, cleaned_val_df, cleaned_test_df = map(
    pd.read_csv,
    (
        "../cleaned_data/train.csv",
        "../cleaned_data/validation_5k.csv",
        "../cleaned_data/test_5k.csv",
    ),
)

In [9]:
# 6_way_label counts for the full combined dataset (same expression as the
# earlier cell), shown again as the baseline for the cleaned splits.
full_df['6_way_label'].value_counts()

6_way_label
0    268908
4    203139
2    129795
1     40516
5     26057
3     14246
Name: count, dtype: int64

In [10]:
# 6_way_label counts for the cleaned training split.
# NOTE(review): label 4 is the second most common class in full_df but the
# rarest here — presumably a side effect of the cleaning step; confirm this
# shift is intended.
cleaned_train_df['6_way_label'].value_counts()

6_way_label
0    18440
2     8857
1     2806
5     1898
3     1062
4      261
Name: count, dtype: int64

In [11]:
# 6_way_label counts for the cleaned validation split.
cleaned_val_df['6_way_label'].value_counts()

6_way_label
0    2771
2    1352
1     426
5     267
3     150
4      34
Name: count, dtype: int64

In [12]:
# 6_way_label counts for the cleaned test split.
cleaned_test_df['6_way_label'].value_counts()

6_way_label
0    2777
2    1302
1     424
5     288
3     158
4      51
Name: count, dtype: int64