# IMDB Movie Review Dataset Processing

In [None]:
import pandas as pd
from collections import Counter

# Load the raw IMDB review table from the working directory.
imdb_df = pd.read_csv(filepath_or_buffer="IMDB-Dataset.csv")

# Report how many reviews fall into each sentiment class before any resampling.
sentiment_counts = imdb_df['sentiment'].value_counts()
print("Original IMDB distribution:")
print(sentiment_counts)

# Peek at the first rows to confirm the columns that were read.
preview = imdb_df.head()
print(preview)

Original IMDB distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Partition the corpus by sentiment class.
pos_reviews = imdb_df.loc[imdb_df['sentiment'].eq('positive')]
neg_reviews = imdb_df.loc[imdb_df['sentiment'].eq('negative')]

# Build two skewed variants: one class is down-sampled to 20% (fixed seed) while
# the other class is kept whole. The "_pos"/"_neg" suffix names the class that
# was down-sampled, i.e. the minority class of the resulting frame.
pos_subset = pos_reviews.sample(frac=0.2, random_state=42)
neg_subset = neg_reviews.sample(frac=0.2, random_state=42)
imbalanced_pos_df = pd.concat([pos_subset, neg_reviews])
imbalanced_neg_df = pd.concat([pos_reviews, neg_subset])

# Show the class counts of each variant.
for heading, frame in (
    ("New Dataset w/ Imbalanced positive", imbalanced_pos_df),
    ("New Dataset w/ Imbalanced negative", imbalanced_neg_df),
):
    print(heading)
    print(frame['sentiment'].value_counts())

# Shuffle the rows (fixed seed) and renumber the index from 0, so the two
# classes are interleaved rather than stored as two contiguous runs.
imbalanced_pos_df = (
    imbalanced_pos_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
imbalanced_neg_df = (
    imbalanced_neg_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

New Dataset w/ Imbalanced positive
sentiment
negative    25000
positive     5000
Name: count, dtype: int64
New Dataset w/ Imbalanced negative
sentiment
positive    25000
negative     5000
Name: count, dtype: int64


In [None]:
# Persist the two skewed IMDB variants. The "<class>_dominant" part of a file
# name refers to the MAJORITY class in that file:
#   - imbalanced_pos_df has positives down-sampled -> negatives dominate.
#   - imbalanced_neg_df has negatives down-sampled -> positives dominate.
# The previous pairing wrote each frame under the opposite name.
imbalanced_pos_df.to_csv("imdb_unbalanced_negative_dominant.csv", index=False)
imbalanced_neg_df.to_csv("imdb_unbalanced_positive_dominant.csv", index=False)

print("Saved IMDB imbalanced datasets.")

Saved IMDB imbalanced datasets.


# ISOT Fake News Dataset Processing

In [11]:
# Read the two ISOT source files: fabricated articles and genuine articles.
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Preview the raw columns before they are trimmed down.
print(fake_df.head())

# Keep only the headline and attach the class label (0 = fake, 1 = real).
fake_df = fake_df[['title']].assign(label=0)
real_df = real_df[['title']].assign(label=1)

# Stack both classes into one frame with a fresh 0..n-1 index.
news_df = pd.concat((fake_df, real_df), ignore_index=True)

label_counts = news_df['label'].value_counts()
print("Original Distribution:")
print(label_counts)

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
Original Distribution:
label
0    23481
1    21417
Name: count, dtype: int64


In [16]:
# Partition the combined headlines by label (0 = fake, 1 = real).
fake = news_df.loc[news_df['label'].eq(0)]
real = news_df.loc[news_df['label'].eq(1)]

# Build two skewed variants: one class is down-sampled to 20% (fixed seed) while
# the other is kept whole. The "_real"/"_fake" suffix names the class that was
# down-sampled, i.e. the minority class of the resulting frame.
real_subset = real.sample(frac=0.2, random_state=42)
fake_subset = fake.sample(frac=0.2, random_state=42)
imbalanced_real_df = pd.concat([fake, real_subset])
imbalanced_fake_df = pd.concat([fake_subset, real])

# Show the class counts of each variant.
for heading, frame in (
    ("New Dataset w/ Imbalanced real", imbalanced_real_df),
    ("New Dataset w/ Imbalanced fake", imbalanced_fake_df),
):
    print(heading)
    print(frame['label'].value_counts())

# Shuffle the rows (fixed seed) and renumber the index from 0.
imbalanced_fake_df = (
    imbalanced_fake_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
imbalanced_real_df = (
    imbalanced_real_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

New Dataset w/ Imbalanced real
label
0    23481
1     4283
Name: count, dtype: int64
New Dataset w/ Imbalanced fake
label
1    21417
0     4696
Name: count, dtype: int64


In [17]:
# Persist the two skewed ISOT variants. The "<class>_dominant" part of a file
# name refers to the MAJORITY class in that file:
#   - imbalanced_fake_df has fake headlines down-sampled -> real dominates.
#   - imbalanced_real_df has real headlines down-sampled -> fake dominates.
# The previous pairing wrote each frame under the opposite name.
imbalanced_fake_df.to_csv("isot_unbalanced_real_dominant.csv", index=False)
imbalanced_real_df.to_csv("isot_unbalanced_fake_dominant.csv", index=False)

# Hate Speech Detection Dataset Preprocessing

In [18]:
!unzip all_files.zip

Streaming output truncated to the last 5000 lines.
  inflating: all_files/13848765_1.txt  
  inflating: all_files/31749680_2.txt  
  inflating: all_files/14414144_3.txt  
  inflating: all_files/13865807_1.txt  
  inflating: all_files/30639951_2.txt  
  inflating: all_files/30623760_4.txt  
  inflating: all_files/13866377_1.txt  
  inflating: all_files/13587943_3.txt  
  inflating: all_files/30756236_1.txt  
  inflating: all_files/30466506_2.txt  
  inflating: all_files/14669384_6.txt  
  inflating: all_files/13061423_2.txt  
  inflating: all_files/30535887_1.txt  
  inflating: all_files/14420818_3.txt  
  inflating: all_files/30474323_2.txt  
  inflating: all_files/14014782_1.txt  
  inflating: all_files/13585848_2.txt  
  inflating: all_files/14665319_1.txt  
  inflating: all_files/13455187_1.txt  
  inflating: all_files/14290356_1.txt  
  inflating: all_files/14665728_4.txt  
  inflating: all_files/14110208_5.txt  
  inflating: all_files/31772063_3.txt  
  inflating: al

In [20]:
import pandas as pd
import os

# Annotation table for the hate-speech corpus; supplies the `file_id` and
# `label` columns consumed by the rest of this cell.
metadata = pd.read_csv(filepath_or_buffer="annotations_metadata.csv")

# Helper that fetches the text belonging to one annotation row.
def load_text_by_file_id(file_id):
    """Return the stripped contents of ``all_files/<file_id>.txt``.

    Parameters
    ----------
    file_id
        Identifier interpolated into the file name.

    Returns
    -------
    str or None
        The file's text, decoded as UTF-8 with leading/trailing whitespace
        removed, or ``None`` when no such file exists.
    """
    try:
        handle = open(f"all_files/{file_id}.txt", 'r', encoding='utf-8')
    except FileNotFoundError:
        # A missing file is reported as None rather than raising.
        return None

    with handle:
        contents = handle.read()
    return contents.strip()

# Attach the text for every annotation row, then discard rows whose text file
# could not be found (the loader returns None for those).
metadata = (
    metadata
    .assign(text=metadata['file_id'].apply(load_text_by_file_id))
    .dropna(subset=['text'])
)

# Only these two annotation labels are kept; any other label value is dropped.
label_map = {'noHate': 0, 'hate': 1}
keep_labels = list(label_map)  # ['noHate', 'hate']

# Restrict to the kept labels and encode them as integers (noHate -> 0, hate -> 1).
filtered = metadata.loc[metadata['label'].isin(keep_labels)].copy()
filtered['label'] = filtered['label'].map(label_map)

# Final two-column dataset: raw text plus its binary label.
hate_df = filtered[['text', 'label']]

label_counts = hate_df['label'].value_counts()
print("Filtered Hate Speech distribution:")
print(label_counts)
print(hate_df.head())


Filtered Hate Speech distribution:
label
0    9507
1    1196
Name: count, dtype: int64
                                                text  label
0  As of March 13th , 2014 , the booklet had been...      0
1  In order to help increase the booklets downloa...      0
2  ( Simply copy and paste the following text int...      0
3  Click below for a FREE download of a colorfull...      1
4  Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...      0


In [21]:
# Write the text/label table to CSV without the index column.
hate_df.to_csv(path_or_buf="hate_speech_unbalanced_noHate_dominant.csv", index=False)