# Keyword Filtering and Manual Labeling
This notebook extracts political keywords from the tweet text, keeps only the tweets that contain at least one political keyword, and randomly samples 6,500 of those tweets for manual labeling.

In [None]:
# Mount Google Drive so that files under /content/drive can be read and written from Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Folder and file name of the Part 38 tweet export on the mounted Drive
base_path = "/content/drive/MyDrive/MRP_Offensive_Content_Detection/Twitter_Data/Part_38/"
dataset_name = "Part_38_processed_new.csv"
# Load the whole CSV into memory; dtypes are inferred by pandas
df_part38 = pd.read_csv(base_path + dataset_name)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame
df_part38.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156281 entries, 0 to 156280
Data columns (total 36 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 156281 non-null  int64  
 1   type                       156281 non-null  object 
 2   id                         156281 non-null  int64  
 3   username                   156281 non-null  object 
 4   text                       156281 non-null  object 
 5   url                        156281 non-null  object 
 6   epoch                      156281 non-null  float64
 7   media                      156281 non-null  object 
 8   retweetedTweet             156281 non-null  bool   
 9   retweetedTweetID           0 non-null       float64
 10  retweetedUserID            0 non-null       float64
 11  id_str                     156281 non-null  int64  
 12  lang                       156281 non-null  object 
 13  rawContent                 15

In [None]:
# Lift every pandas display limit (rows, columns, cell width) so that full tweet text
# and every column are visible when a frame is rendered.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Columns carrying a ".1" suffix are removed before de-duplication
# (presumably repeats of the same-named columns - TODO confirm against the raw export).
suffixed_columns = ['in_reply_to_user_id_str.1', 'location.1', 'cash_app_handle.1', 'user.1']
df_part38 = df_part38.drop(columns=suffixed_columns)
df_part38.shape

(156281, 32)

In [None]:
# Number of rows that repeat an earlier row in every column
df_part38.duplicated().sum()

np.int64(0)

In [None]:
# Number of rows whose tweet text repeats the text of an earlier row
df_part38.duplicated(subset=['text']).sum()

np.int64(45721)

In [None]:
# Keep only the first row for each distinct tweet text, so the same text is never
# offered twice for manual labeling. The .copy() detaches the result from df_part38.
is_first_occurrence = ~df_part38.duplicated(subset=['text'])
df_cleaned = df_part38.loc[is_first_occurrence].copy()

In [None]:
from collections import Counter
import re

# Political keywords, names and hashtag stems used to decide whether a tweet is
# election-related. Matching is case-insensitive.
keywords = [
    "2024 Elections", "2024 Presidential Election", "Biden", "Biden2024", "conservative",
    "CPAC", "Donald Trump", "GOP", "Joe Biden and Kamala Harris", "Joe Biden",
    "Joseph Biden", "KAG", "MAGA", "Nikki Haley", "RNC", "Ron DeSantis", "Snowballing",
    "Trump2024", "trumpsupporters", "trumptrain", "US Elections", "thedemocrats",
    "DNC", "Kamala Harris", "Marianne Williamson", "Dean Phillips", "williamson2024",
    "phillips2024", "Democratic party", "Republican party", "Third Party", "Green Party",
    "Independent Party", "No Labels", "RFK Jr", "Robert F. Kennedy Jr.", "Jill Stein",
    "Cornel West", "ultramaga", "voteblue2024", "letsgobrandon", "bidenharris2024",
    "makeamericagreatagain", "Vivek Ramaswamy",
    "Trump", "Donald", "Joe", "Kamala", "MAKE AMERICA GREAT AGAIN"
]

# One compiled, case-insensitive pattern per keyword, keyed by the keyword itself.
# Each keyword is escaped and matched as a literal substring, so "MAGA" also hits
# "#ultramaga2024" and "Joe" also hits "Joel". A `\b...\b` alternative in front of the
# bare literal would add nothing: the bare literal already matches every occurrence.
# NOTE(review): short keywords such as "Joe", "GOP", "DNC", "RNC" and "KAG" can therefore
# match inside unrelated words - confirm this recall-over-precision choice is intended.
keyword_patterns = {kw: re.compile(re.escape(kw), re.IGNORECASE) for kw in keywords}

In [None]:
def find_keywords(text):
    """Return the keywords whose pattern occurs anywhere in ``text``.

    Parameters
    ----------
    text : str
        Tweet text to scan. Any non-string value (for example ``None`` or the
        float NaN pandas uses for a missing cell) is treated as containing no
        keywords.

    Returns
    -------
    list of str
        The matching keywords, in ``keyword_patterns`` iteration order.
        Empty when nothing matches.
    """
    # Guard first: pattern.search() raises TypeError on non-string input.
    if not isinstance(text, str):
        return []
    return [keyword for keyword, pattern in keyword_patterns.items() if pattern.search(text)]

In [None]:
# Tag each de-duplicated tweet with the list of keywords found in its text
df_cleaned['matched_keywords'] = df_cleaned['text'].apply(find_keywords)
df_cleaned.shape

(110560, 33)

In [None]:
# Number of distinct keyword combinations found across the tweets
df_cleaned['matched_keywords'].value_counts().shape

(570,)

In [None]:
# Ten most frequent keyword combinations and their tweet counts
df_cleaned['matched_keywords'].value_counts().head(10)

Unnamed: 0_level_0,count
matched_keywords,Unnamed: 1_level_1
[Biden],25594
[],16325
"[Donald Trump, Trump, Donald]",12408
"[Kamala Harris, Kamala]",10422
"[Biden, Trump]",8836
[MAGA],5592
"[Biden, Joe Biden, Joe]",3598
[Trump],2668
[conservative],2467
"[Biden, Kamala]",2466


In [None]:
# True where the keyword scan produced an empty list, i.e. no keyword matched
has_no_keywords = df_cleaned['matched_keywords'].apply(lambda kws: isinstance(kws, list) and not kws)
empty_keywords_df = df_cleaned.loc[has_no_keywords]
empty_keywords_df.shape

(16325, 33)

## Observation:
- There are 570 unique keyword combinations in this dataset, with 23% of the tweets (25.6K out of 110.6K) containing only "Biden" as a keyword.
- 16,325 tweets (14.77% of the total) do not contain any keyword from the manually defined keyword list. These rows are dropped before further analysis.

In [None]:
# Remove the tweets without any keyword match, addressing them by index label
df_cleaned = df_cleaned.drop(index=empty_keywords_df.index)
df_cleaned.shape

(94235, 33)

In [None]:
# Persist the keyword-filtered tweets; index=False leaves the row labels out of the file.
# NOTE: `matched_keywords` holds Python lists, which CSV stores as their text form, so the
# column comes back as plain strings (not lists) when the file is re-read with pd.read_csv.
df_cleaned.to_csv(base_path + "Part38_processed_final.csv", index=False)

# Randomly Extract 2000 Tweets

In [None]:
# Mount Google Drive again for the sampling section (Colab reports it if already mounted)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Read the keyword-filtered export back from the Part_38 folder.
# The frame gets a fresh 0..n-1 index because the file was written without one.
base_path = "/content/drive/MyDrive/MRP_Offensive_Content_Detection/Twitter_Data/Part_38/"
dataset_name = "Part38_processed_final.csv"
df_cleaned = pd.read_csv(base_path + dataset_name)

In [None]:
# From here on, base_path points at the output folder for the manual-labeling files.
# NOTE(review): this rebinds the name that earlier held the Part_38 folder; re-running an
# earlier cell that relies on that value (e.g. the Part38_processed_final.csv export)
# after this one would write into this folder instead. A separate name would be safer.
base_path = "/content/drive/MyDrive/MRP_Offensive_Content_Detection/Twitter_Data/Tweets_for_Manual_Labeling/"

In [None]:
# Draw the first labeling batch: 2,000 tweets without replacement.
# The fixed random_state makes the draw repeatable for the same input frame.
df_subset = df_cleaned.sample(n=2000, random_state=42)

In [None]:
# Split the 2,000-tweet sample into two non-overlapping files of 1,000 tweets each:
# the first 1,000 sampled rows and the last 1,000 sampled rows.
df_part1 = df_subset.iloc[:1000]
df_part2 = df_subset.iloc[-1000:]
df_part1.to_csv(base_path + "Part38_Random_1000_Tweets_For_Labeling_Part1_new.csv", index=False)
df_part2.to_csv(base_path + "Part38_Random_1000_Tweets_For_Labeling_Part2_new.csv", index=False)

In [None]:
# Remove the 2,000 sampled tweets from the pool, identified by index label (not by text)
df_remaining = df_cleaned.drop(index=df_subset.index)
df_remaining.shape

(92235, 33)

In [None]:
# Save the pool that is still unlabeled after the first 2,000 tweets were taken out
df_remaining.to_csv(base_path + "Part38_Remaining_Unlabeled_Tweets_after_extracting_2K.csv", index=False)

# Extract another 3000 tweets for labeling

In [None]:
# Draw the next 3,000 tweets from the remaining pool, so none repeats the first 2,000.
# A different seed (77) is used for this draw.
df_subset_next_3000 = df_remaining.sample(n=3000, random_state=77)

In [None]:
# Part 3: the first 1,000 rows of the 3,000-tweet sample
df_subset_1k = df_subset_next_3000.iloc[:1000]
part3_path = base_path + "Part38_Random_1000_Tweets_For_Labeling_Part3_new.csv"
df_subset_1k.to_csv(part3_path, index=False)

In [None]:
# Part 4: the last 2,000 rows of the 3,000-tweet sample (no overlap with Part 3)
df_subset_2k = df_subset_next_3000.iloc[-2000:]
part4_path = base_path + "Part38_Random_2000_Tweets_For_Labeling_Part4_new.csv"
df_subset_2k.to_csv(part4_path, index=False)

In [None]:
# Take the 3,000 newly sampled tweets out of the pool, identified by index label
df_remaining_after_5k = df_remaining.drop(index=df_subset_next_3000.index)
df_remaining_after_5k.shape

(89235, 33)

In [None]:
# Save the pool that is still unlabeled after 5,000 tweets in total were taken out
df_remaining_after_5k.to_csv(base_path + "Part38_Remaining_Unlabeled_Tweets_after_extracting_5K.csv", index=False)

# Extract additional 1,500 tweets for manual labeling

In [None]:
# Randomly select 1,500 more rows for manual labeling from the pool left after the first 5,000
df_new_1500 = df_remaining_after_5k.sample(n=1500, random_state=42)

In [None]:
# Part 5: write the additional 1,500 sampled tweets to their own labeling file
df_new_1500.to_csv(base_path + "Part38_Random_1500_Tweets_For_Labeling_Part5_new.csv", index=False)

In [None]:
# Take the additional 1,500 sampled tweets out of the pool, identified by index label
df_remaining_after_6500 = df_remaining_after_5k.drop(index=df_new_1500.index)

In [None]:
# Size of the pool left after all three sampling rounds (2,000 + 3,000 + 1,500 removed)
df_remaining_after_6500.shape

(87735, 33)

In [None]:
# Save the pool that is still unlabeled after 6,500 tweets in total were taken out
df_remaining_after_6500.to_csv(base_path + "Part38_Remaining_Unlabeled_Tweets_after_extracting_6500.csv", index=False)

# Combining all the randomly sampled tweets

In [None]:
# Stack the five labeling batches row-wise; each row keeps its original index label
# because ignore_index is not set.
df_combined = pd.concat([df_part1, df_part2, df_subset_1k, df_subset_2k, df_new_1500])

In [None]:
# Expect 6,500 rows in total: 1,000 + 1,000 + 1,000 + 2,000 + 1,500
df_combined.shape

(6500, 33)