In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# List files in your Google Drive
data_path = '/content/drive/MyDrive/Reddit-Data'
print(os.listdir(data_path))

['reddit_comments_gender_female_raw_0.csv', 'reddit_comments_gender_female_raw_2.csv', 'reddit_comments_gender_female_raw_3.csv', 'reddit_comments_gender_female_raw_1.csv', 'reddit_comments_gender_female_raw_4.csv', 'reddit_comments_gender_female_raw_5.csv', '.ipynb_checkpoints', 'gender_female.txt', 'gender', 'reddit_comments_gender_female_merged.csv', 'reddit_comments_gender_female_processed_phrase_annotated.csv']


In [4]:
import pandas as pd
# Define the directory path where your CSV files are located
output_file = "reddit_comments_gender_female_merged.csv"

# List of CSV files to merge
file_list = [
    "reddit_comments_gender_female_raw_0.csv",
    "reddit_comments_gender_female_raw_1.csv",
    "reddit_comments_gender_female_raw_2.csv",
    "reddit_comments_gender_female_raw_3.csv",
    "reddit_comments_gender_female_raw_4.csv",
    "reddit_comments_gender_female_raw_5.csv"
]

# Initialize an empty list to hold DataFrames
df_list = []

# Read and append each file to the list
for file in file_list:
    file_path = os.path.join(data_path, file)
    print(f"Reading file: {file_path}")
    df = pd.read_csv(file_path)
    df_list.append(df)

# Concatenate all DataFrames into one
merged_df = pd.concat(df_list, ignore_index=True)

# Save the merged DataFrame to a new CSV file
output_path = os.path.join(data_path, output_file)
merged_df.to_csv(output_path, index=False)

print(f"All files merged successfully into: {output_path}")
print(f"Shape of the merged file: {merged_df.shape}")


Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_0.csv
Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_1.csv
Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_2.csv
Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_3.csv
Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_4.csv
Reading file: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_raw_5.csv
All files merged successfully into: /content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_merged.csv
Shape of the merged file: (127279, 3)


In [5]:
# Path to dataset
dataset_path = '/content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_merged.csv'

# Load the dataset
main_df = pd.read_csv(dataset_path)
comments_df = main_df["comments"]
print(comments_df.head())

0    Exactly! Thats why right now I have THREE UPVO...
1    Unless your extremely liberal I don’t see any ...
2    I had very similar feelings a few years back. ...
3    - Created categories for stars still in use to...
4    I'd like to try to break your argument out a l...
Name: comments, dtype: object


In [6]:
import re
import requests
import json


def process_reddit(comment):
    comment = comment.encode("ascii", errors="ignore").decode()
    comment = re.sub('[^A-Za-z,. ]+', '', comment)
    return comment


def process_tweet(sent):
    sent = sent.encode("ascii", errors="ignore").decode()  # check this output
    # print(sent)
    sent = re.sub('@[^\s]+', '', sent)
    sent = re.sub('https: / /t.co /[^\s]+', '', sent)
    sent = re.sub('http: / /t.co /[^\s]+', '', sent)
    sent = re.sub('http[^\s]+', '', sent)

    sent = re.sub('&gt', '', sent)

    # split camel case combined words
    sent = re.sub('([A-Z][a-z]+)', r'\1', re.sub('([A-Z]+)', r' \1', sent))

    sent = sent.lower()

    # remove numbers
    sent = re.sub(' \d+', '', sent)
    # remove words with letter+number
    sent = re.sub('\w+\d+|\d+\w+', '', sent)

    # remove spaces
    sent = re.sub('[\s]+', ' ', sent)
    sent = re.sub('[^\w\s.!\-?]', '', sent)

    # remove 2 or more repeated char
    sent = re.sub(r"(.)\1{2,}", r"\1", sent)
    sent = re.sub(" rt ", "", sent)

    sent = re.sub('- ', '', sent)

    sent = sent.strip()
    # print(sent)
    return sent


if __name__ == '__main__':

    demo = 'gender' # 'gender' # 'race' # 'religion2' #  # 'race'  # 'race' #'gender' # 'religion'
    demo_1 = 'female'

    PROCESS_DEMO1 = True

    # Process Reddit comments in all raw files and store in processed file for Target group 1
    if PROCESS_DEMO1:
        print('Processing demo1 reddit files...')
        colNames = ('id', 'comments', 'comments_processed')

        demo1_df_processed = pd.DataFrame(columns=colNames)
        df_list = []
        if demo == 'gender':
            loops = 7
        else:
            loops = None
            print('Specify a correct demographic')


        demo1_df = main_df
        demo1_df = demo1_df.loc[:, ~demo1_df.columns.str.contains('^Unnamed')]

        demo1_df = demo1_df.dropna()

        demo1_df['comments_processed'] = demo1_df['comments'].apply(lambda x: process_tweet(x))
        print('Before length filter {}'.format(demo1_df.shape))
        demo1_df = demo1_df[demo1_df['comments_processed'].str.len() < 150]
        # pd.concat([demo1_df_processed, demo1_df])
        print('After length filter {}'.format(demo1_df.shape))
        # demo1_df_processed.append(demo1_df, ignore_index=True)
        df_list.append(demo1_df)

        demo1_df_processed = pd.concat(df_list, ignore_index=True)
        print("demo1_df_processed.shape-->", demo1_df_processed.shape)
        demo1_df_processed = demo1_df_processed.dropna()
        demo1_df_processed = demo1_df_processed[demo1_df_processed['comments_processed'] != 'nan']
        print('After dropping nan {}'.format(demo1_df_processed.shape))

        # Ensure the directory exists
        output_dir = data_path + '/' + demo
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Directory created: {output_dir}")

        # Save the processed file
        demo1_df_processed.to_csv(output_dir + '/' + 'reddit_comments_' + demo + '_' + demo_1 + '_processed' + '.csv', index=False)


    # If demo is gender or orientation retain sentences with only one target group term
    if demo == 'gender':
        demo_2 = 'male'
        colNames = ('id', 'comments_processed')
        demo2_df = pd.DataFrame(columns=colNames)

        gender_words = ['woman', 'women', 'girl', 'mother', 'daughter', 'wife', 'niece', 'mom', 'bride', 'lady',
                        'madam',
                        'hostess', 'female', 'wife', 'aunt', 'sister', 'man', 'men', 'boy', 'father', 'son', 'husband',
                        'nephew', 'dad', 'groom', 'gentleman', 'sir', 'host', 'male', 'husband', 'uncle', 'brother']
        comments_one_g = []
        for idx, row in demo1_df_processed.iterrows():
            s = row['comments_processed']
            match = {m for m in gender_words if m in s}
            print(match)
            if len(match) == 1:
                comments_one_g.append(s)
        demo2_df['comments_processed'] = comments_one_g
        print('Shape of df with single target group comments {}'.format(demo2_df.shape))
        demo1_df_processed = demo2_df
        # With this:
        demo1_df_processed.to_csv(os.path.join(data_path, demo, 'reddit_comments_' + demo + '_' + demo_1 + '_processed' + '.csv'), index=False)


    # Create Counter target data set
    demo2_df = pd.DataFrame(columns=['initial_demo', 'replaced_demo', 'comments', 'comments_processed'])

    if demo == 'gender':
        pairs = (('woman', 'man'), ('women', 'men'), ('girl', 'boy'), ('mother', 'father'), ('daughter', 'son'), ('wife', 'husband'),
                 ('niece', 'nephew'), ('mom', 'dad'), ('bride', 'groom'), ('lady', 'gentleman'), ('madam', 'sir'), ('hostess', 'host'),
                 ('female', 'male'), ('wife', 'husband'), ('aunt', 'uncle'), ('sister', 'brother'), (' she ', ' he '))

    for idx, row in demo1_df_processed.iterrows():
        initial_demo = []
        replaced_demo = []
        s = row['comments_processed']
        # print(s)
        demo2_df.at[idx, 'comments'] = s

        for p in pairs:
            # s = s.replace(*p)
            if demo == 'gender':
                s = s.replace(*p)


            if p[1] in s and p[0] in row['comments_processed']:
                initial_demo.append(p[0])
                replaced_demo.append(p[1])
        demo2_df.at[idx, 'comments_processed'] = s
        demo2_df.at[idx, 'initial_demo'] = initial_demo
        demo2_df.at[idx, 'replaced_demo'] = replaced_demo

    print('Shape of demo2 data {}'.format(demo2_df.shape))
    demo2_df.to_csv(data_path + '/' + demo + '/' + 'reddit_comments_' + demo + '_' + demo_2 + '_processed' + '.csv', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'woman', 'man'}
{'woman', 'man'}
{'wife', 'sir'}
{'wife'}
{'wife', 'mom'}
{'wife', 'men'}
{'girl'}
{'girl'}
{'girl'}
{'man', 'girl'}
{'girl'}
{'daughter', 'dad', 'girl'}
{'son', 'girl'}
{'girl', 'lady'}
{'girl'}
{'wife'}
{'wife'}
{'son', 'wife', 'men', 'women'}
{'wife'}
{'wife'}
{'wife'}
{'wife'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'mother', 'girl'}
{'girl'}
{'girl'}
{'man', 'girl'}
{'girl'}
{'girl'}
{'wife', 'girl'}
{'girl'}
{'wife'}
{'wife'}
{'wife'}
{'wife'}
{'wife'}
{'niece'}
{'woman', 'man'}
{'woman', 'man', 'girl'}
{'girl'}
{'son', 'girl', 'lady'}
{'girl'}
{'son', 'girl'}
{'son', 'girl'}
{'girl'}
{'men', 'girl'}
{'girl'}
{'wife'}
{'wife'}
{'wife'}
{'wife'}
{'wife'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'girl'}
{'son', 'girl'}
{'girl'}
{'girl'}
{'wife'}
{'wife'}
{'wife'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'woman', 'man'}
{'dad', 'girl'}

In [7]:
"""
This script generates phrases from processed Reddit comments such that each phrase is maximum length of 15 and
contains target group term and attribute term
"""
import random

demo = 'gender' # 'race' # 'religion2' # 'religion1' # 'gender' #  # 'race'
demo_1 = 'female' # 'female' # 'black' # 'muslims' # 'jews' # 'black_pos' # 'female'  # 'jews'
REMOVE_NO_ATTRIBUTE_IN_WINDOW = True # Remove rows where phrases do not have attributes

demo1_df_processed = pd.read_csv(data_path + '/' + demo + '/' + 'reddit_comments_' + demo + '_' + demo_1 + '_processed' + '.csv')

print(demo1_df_processed.shape)

targets = []
attributes = []

# Since targets in 'demo_opposites.txt'(ex: race_opposites.txt) are phrases('africans are'), here the targets are
# listed separately
if demo == 'gender':
    targets = ['women', 'mothers', 'woman', 'girl', 'wife', 'niece', 'mom', 'moms', 'grandmother', 'stepdaughter', 'bride',
               'lady', 'madam', 'granddaughter', 'hostess', 'girlfriend', 'females', 'wives', 'aunt', 'sisters', 'sister', 'girlfriends']
    with open(data_path + '/' + demo + '_' + demo_1 + '.txt') as f:
        attributes = [re.sub('[*"]', '', line.split('\n')[0]) for line in f]
    print(attributes)


data_list = []

for idx, row in demo1_df_processed.iterrows():
    row_dict = {}
    phrase_joined = ''
    sent = row['comments_processed']
    try:
        sent_list = sent.split(" ")
        print(sent_list)
        targets_in_sent = [t.lower() for t in targets if t.lower() in sent_list]
        print(targets_in_sent)
        # if len(targets_in_sent) == 0:
        #     print(sent)
        for target in targets_in_sent:
            # print(target)
            # target = random.choice(targets_in_sent)

            target_index1, target_index2 = None, None
            target_index1 = sent_list.index(target.strip())

            # print(target_index1)
            # print(sent_list.count(target))

            if sent_list.count(target) > 1:
                sent_list_2 = sent_list[target_index1 + 1:]
                # print('Sentence 2 is {}'.format(sent_list_2))
                target_index2 = sent_list_2.index(target.strip())
                target_index2 = target_index1 + 1 + target_index2

            # print(target_index1, target_index2)

            # If the sentence has two mentions of target group term, select the phrase(cropped sentence) that contains
            # attribute term
            for target_index in [target_index1, target_index2]:

                if target_index is not None:
                    left_window, right_window = target_index-7, target_index+7+1

                    if left_window < 0:
                        left_window = 0
                    phrase_list = sent_list[left_window:right_window]
                    phrase_joined = ' '.join(phrase_list)

                    # Extract the phrase if any of thr pre-defined attributes are in it
                    if any(attr.lower() in phrase_joined for attr in attributes):
                        row_dict['id'] = row['id']
                        row_dict['attribute_in_window'] = True
                        row_dict['comment'] = row['comments_processed']
                        row_dict['phrase'] = phrase_joined
                        data_list.append(row_dict)
                        break

        if not row_dict:
            row_dict['id'] = row['id']
            row_dict['attribute_in_window'] = False
            row_dict['comment'] = row['comments_processed']
            row_dict['phrase'] = phrase_joined
            data_list.append(row_dict)

    except Exception as ex:
        pass


data_df = pd.DataFrame(data_list)
print(data_df.shape)
data_df = data_df.drop_duplicates(subset=['phrase'])
print(data_df.shape)

if REMOVE_NO_ATTRIBUTE_IN_WINDOW:
    data_df = data_df[data_df.attribute_in_window]

print(data_df.shape)

data_df.to_csv(data_path + '/' + demo + '/' + 'reddit_comments_' + demo + '_' + demo_1 + '_processed_phrase' + '.csv', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['girl']
['this', 'girl', 'is', 'monnika', 'from', 'doki', 'doki', 'literature', 'club.']
['girl']
['this', 'girl', 'is', 'monnika', 'from', 'doki', 'doki', 'literature', 'club.']
['girl']
['literature', 'girl', 'is', 'the', 'best', 'running', 'gag', 'in', 'that', 'series.']
['girl']
['specifically', 'the', 'girl', 'is', 'sayori', 'from', 'doki', 'doki', 'literature', 'club']
['girl']
['i', 'believe', 'the', 'girl', 'is', 'monika', 'from', 'doki', 'doki', 'literature', 'club.']
['girl']
['the', 't-posing', 'girl', 'is', 'monika', 'from', 'the', 'game', 'doki', 'doki', 'literature', 'club']
['girl']
['i', 'just', 'realized', 'that', 'the', 'girl', 'is', 'monika', 'from', 'doki', 'doki', 'literature', 'club']
['girl']
['i', 'got', 'into', 'my', 'college', 'of', 'choice', 'yesterday', 'so', 'ya', 'girl', 'is', 'off', 'to', 'study', 'literature', 'for', 'the', 'next', 'years', 'and', 'she', 'cannot', 'wait!']
['girl']
['that'

In [8]:
# sheet1 = pd.read_csv('/content/drive/MyDrive/Reddit-Data/reddit_comments_gender_female_processed_phrase_annotated.csv')  # Replace with your first file path
# sheet2 = pd.read_csv('/content/drive/MyDrive/Reddit-Data/gender/reddit_comments_gender_female_processed_phrase.csv')  # Replace with your second file path

# # Merge the sheets based on columns A and B
# merged_sheet = pd.merge(sheet2, sheet1[['attribute_in_window', 'comment', 'phrase', 'bias_sent', 'bias_phrase']], on=['comment', 'phrase'], how='left')

# # Save the merged result to a new Excel file
# merged_sheet.to_csv('merged_sheet.csv', index=False)

# print("Merging completed. The merged sheet is saved as 'merged_sheet.xlsx'.")

In [1]:
import pandas as pd
from transformers import pipeline

# Load models
sarcasm_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
bias_model = pipeline("text-classification", model="unitary/toxic-bert")

# File paths
input_file_path = '/content/drive/MyDrive/Reddit-Data/gender/reddit_comments_gender_female_processed_phrase.csv'
output_file_path = '/content/drive/MyDrive/Reddit-Data/gender/reddit_comments_gender_female_processed_phrase_annotated.csv'

# Load data
data = pd.read_csv(input_file_path)

# Function to detect sarcasm
def detect_sarcasm(text):
    try:
        result = sarcasm_model(text)
        # Return 1 if negative sentiment (as sarcasm), else 0
        return 1 if result[0]['label'] == 'NEGATIVE' else 0
    except:
        return 0

# Function to detect bias
def detect_bias(text):
    try:
        result = bias_model(text)
        label = result[0]['label']
        # Return 1 if toxic or severely toxic, else 0
        return 1 if label in ['TOXIC', 'SEVERE_TOXIC'] else 0
    except:
        return 0

# Annotate data
data['bias_sent'] = data['comment'].apply(lambda x: max(detect_sarcasm(str(x)), detect_bias(str(x))))
data['bias_phrase'] = data['phrase'].apply(lambda x: max(detect_sarcasm(str(x)), detect_bias(str(x))))

# Save annotated file
data.to_csv(output_file_path, index=False)
print(f"Annotated file saved to {output_file_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Annotated file saved to /content/drive/MyDrive/Reddit-Data/gender/reddit_comments_gender_female_processed_phrase_annotated.csv
