In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d asaniczka/reddit-on-israel-palestine-daily-updated
!unzip reddit-on-israel-palestine-daily-updated.zip

Dataset URL: https://www.kaggle.com/datasets/asaniczka/reddit-on-israel-palestine-daily-updated
License(s): ODC Attribution License (ODC-By)
reddit-on-israel-palestine-daily-updated.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  reddit-on-israel-palestine-daily-updated.zip
replace legacy/pse_isr_reddit_comments.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: legacy/pse_isr_reddit_comments.csv  
replace reddit_opinion_PSE_ISR.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [11]:
df = pd.read_csv("reddit_opinion_PSE_ISR.csv", dtype={10: str})

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3462109 entries, 0 to 3462108
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   comment_id                  object 
 1   score                       int64  
 2   self_text                   object 
 3   subreddit                   object 
 4   created_time                object 
 5   post_id                     object 
 6   author_name                 object 
 7   controversiality            int64  
 8   ups                         int64  
 9   downs                       int64  
 10  user_is_verified            object 
 11  user_account_created_time   object 
 12  user_awardee_karma          float64
 13  user_awarder_karma          float64
 14  user_link_karma             float64
 15  user_comment_karma          float64
 16  user_total_karma            float64
 17  post_score                  int64  
 18  post_self_text              object 
 19  post_title           

In [13]:
# filter between Oct and Dec 2023
df['post_created_time'] = pd.to_datetime(df['post_created_time'])

start_date = pd.to_datetime('2023-10-01')
end_date = pd.to_datetime('2023-12-31')

df_dated = df[
    (df['post_created_time'] >= start_date) &
    (df['post_created_time'] <= end_date)
]

print(df_dated['post_created_time'].min())
print(df_dated['post_created_time'].max())

2023-10-01 10:52:13
2023-12-30 23:20:36


In [14]:
# filter out posts from underrepresented subreddits
subreddit_counts = df_dated['subreddit'].value_counts()
valid_subreddits = subreddit_counts[subreddit_counts >= 1000].index
df_dated = df_dated[df_dated['subreddit'].isin(valid_subreddits)]
df_dated.info()

<class 'pandas.core.frame.DataFrame'>
Index: 577116 entries, 2867204 to 3460111
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   comment_id                  577116 non-null  object        
 1   score                       577116 non-null  int64         
 2   self_text                   577113 non-null  object        
 3   subreddit                   577116 non-null  object        
 4   created_time                577116 non-null  object        
 5   post_id                     577116 non-null  object        
 6   author_name                 577116 non-null  object        
 7   controversiality            577116 non-null  int64         
 8   ups                         577116 non-null  int64         
 9   downs                       577116 non-null  int64         
 10  user_is_verified            577116 non-null  object        
 11  user_account_created_time   543461 no

In [15]:
# filter out null values
df_dated = df_dated.dropna(subset=['post_self_text'])
display(df_dated[['post_self_text']].head())

Unnamed: 0,post_self_text
2867204,Are you counting the 8-17 year olds that have ...
2871096,"Hello everyone, I hope you all are doing well...."
2871134,After 54 days in captivity- Mia Schem had been...
2871150,Discussion is going to be centralized here.\n\...
2871173,After 54 days in captivity- Mia Schem had been...


In [16]:
# get a random sample of 1000
df_sample = df_dated.sample(n=1000, random_state=42) # using a random state for reproducibility
display(df_sample.head())

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
3297281,k9om2n4,0,&gt;Smoking gun evidence won’t be released unt...,IsraelPalestine,2023-11-17 20:49:23,17xgmy0,CulturalCranberry960,0,0,0,...,53.0,348.0,401.0,35,"editors note: if you liked my article segment,...",The IDF says they have found an “operational H...,0.75,35,0,2023-11-17 14:45:41
2932712,keq6tl4,4,Banned in Iran already.,IsraelPalestine,2023-12-24 10:45:45,18plfym,Less-Plant-4099,0,4,0,...,1.0,5209.0,5210.0,28,Found this free simulation game from 2006 that...,Peacemaker: Peace Simulation Video Game,0.92,28,0,2023-12-24 02:33:08
3059779,kclhvks,1,The UN is a kangaroo court. Israel should be ...,IsraelPalestine,2023-12-09 04:45:49,18drn6w,jwilens,0,1,0,...,110.0,1682.0,1800.0,76,I've been following this account on Twitter (s...,The casualty numbers in Gaza are completely fa...,0.6,76,0,2023-12-08 17:17:37
3359336,k8zu7nh,4,The replies are there for you to read yourself.,IsraelPalestine,2023-11-12 23:23:15,17tizfn,mikebenb,0,4,0,...,507.0,15810.0,16455.0,59,Is exposing people's true feelings about Jews....,The only think to thank Hamas for,0.71,59,0,2023-11-12 12:10:57
3043763,kcsis97,4,"If you actually cared about ""vile, depraved"" w...",IsraelPalestine,2023-12-10 18:02:50,18f8k0d,AhsokaSolo,0,4,0,...,1.0,64980.0,64981.0,0,Although I strongly disagree that being a pos ...,I have a question for those who think that any...,0.35,0,0,2023-12-10 17:28:43


In [24]:
# cleaning text
# remove html tags, user mentions, subreddit references

import re
from bs4 import BeautifulSoup

def clean_text(text):
    if pd.isna(text):
        return ""

    # 1. remove HTML tags, CSS styles
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. remove user mentions like "u/username"
    text = re.sub(r"u/[A-Za-z0-9_-]+", "", text)

    # 3. remove subreddit mentions"
    text = re.sub(r"r/[A-Za-z0-9_-]+", "", text)

    # 4. remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # 5. remove whitespace and line breaks
    text = re.sub(r"\s+", " ", text).strip()

    # 6. lowercase text
    text = text.lower()

    # 7. remove punctuation, but keep periods, question marks, and exclamation points
    text = re.sub(r"[^\w\s.?!]", "", text)

    return text
df_cleaned = df_sample.copy()
df_cleaned['cleaned_text'] = df_sample['post_self_text'].apply(clean_text)
display(df_cleaned[['post_self_text', 'cleaned_text']].head(10))


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,post_self_text,cleaned_text
3297281,"editors note: if you liked my article segment,...",editors note if you liked my article segment c...
2932712,Found this free simulation game from 2006 that...,found this free simulation game from 2006 that...
3059779,I've been following this account on Twitter (s...,ive been following this account on twitter sor...
3359336,Is exposing people's true feelings about Jews....,is exposing peoples true feelings about jews. ...
3043763,Although I strongly disagree that being a pos ...,although i strongly disagree that being a pos ...
2997516,Discussion is going to be centralized here.\n\...,discussion is going to be centralized here. mo...
3151009,https://youtu.be/RFjYUjKdr_A?feature=shared\n\...,i dont know how this sub views the war. wether...
3090352,What is happening in the palestinian territori...,what is happening in the palestinian territori...
3013375,Background then question:\n\n I didn’t realize...,background then question i didnt realize i was...
3040389,It’s crazy to me that Hamas attacked on Octobe...,its crazy to me that hamas attacked on october...


In [25]:
# get needed columns
df_column = df_cleaned[['cleaned_text']]
df_column.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 3297281 to 3169450
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cleaned_text  1000 non-null   object
dtypes: object(1)
memory usage: 15.6+ KB


In [26]:
# eliminate duplicates
df_unique = df_column.drop_duplicates(subset=['cleaned_text'])
display(df_unique.info())

<class 'pandas.core.frame.DataFrame'>
Index: 774 entries, 3297281 to 3169450
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cleaned_text  774 non-null    object
dtypes: object(1)
memory usage: 12.1+ KB


None

In [28]:
# sentence segmentation and punctuation removal
import nltk
from nltk.tokenize import sent_tokenize
import string

# Download the punkt tokenizer if you haven't already
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')
except LookupError:
    nltk.download('punkt')

def segment_and_clean_sentences(text):
    if pd.isna(text):
        return []
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    for sentence in sentences:
        # Remove all punctuation from the sentence
        sentence_no_punct = sentence.translate(str.maketrans('', '', string.punctuation))
        cleaned_sentences.append(sentence_no_punct)
    return cleaned_sentences

# Create a new list to store the individual sentences
sentences_list = []
for index, row in df_unique.iterrows():
    cleaned_sentences = segment_and_clean_sentences(row['cleaned_text'])
    for sentence in cleaned_sentences:
        sentences_list.append({'sentence': sentence})

# Create a new DataFrame from the list of sentences
df_sentences = pd.DataFrame(sentences_list)
display(df_sentences.head())
df_sentences.info()

Unnamed: 0,sentence
0,editors note if you liked my article segment c...
1,reported by cnn the israel defense forces idf ...
2,a video released by the idf displays a substan...
3,however there was no footage supplied of the c...
4,idf spokesman daniel hagari said army engineer...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9390 entries, 0 to 9389
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  9390 non-null   object
dtypes: object(1)
memory usage: 73.5+ KB


In [32]:
# mock data generation
mft_categories = ["care/harm", "fairness/cheating", "loyalty/betrayal", "authority/subversion", "purity/degradation", "none"]
polarities = ["positive", "negative", "neutral"]

data = []
for i, row in df_sentences.iterrows():
    sentence = row['sentence']

    # Randomly assign category and polarity
    category = random.choice(mft_categories)
    polarity = random.choice(polarities)
    category_polarity = f"{category} {polarity}"

    # 70% explicit targets, 30% implicit
    entailed = "yes" if random.random() > 0.3 else "no"

    if entailed == "yes":
        words = sentence.split()
        if len(words) > 3:
            start = random.randint(0, len(words) - 2)
            end = start + 1
            target = words[start]
        else:
            start, end, target = 0, 0, ""
    else:
        start, end, target = 0, 0, ""

    sentence_id = f"{1000000 + i}:{0}"
    data.append({
        "sentence_id": sentence_id,
        "sentence": sentence,
        "target": target,
        "category": category,
        "polarity": polarity,
        "category_polarity": category_polarity,
        "entailed": entailed,
        "start": start,
        "end": end
    })

df_mock = pd.DataFrame(data)
df_mock.head()

Unnamed: 0,sentence_id,sentence,target,category,polarity,category_polarity,entailed,start,end
0,1000000:0,editors note if you liked my article segment c...,liked,fairness/cheating,neutral,fairness/cheating neutral,yes,4,5
1,1000001:0,reported by cnn the israel defense forces idf ...,tunnel,none,neutral,none neutral,yes,15,16
2,1000002:0,a video released by the idf displays a substan...,ground,loyalty/betrayal,neutral,loyalty/betrayal neutral,yes,12,13
3,1000003:0,however there was no footage supplied of the c...,the,care/harm,neutral,care/harm neutral,yes,10,11
4,1000004:0,idf spokesman daniel hagari said army engineer...,,purity/degradation,neutral,purity/degradation neutral,no,0,0


In [35]:
# shuffle dataframe
df_shuffled = df_mock.sample(frac=1, random_state=42).reset_index(drop=True)

# calculate split point
split_point = int(len(df_shuffled) * 0.8)

# split into training and testing sets
df_train = df_shuffled[:split_point]
df_test = df_shuffled[split_point:]

# save to CSV files
df_train.to_csv('df_mock_train.csv', index=False)
df_test.to_csv('df_mock_test.csv', index=False)

print("Training set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Training set shape: (7512, 9)
Test set shape: (1878, 9)


In [37]:
# Save df_train to a TSV file
df_train.to_csv('df_mock_train.tsv', sep='\t', index=False)

# Save df_test to a TSV file
df_test.to_csv('df_mock_test.tsv', sep='\t', index=False)

df_mock_train.tsv and df_mock_test.tsv have been created.


In [34]:
!git clone https://github.com/sysulic/TAS-BERT

Cloning into 'TAS-BERT'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 75 (delta 9), reused 5 (delta 5), pack-reused 61 (from 1)[K
Receiving objects: 100% (75/75), 786.65 KiB | 2.58 MiB/s, done.
Resolving deltas: 100% (30/30), done.
