<a href="https://colab.research.google.com/github/Pacozabala/CSCI199.X-TestSpace/blob/main/data_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d asaniczka/reddit-on-israel-palestine-daily-updated
!unzip reddit-on-israel-palestine-daily-updated.zip

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [None]:
# Load the raw Reddit data (presumably the file extracted from the dataset zip above).
# dtype={10: str} requests string dtype for the column keyed 10 -- presumably the
# column at position 10, to avoid mixed-type inference; TODO confirm.
df = pd.read_csv("reddit_opinion_PSE_ISR.csv", dtype={10: str})

In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# filter between Oct and Dec 2023 (1 Oct through the end of 31 Dec)
df['post_created_time'] = pd.to_datetime(df['post_created_time'])

start_date = pd.to_datetime('2023-10-01')
end_date = pd.to_datetime('2023-12-31')

# A date-only string parses to midnight, so `<= end_date` would discard every post
# created after 00:00:00 on 31 Dec. Compare against the start of the following day
# instead so the whole last day is kept.
end_exclusive = end_date + pd.Timedelta(days=1)

df_dated = df[
    (df['post_created_time'] >= start_date) &
    (df['post_created_time'] < end_exclusive)
]

# Sanity check: earliest and latest timestamps that survived the filter
print(df_dated['post_created_time'].min())
print(df_dated['post_created_time'].max())

In [None]:
# Keep only rows from subreddits that have at least 1000 rows in the date window
subreddit_counts = df_dated['subreddit'].value_counts()
valid_subreddits = subreddit_counts.loc[subreddit_counts.ge(1000)].index
df_dated = df_dated.loc[df_dated['subreddit'].isin(valid_subreddits)]
df_dated.info()

In [None]:
# Discard rows whose post body is missing
df_dated = df_dated[df_dated['post_self_text'].notna()]
display(df_dated.loc[:, ['post_self_text']].head())

In [None]:
# Draw 1000 rows at random
df_sample = df_dated.sample(
    n=1000,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)
display(df_sample.head(5))

In [None]:
# cleaning text
# remove html tags, user mentions, subreddit references

import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Normalize one Reddit post body for sentence segmentation.

    Steps, in order: strip HTML markup, drop URLs, drop user ("u/name", "/u/name")
    and subreddit ("r/name", "/r/name") mentions, lowercase, remove punctuation other
    than ``.``, ``?`` and ``!``, and collapse whitespace.

    Args:
        text: Raw post text. A missing value (None/NaN) is treated as empty.

    Returns:
        str: The cleaned, lowercased text; "" for missing input. Removed punctuation
        is deleted rather than replaced by a space, so "don't" becomes "dont".
    """
    if pd.isna(text):
        return ""

    # 1. remove HTML tags, CSS styles
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. remove URLs. Done before mention removal so a URL is dropped as one unit.
    #    The scheme / "www." must start at a word boundary; otherwise ordinary words
    #    that merely contain "www" or "http" (e.g. "awwww") get truncated.
    text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)

    # 3. remove user mentions like "u/username" or "/u/username".
    #    The lookbehind requires the mention to start a token, so the pattern cannot
    #    fire in the middle of a word.
    text = re.sub(r"(?<![A-Za-z0-9_])/?u/[A-Za-z0-9_-]+", "", text)

    # 4. remove subreddit mentions like "r/subreddit" or "/r/subreddit".
    #    Without the lookbehind, "your/their" would lose "r/their" and become "you".
    text = re.sub(r"(?<![A-Za-z0-9_])/?r/[A-Za-z0-9_-]+", "", text)

    # 5. lowercase text
    text = text.lower()

    # 6. remove punctuation, but keep periods, question marks, and exclamation points
    text = re.sub(r"[^\w\s.?!]", "", text)

    # 7. collapse whitespace and line breaks. Done last so that gaps opened up by the
    #    removals above do not survive as double spaces.
    text = re.sub(r"\s+", " ", text).strip()

    return text
# Build a copy of the sample with the cleaned post body in a new 'cleaned_text' column
df_cleaned = df_sample.assign(
    cleaned_text=df_sample['post_self_text'].apply(clean_text)
)
display(df_cleaned.loc[:, ['post_self_text', 'cleaned_text']].head(10))

In [None]:
# Keep only the cleaned text, as a one-column DataFrame
df_column = df_cleaned.loc[:, ['cleaned_text']]
df_column.info()

In [None]:
# eliminate duplicates (rows whose cleaned text is identical)
df_unique = df_column.drop_duplicates(subset=['cleaned_text'])
# info() prints its summary itself and returns None; wrapping it in display()
# only added a stray "None" to the output.
df_unique.info()

In [None]:
# sentence segmentation and punctuation removal
import nltk
from nltk.tokenize import sent_tokenize
import string

# Download the 'punkt' resources used by sent_tokenize.
# nltk.download() reports a failed download through its boolean return value rather
# than by raising, so the result is checked as well as guarding against exceptions.
for resource in ('punkt', 'punkt_tab'):
    try:
        downloaded = nltk.download(resource, quiet=True)
    except Exception as e:
        print(f"Error downloading 'punkt' resources: {e}")
    else:
        if not downloaded:
            print(f"Failed to download NLTK resource '{resource}'")


def segment_and_clean_sentences(text):
    """Split `text` into sentences and strip all punctuation from each one.

    Args:
        text: Text to segment. A missing value (None/NaN) yields an empty list.

    Returns:
        list[str]: The sentences in their original order, with every character in
        ``string.punctuation`` removed and runs of whitespace collapsed to single
        spaces. Sentences that are empty once punctuation is removed (e.g. "!!!")
        are left out.
    """
    if pd.isna(text):
        return []

    # Build the deletion table once instead of once per sentence
    punct_table = str.maketrans('', '', string.punctuation)

    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        # split()/join trims the ends and closes gaps left by removed punctuation
        sentence_no_punct = ' '.join(sentence.translate(punct_table).split())
        if sentence_no_punct:
            cleaned_sentences.append(sentence_no_punct)
    return cleaned_sentences

# Flatten every post into one record per sentence
sentences_list = [
    {'sentence': sent}
    for post_text in df_unique['cleaned_text']
    for sent in segment_and_clean_sentences(post_text)
]

# One row per sentence, in post order
df_sentences = pd.DataFrame(sentences_list)
display(df_sentences.head())
df_sentences.info()

In [None]:
# mock data generation
# Every label below comes from the stdlib `random` module. Seed it so the mock
# dataset is identical on every run, matching the fixed random_state=42 used for
# the pandas sampling in this notebook.
random.seed(42)

mft_categories = ["care/harm", "fairness/cheating", "loyalty/betrayal", "authority/subversion", "purity/degradation", "none"]
polarities = ["positive", "negative", "neutral"]

data = []
for i, row in df_sentences.iterrows():
    sentence = row['sentence']

    # Randomly assign category and polarity
    category = random.choice(mft_categories)
    polarity = random.choice(polarities)
    category_polarity = f"{category} {polarity}"

    # 70% explicit targets, 30% implicit
    entailed = "yes" if random.random() > 0.3 else "no"

    # Default: no target span. Used for "no" rows and for sentences of 3 words or fewer.
    start, end, target = 0, 0, ""
    if entailed == "yes":
        words = sentence.split()
        if len(words) > 3:
            # randint is inclusive on both ends, so the last word is never picked
            start = random.randint(0, len(words) - 2)
            end = start + 1
            target = words[start]

    # Synthetic id: 1000000 plus the row label, followed by a constant ":0" suffix
    sentence_id = f"{1000000 + i}:{0}"
    data.append({
        "sentence_id": sentence_id,
        "sentence": sentence,
        "target": target,
        "category": category,
        "polarity": polarity,
        "category_polarity": category_polarity,
        "entailed": entailed,
        "start": start,
        "end": end
    })

df_mock = pd.DataFrame(data)
df_mock.head()

In [None]:
# Shuffle every row with a fixed seed and renumber the index from 0
df_shuffled = df_mock.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows form the training set, the rest the test set
split_point = int(len(df_shuffled) * 0.8)
df_train = df_shuffled.iloc[:split_point]
df_test = df_shuffled.iloc[split_point:]

# Write both splits as CSV, without the index column
df_train.to_csv('df_mock_train.csv', index=False)
df_test.to_csv('df_mock_test.csv', index=False)

print("Training set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

In [None]:
# Write the train and test splits as tab-separated files, without the index column
for split_df, tsv_path in (
    (df_train, 'df_mock_train.tsv'),
    (df_test, 'df_mock_test.tsv'),
):
    split_df.to_csv(tsv_path, sep='\t', index=False)

In [None]:
!git clone https://github.com/sysulic/TAS-BERT

In [None]:
!pip install pytorch-crf
import torchcrf
print("torchcrf imported successfully!")

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip -o uncased_L-12_H-768_A-12.zip -d TAS-BERT/

In [None]:
# command to create BERT-pytorch-model
# Runs TAS-BERT's conversion script on the TensorFlow checkpoint and config unpacked
# under TAS-BERT/uncased_L-12_H-768_A-12/ and writes pytorch_model.bin into the same
# folder. That output path is the file the training command passes as --init_checkpoint.
!python TAS-BERT/convert_tf_checkpoint_to_pytorch.py \
--tf_checkpoint_path TAS-BERT/uncased_L-12_H-768_A-12/bert_model.ckpt \
--bert_config_file TAS-BERT/uncased_L-12_H-768_A-12/bert_config.json \
--pytorch_dump_path TAS-BERT/uncased_L-12_H-768_A-12/pytorch_model.bin

In [None]:
# data pre-prep command
!cd TAS-BERT && cd data && python data_preprocessing_for_TAS.py --dataset semeval2015 && python data_preprocessing_for_TAS.py --dataset semeval2016

In [None]:
# command to train + test model
# Runs TAS_BERT_joint.py from inside the TAS-BERT checkout with only GPU 0 visible.
# - Input is read from data/semeval2016/three_joint/BIO/ (presumably produced by the
#   preprocessing cell -- TODO confirm); results go to
#   results/semeval2016/three_joint/BIO/my_result.
# - Vocab, config and initial weights come from uncased_L-12_H-768_A-12/, including the
#   pytorch_model.bin written by the checkpoint conversion command.
# NOTE(review): none of the paths here reference the df_mock_*.csv/.tsv files written
# earlier in this notebook, so this run does not appear to use the mock data -- confirm
# that this is intended.
!cd TAS-BERT && CUDA_VISIBLE_DEVICES=0 python TAS_BERT_joint.py \
--data_dir data/semeval2016/three_joint/BIO/ \
--output_dir results/semeval2016/three_joint/BIO/my_result \
--vocab_file uncased_L-12_H-768_A-12/vocab.txt \
--bert_config_file uncased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint uncased_L-12_H-768_A-12/pytorch_model.bin \
--tokenize_method word_split \
--use_crf \
--eval_test \
--do_lower_case \
--max_seq_length 128 \
--train_batch_size 24 \
--eval_batch_size 8 \
--learning_rate 2e-5 \
--num_train_epochs 30.0