In [5]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [15]:
import pandas as pd
import re
from datasets import load_dataset

# Load the dataset from Hugging Face
print("Loading dataset...")
ds = load_dataset("MLBtrio/genz-slang-dataset")

# Convert dataset to pandas DataFrame
df_hf = ds["train"].to_pandas()

# Function to tokenize sentence with punctuation as separate tokens
def tokenize_with_punctuation(sentence):
    return re.findall(r"\w+|[.,!?;+]", sentence)

# Function to generate BIO tags
def generate_bio_tags(sentence, slang_term):
    lower_sentence = sentence.lower()
    lower_slang_term = slang_term.lower()
    words = tokenize_with_punctuation(lower_sentence)
    slang_words = lower_slang_term.split()
    bio_tags = []

    i = 0
    while i < len(words):
        if words[i:i+len(slang_words)] == slang_words:
            bio_tags.append("B")
            bio_tags.extend(["I"] * (len(slang_words) - 1))
            i += len(slang_words)
        else:
            bio_tags.append("O")
            i += 1

    return words, bio_tags

# Process Hugging Face dataset
final_data_hf = []
for _, row in df_hf.iterrows():
    sentence = row["Example"]  # Use example as the sentence
    slang_term = row["Slang"]  # Use slang for tagging
    words, bio_tags = generate_bio_tags(sentence, slang_term)
    original_words = tokenize_with_punctuation(sentence)  # Retain original casing for output
    final_data_hf.append({
        "sentence": " ".join(original_words),
        "word_labels": ",".join(bio_tags)
    })

final_df_hf = pd.DataFrame(final_data_hf)

# Load the local dataset
file_path = "slang_OpenSub.tsv"
df_local = pd.read_csv(file_path, sep="\t")

# Filter rows with ANNOTATOR_CONFIDENCE >= 2
df_local_filtered = df_local[df_local["ANNOTATOR_CONFIDENCE"] >= 2]

# Process local dataset
final_data_local = []
for _, row in df_local_filtered.iterrows():
    sentence = row["SENTENCE"]
    slang_term = row["SLANG_TERM"]
    words, bio_tags = generate_bio_tags(sentence, slang_term)
    original_words = tokenize_with_punctuation(sentence)  # Retain original casing for output
    final_data_local.append({
        "sentence": " ".join(original_words),
        "word_labels": ",".join(bio_tags)
    })

final_df_local = pd.DataFrame(final_data_local)

# Combine the two datasets
final_combined_df = pd.concat([final_df_hf, final_df_local], ignore_index=True)

# Save to a new file
output_file = "bio_tagged_dataset.csv"
final_combined_df.to_csv(output_file, index=False)

# Display the final combined DataFrame
def display_dataframe_to_user(name: str, dataframe: pd.DataFrame):
    print(f"Displaying DataFrame: {name}")
    print(dataframe.head())

display_dataframe_to_user("Final Combined BIO Tagged Dataset", final_combined_df)


Loading dataset...
Displaying DataFrame: Final Combined BIO Tagged Dataset
                                            sentence  \
0                        Got the job today , big W !   
1         I forgot my wallet at home , that s an L .   
2  Your tweet got 5 likes and 100 replies calling...   
3                             That meme is so dank !   
4  That phrase is so cheugy , no one says that an...   

                       word_labels  
0                  O,O,O,O,O,O,B,O  
1          O,O,O,O,O,O,O,O,O,O,B,O  
2  O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O  
3                      O,O,O,O,B,O  
4          O,O,O,O,B,O,O,O,O,O,O,O  


Tag these sentences mannually

In [20]:
# Count rows with all "O" in the word_labels column and get line numbers
all_o_rows = final_combined_df[final_combined_df["word_labels"].apply(lambda x: set(x.split(",")) == {"O"})]
line_numbers = [index + 2 for index in all_o_rows.index.tolist()]
print(f"Number of rows with all 'O': {len(all_o_rows)}")
print(f"Line numbers of rows with all 'O': {line_numbers}")


Number of rows with all 'O': 228
Line numbers of rows with all 'O': [4, 10, 15, 22, 23, 25, 27, 28, 32, 41, 42, 47, 50, 52, 54, 62, 64, 76, 92, 93, 110, 119, 120, 137, 142, 146, 156, 176, 190, 195, 198, 214, 215, 216, 222, 236, 237, 238, 239, 240, 241, 242, 243, 244, 280, 293, 294, 369, 386, 413, 419, 420, 451, 459, 493, 496, 538, 568, 607, 640, 738, 746, 792, 793, 805, 813, 835, 857, 877, 887, 891, 896, 986, 992, 994, 999, 1007, 1108, 1126, 1142, 1166, 1328, 1413, 1431, 1439, 1442, 1461, 1483, 1485, 1495, 1529, 1574, 1586, 1595, 1629, 1630, 1631, 1645, 1652, 1653, 1674, 1681, 1712, 1714, 1724, 1775, 1788, 1833, 1848, 1891, 1893, 1972, 2012, 2051, 2075, 2076, 2109, 2135, 2194, 2245, 2257, 2270, 2274, 2291, 2368, 2369, 2391, 2417, 2442, 2460, 2497, 2506, 2536, 2700, 2708, 2714, 2716, 2726, 2729, 2731, 2748, 2756, 2757, 2758, 2763, 2781, 2803, 2804, 2835, 2861, 2929, 2932, 2934, 2942, 2950, 2953, 2958, 2969, 2992, 3012, 3052, 3092, 3126, 3128, 3130, 3142, 3143, 3163, 3164, 3206, 3215, 32