OpenSub positive-label (slang) dataset, filtered to rows with annotator confidence 2 or 3

2256 rows

In [23]:
import pandas as pd

def process_dataset(dataframe):
    """
    Build a positive-label dataset from confidently annotated rows.

    Only rows whose 'ANNOTATOR_CONFIDENCE' is 2 or 3 are kept. The result
    has two columns: 'text' (taken from 'SENTENCE') and 'labels' (always 1).
    The index of the kept rows is preserved, not renumbered.
    """
    accepted_levels = [2, 3]
    is_confident = dataframe['ANNOTATOR_CONFIDENCE'].isin(accepted_levels)

    # Pull the sentences of the surviving rows in one step.
    sentences = dataframe.loc[is_confident, 'SENTENCE']

    # The scalar 1 is broadcast to every row of the new frame.
    return pd.DataFrame({'text': sentences, 'labels': 1})

# Build the positive OpenSub split from the annotated TSV export.
df = pd.read_csv('slang_OpenSub_positive.tsv', sep='\t')
# Filtering keeps the original row labels, so renumber the rows from 0.
open_sub_postive_df = process_dataset(df).reset_index(drop=True)
print(open_sub_postive_df.head())

row_count = len(open_sub_postive_df)
print(f"The number of rows in the new dataset is: {row_count}")

# Optional: persist the filtered split, e.g.
# open_sub_postive_df.to_csv('filtered_slang_dataset.csv', index=False)


                                                text  labels
0     My wife's had an accident with some Quaaludes.       1
1  You just can't put me on watch dog anymore, al...       1
2  I'm sorry, but, you know, you can't barge in a...       1
3  We got movement definitely on Tuesday of that ...       1
4  I waited until you broke up with her, but me too.       1
The number of rows in the new dataset is: 2256


OpenSub negative (non-slang) sentences

17512 rows

In [20]:
import pandas as pd

# Function to process the dataset
def create_non_slang_dataset(dataframe):
    """
    Build a negative-label (non-slang) dataset from every row of the input.

    The result has two columns: 'text' (taken from 'SENTENCE') and
    'labels' (always 0). No rows are filtered out.
    """
    columns = {
        'text': dataframe['SENTENCE'],
        'labels': 0,  # scalar is broadcast to every row
    }
    return pd.DataFrame(columns)

# Load the OpenSub negatives and label every sentence as non-slang (0).
df = pd.read_csv('slang_OpenSub_negatives.tsv', sep='\t')
open_sub_negative_df = create_non_slang_dataset(df)
print(open_sub_negative_df.head())

row_count = len(open_sub_negative_df)
print(f"The number of rows in the new dataset is: {row_count}")
# Optional: persist the split, e.g.
# open_sub_negative_df.to_csv('non_slang_dataset.csv', index=False)


                                                text  labels
0                Oh, man, you made friends with' em.       0
1  He could pitch a hundred-mile-an-hour fastball...       0
2                       It's an account, that's all.       0
3          I thought you were the assistant manager.       0
4            On September 12th, 2001, that was over.       0
The number of rows in the new dataset is: 17512


genz dataset with positive labels

1779 rows

In [17]:
import pandas as pd

def process_textfile_to_dataframe(file_path):
    """
    Load a tab-separated text file into a two-column DataFrame.

    Every line is stripped of surrounding whitespace and cut at its first
    tab: the part before the tab becomes 'text', everything after it
    becomes 'labels' (kept as a string). Lines without a tab are skipped.

    Args:
        file_path (str): Path to the input text file (read as UTF-8).

    Returns:
        pd.DataFrame: A DataFrame with 'text' and 'labels' columns.
    """
    rows = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # partition() cuts at the first tab only; an empty separator
            # means the line had no tab at all.
            sentence, tab, tag = raw_line.strip().partition('\t')
            if not tab:
                continue
            rows.append((sentence, tag))

    return pd.DataFrame(rows, columns=['text', 'labels'])

# Load the Gen Z slang sentences into a 'text'/'labels' DataFrame and preview it.
# The loader keeps labels as strings; they are cast to int in the combining cell.
file_path = "genz_slang_sentences_with_tags.txt"  # one "sentence<TAB>label" pair per line
genz_positive_df = process_textfile_to_dataframe(file_path)
print(genz_positive_df.head())


                                                text labels
0                          Got the job today, big W!      1
1           I forgot my wallet at home, that’s an L.      1
2  Your tweet got 5 likes and 100 replies calling...      1
3                              That meme is so dank!      1
4  That phrase is so cheugy, no one says that any...      1


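WSJ dataset with long sentences filtered out: 27055 rows (note: the run below, which keeps only sentences of at most 100 characters, prints 11893 rows)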

In [30]:
import pandas as pd

def process_textfile_to_dataframe(file_path, max_length=100):
    """
    Load a tab-separated text file into a DataFrame, dropping long sentences.

    Every line is stripped of surrounding whitespace and cut at its first
    tab: the part before the tab becomes 'text', everything after it
    becomes 'labels' (kept as a string). Lines without a tab are skipped,
    and so are sentences longer than ``max_length`` characters.

    NOTE: this definition replaces any earlier function of the same name
    in the session. Pass ``max_length=None`` for the unfiltered behavior.

    Args:
        file_path (str): Path to the input text file (read as UTF-8).
        max_length (int | None): Longest sentence, in characters, to keep.
            Defaults to 100, the limit this code has always applied (the
            previous documentation wrongly said 180). ``None`` disables
            the length filter.

    Returns:
        pd.DataFrame: A DataFrame with 'text' and 'labels' columns.
    """
    rows = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # partition() cuts at the first tab only; an empty separator
            # means the line had no tab and carries no label.
            text, tab, label = line.strip().partition('\t')
            if not tab:
                continue
            # Length is measured on the sentence alone, not the label.
            if max_length is not None and len(text) > max_length:
                continue
            rows.append((text, label))

    return pd.DataFrame(rows, columns=['text', 'labels'])



# Load the WSJ sentences as negatives, using the length-filtering
# process_textfile_to_dataframe defined just above, then preview them.
file_path = "wsj_sentences_with_tags.txt"  # one "sentence<TAB>label" pair per line
wsj_negative_df = process_textfile_to_dataframe(file_path)
print(wsj_negative_df.head())
print(wsj_negative_df.shape)  # (rows, columns) after the length filter

                                                text labels
0  Pierre Vinken, 61 years old, will join the boa...      0
1  Mr. Vinken is chairman of Elsevier N.V., the D...      0
2  A Lorillard spokewoman said, "This is an old s...      0
3  We're talking about years ago before anyone he...      0
4  From 1953 to 1955, 9.8 billion Kent cigarettes...      0
(11893, 2)


In [31]:
# Cast every 'labels' column to int so the four frames agree: the text-file
# loaders return labels as strings, while the OpenSub frames are built with
# integer labels.
for df in [open_sub_postive_df, genz_positive_df, open_sub_negative_df, wsj_negative_df]:
    df['labels'] = df['labels'].astype(int)

# Concatenate positive samples into one DataFrame
positive_df = pd.concat([open_sub_postive_df, genz_positive_df], ignore_index=True)

# Draw exactly as many negatives as there are positives, split as evenly as
# possible between the two negative sources (WSJ takes the odd row, if any).
# Deriving the sizes keeps the classes balanced if the upstream filters
# change; with the current 4035 positives this gives 2017 OpenSub + 2018 WSJ.
n_positive = len(positive_df)
n_opensub_negative = n_positive // 2
n_wsj_negative = n_positive - n_opensub_negative

# sample() raises ValueError if a source holds fewer rows than requested,
# so an under-sized negative pool fails loudly instead of unbalancing the data.
opensub_negative_sample = open_sub_negative_df.sample(n=n_opensub_negative, random_state=42)
wsj_negative_sample = wsj_negative_df.sample(n=n_wsj_negative, random_state=42)

# Concatenate negative samples into one DataFrame
negative_df = pd.concat([opensub_negative_sample, wsj_negative_sample], ignore_index=True)

# Combine positive and negative samples into the final DataFrame
final_df = pd.concat([positive_df, negative_df], ignore_index=True)

# Shuffle the rows reproducibly so the classes are interleaved
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final DataFrame to a file
final_df.to_csv('final_dataset.csv', index=False)

print(f"Final dataset saved with {final_df.shape[0]} rows and the following label distribution:")
print(final_df['labels'].value_counts())


Final dataset saved with 8070 rows and the following label distribution:
labels
1    4035
0    4035
Name: count, dtype: int64
