In [70]:
import pandas as pd
import json
import csv
from collections import Counter
import re

# Function to clean and split text into words
def extract_words(text):
    """
    Lowercases the input text and splits it into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscores). Everything else (punctuation, whitespace) only separates
    tokens, so a contraction such as "don't" produces two tokens.

    Parameters:
        text (str): The headline or text to process.

    Returns:
        list: The lowercase tokens, in order of appearance.
    """
    lowered = text.lower()
    # The pattern has no capture groups, so group(0) of every match is
    # exactly what re.findall would have returned.
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# 1) Function to separate sarcastic headlines from Sarcasm_Headlines_Dataset.json and add them into a DataFrame
def process_sarcasm_json(json_file, df):
    """
    Loads a JSON array of headline records and appends the sarcastic ones
    (is_sarcastic == 1) to the given DataFrame.

    Parameters:
        json_file (str): The path to a JSON file holding one array of objects,
            each with a 'headline' and an 'is_sarcastic' entry.
        df (DataFrame): The DataFrame to which sarcastic headlines will be added.

    Returns:
        DataFrame: A new DataFrame with the rows of df followed by the
        sarcastic headlines, re-indexed from 0. The input df is not modified.
    """
    # Decode explicitly as UTF-8 so non-ASCII headlines are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Records without an 'is_sarcastic' key are treated as not sarcastic.
    new_rows = [
        {"headline": entry["headline"]}
        for entry in data
        if entry.get("is_sarcastic") == 1
    ]

    # Naming the column keeps 'headline' present even when nothing matched.
    new_df = pd.DataFrame(new_rows, columns=["headline"])
    df = pd.concat([df, new_df], ignore_index=True)

    return df

# 2) Function to separate sarcastic headlines from OnionOrNot.csv and add them into a DataFrame
def process_onion_csv(csv_file, df):
    """
    Reads a two-column (headline, label) CSV file and appends the headlines
    labelled 1 to the given DataFrame.

    Rows that cannot be interpreted are skipped rather than aborting the
    load: blank lines, rows that do not have exactly two fields, and rows
    whose label is not an integer.

    Parameters:
        csv_file (str): The path to the CSV file containing headlines and labels.
        df (DataFrame): The DataFrame to which sarcastic headlines will be added.

    Returns:
        DataFrame: A new DataFrame with the rows of df followed by the
        selected headlines, re-indexed from 0. The input df is not modified.
    """
    new_rows = []
    # newline="" hands line-ending handling to the csv module, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_file, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate a completely empty file

        for row in reader:
            # csv.reader yields [] for blank lines; unpacking such a row (or
            # one with extra fields) would raise, so filter on length first.
            if len(row) != 2:
                continue
            headline, label = row
            try:
                is_satire = int(label) == 1
            except ValueError:
                # Label is not an integer: ignore the row
                continue
            if is_satire:
                new_rows.append({"headline": headline})

    # Naming the column keeps 'headline' present even when nothing matched.
    new_df = pd.DataFrame(new_rows, columns=["headline"])
    df = pd.concat([df, new_df], ignore_index=True)
    return df


# 3) Function to count words from a DataFrame of headlines and update a word count object
def count_words_from_dataframe(df, word_count):
    """
    Tokenizes every entry of the 'headline' column with extract_words and
    adds the resulting tokens to the given Counter.

    Parameters:
        df (DataFrame): The DataFrame containing the headlines.
        word_count (Counter): The Counter object to update with word frequencies.

    Returns:
        Counter: The same Counter object that was passed in, updated in place.
    """
    # Lazily tokenize one headline at a time and feed each token list to the counter
    tokenized = (extract_words(text) for text in df["headline"])
    for tokens in tokenized:
        word_count.update(tokens)
    return word_count

# 4) Function to write word counts to a text file
def write_word_count_to_file(sorted_word_count, output_file,
                             header="Word Frequency Count from Sarcastic Headlines:"):
    """
    Writes the word frequency counts to a text file, one "word: count" per line.

    Parameters:
        sorted_word_count (list): A sorted list of tuples (word, count).
        output_file (str): The path to the output text file (overwritten if it exists).
        header (str): First line of the file. Defaults to the sarcastic-headline
            title so existing callers keep their output; pass a different title
            when the counts come from another corpus.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"{header}\n")
        # One batched write instead of a write call per word
        file.writelines(f"{word}: {count}\n" for word, count in sorted_word_count)
            
# 5) Function to sort word counts by frequency or by word length
def sort_word_count(word_count, sort_type="frequency", order="desc"):
    """
    Sorts the word count based on the specified sorting type and order.

    Words that compare equal under the chosen key keep the order in which
    they appear in word_count (the sort is stable in both directions).

    Parameters:
        word_count (Counter): The Counter object containing word frequencies.
        sort_type (str): The type of sorting to apply ('frequency' or 'length').
        order (str): The order of sorting ('asc' for ascending, 'desc' for descending).

    Returns:
        list: A list of tuples (word, count), sorted as specified.

    Raises:
        ValueError: If sort_type or order is not one of the supported values.
    """
    sort_keys = {
        "frequency": lambda item: item[1],     # the count
        "length": lambda item: len(item[0]),   # the word's length
    }
    if sort_type not in sort_keys:
        raise ValueError("Invalid sort_type. Choose 'frequency' or 'length'.")
    # Reject typos such as "descending" instead of silently sorting ascending
    if order not in ("asc", "desc"):
        raise ValueError("Invalid order. Choose 'asc' or 'desc'.")

    return sorted(word_count.items(), key=sort_keys[sort_type], reverse=(order == "desc"))


# 9) Function to add a column to the DataFrame that stores the number of words in each headline
def add_word_count_column(df):
    """
    Stores, in a 'word_count' column, how many tokens extract_words finds in
    each headline.

    The DataFrame is modified in place and also returned. When the frame has
    no rows or no 'headline' column, 'word_count' is filled with 0 instead.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.

    Returns:
        DataFrame: The same DataFrame, now carrying a 'word_count' column.
    """
    def _token_total(text):
        return len(extract_words(text))

    if not df.empty and 'headline' in df.columns:
        df['word_count'] = df['headline'].apply(_token_total)
    else:
        # Nothing to tokenize: fall back to a constant column
        df['word_count'] = 0
    return df

# 11) Function to add a column to the DataFrame that stores the number of characters in each headline
def add_char_count_column(df):
    """
    Stores, in a 'char_count' column, the length of each headline after
    leading and trailing whitespace has been stripped.

    The DataFrame is modified in place and also returned. When the frame has
    no rows or no 'headline' column, 'char_count' is filled with 0 instead.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.

    Returns:
        DataFrame: The same DataFrame, now carrying a 'char_count' column.
    """
    def _stripped_length(text):
        return len(text.strip())

    if not df.empty and 'headline' in df.columns:
        df['char_count'] = df['headline'].map(_stripped_length)
    else:
        # Nothing to measure: fall back to a constant column
        df['char_count'] = 0
    return df

# 13) Function to sort the DataFrame based on a specific column and order
def sort_headlines_by_column(df, column_name, ascending=True):
    """
    Returns a copy of the DataFrame ordered by one column, with a fresh
    0-based index.

    Parameters:
        df (DataFrame): The DataFrame to sort.
        column_name (str): The column name to sort by (e.g., 'word_count' or 'char_count').
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.

    Returns:
        DataFrame: The sorted DataFrame.

    Raises:
        ValueError: If column_name is not a column of df.
    """
    if column_name in df.columns:
        ordered = df.sort_values(by=column_name, ascending=ascending)
        # Drop the old row labels so positions match the new order
        return ordered.reset_index(drop=True)

    raise ValueError(f"Column '{column_name}' not found in DataFrame. Please ensure it exists.")

# 14) Function to write the headlines of a DataFrame to a text file, sorted by a given column
def write_sorted_df_to_file(df, output_file, column_name, ascending):
    """
    Sorts the DataFrame by one column and writes its headlines to a text
    file, one per line, below a title describing the sort.

    Parameters:
        df (DataFrame): The DataFrame to write; must contain 'headline' and column_name.
        output_file (str): The path to the output text file (overwritten if it exists).
        column_name (str): The column name to use for sorting (e.g., 'word_count' or 'char_count').
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    df_sorted = df.sort_values(by=column_name, ascending=ascending)
    order_title = "ascending" if ascending else "descending"

    with open(output_file, "w", encoding="utf-8") as file:
        # Title line recording which column and direction produced this order
        file.write(f"Sorted Headlines by {column_name} ({order_title}):\n\n")
        # Iterate the column directly: iterrows() would build a Series per
        # row only to read a single field from it.
        file.writelines(f"{headline}\n" for headline in df_sorted["headline"])

# 15) Function to count frequency of character counts and write to file
def write_char_count_frequency_to_file(df, output_file, ascending=True):
    """
    Writes, for every distinct value in 'char_count', how many headlines
    have that many characters. Lines are ordered by the character count
    itself (not by the frequency), ascending or descending.

    Parameters:
        df (DataFrame): The DataFrame containing the 'char_count' column.
        output_file (str): The path to the output text file.
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    frequencies = df['char_count'].value_counts()
    frequencies = frequencies.sort_index(ascending=ascending)

    direction = 'ascending' if ascending else 'descending'
    lines = [f"Character Count Frequency (sorted {direction}):\n\n"]
    for length, occurrences in frequencies.items():
        lines.append(f"{length} character Headline: {occurrences}\n")

    with open(output_file, "w", encoding="utf-8") as file:
        file.write("".join(lines))

def write_word_count_frequency_to_file(df, output_file, ascending=True):
    """
    Writes, for every distinct headline length in words, how many headlines
    have that length. Lines are ordered by the word count itself (not by the
    frequency), ascending or descending.

    The 'word_count' column is (re)computed via add_word_count_column, which
    adds it to the passed DataFrame in place.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.
        output_file (str): The path to the output text file.
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    df = add_word_count_column(df)

    # value_counts() followed by sort_index() fully determines the output
    # order, so sorting the whole DataFrame beforehand is unnecessary.
    word_count_freq = df['word_count'].value_counts().sort_index(ascending=ascending)

    order_title = 'ascending' if ascending else 'descending'
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"Word Count Frequency (sorted {order_title}):\n\n")
        file.writelines(
            f"{count} word headline: {freq}\n" for count, freq in word_count_freq.items()
        )



In [80]:
# Satire Headlines
import os

satire_source_path = "./Data_Exploration/Satire_HeadLines_Source/"
satire_analysis_path = "./Data_Exploration/Satire_Headlines_Analysis/"

# Number of headlines for each character count
satire_analysis_healine_length_in_chars_path = satire_analysis_path + "Headline_Occurances_By_Number_Of_Characters/"
# Headlines listed in order of their number of characters
satire_analysis_char_count_chars_path = satire_analysis_path + "Headline_By_Number_Of_Characters_In_Headline/"

# Number of headlines for each word count
satire_analysis_healine_length_in_words_path = satire_analysis_path + "Headline_Occurances_By_Number_Of_Words/"
# Headlines listed in order of their number of words
satire_analysis_word_count_words_path = satire_analysis_path + "Headline_By_Number_Of_Words_In_Headline/"

# Word occurrences ordered by word length
satire_analysis_words_by_length_path = satire_analysis_path + "Words_Occurance_By_Word_Length/"
# Word occurrences ordered by how often each word appears
satire_analysis_word_by_count_path = satire_analysis_path + "Words_Occurance_By_Word_Count/"

# open(..., "w") does not create missing parent folders, so make sure every
# output directory exists before any report is written.
for output_dir in (
    satire_analysis_healine_length_in_chars_path,
    satire_analysis_char_count_chars_path,
    satire_analysis_healine_length_in_words_path,
    satire_analysis_word_count_words_path,
    satire_analysis_words_by_length_path,
    satire_analysis_word_by_count_path,
):
    os.makedirs(output_dir, exist_ok=True)

# Initialize an empty DataFrame to store sarcastic headlines
satire_df = pd.DataFrame(columns=["headline"])

# Initialize the word count Counter
word_count = Counter()

# Process sarcastic headlines from the JSON file
satire_df = process_sarcasm_json(satire_source_path + "Fixed_Sarcasm_Headlines_Dataset.json", satire_df)

# Process sarcastic headlines from the CSV file
satire_df = process_onion_csv(satire_source_path + "OnionOrNot.csv", satire_df)

# Add word count and character count columns.
# Both helpers modify the frame in place, so length_df and satire_df are the same object.
length_df = add_word_count_column(satire_df)
length_df = add_char_count_column(length_df)

# Count words across all headlines
word_count = count_words_from_dataframe(satire_df, word_count)

# Headlines per character count, ascending then descending
write_char_count_frequency_to_file(length_df, satire_analysis_healine_length_in_chars_path + "char_count_frequency_asc.txt", ascending=True)
write_char_count_frequency_to_file(length_df, satire_analysis_healine_length_in_chars_path + "char_count_frequency_desc.txt", ascending=False)

# Headlines per word count, ascending then descending
write_word_count_frequency_to_file(length_df, satire_analysis_healine_length_in_words_path + "headline_length_in_words_asc.txt", ascending=True)
write_word_count_frequency_to_file(length_df, satire_analysis_healine_length_in_words_path + "headline_length_in_words_desc.txt", ascending=False)

# Headlines ordered by word count, descending
sorted_by_words_desc = sort_headlines_by_column(length_df, 'word_count', ascending=False)
write_sorted_df_to_file(sorted_by_words_desc, satire_analysis_word_count_words_path + "sorted_by_word_count_desc.txt", "word_count", ascending=False)

# Headlines ordered by word count, ascending
sorted_by_words_asc = sort_headlines_by_column(length_df, 'word_count', ascending=True)
write_sorted_df_to_file(sorted_by_words_asc, satire_analysis_word_count_words_path + "sorted_by_word_count_asc.txt", "word_count", ascending=True)

# Headlines ordered by character count, descending
sorted_by_chars_desc = sort_headlines_by_column(length_df, 'char_count', ascending=False)
write_sorted_df_to_file(sorted_by_chars_desc, satire_analysis_char_count_chars_path + "sorted_by_char_count_desc.txt", "char_count", ascending=False)

# Headlines ordered by character count, ascending
sorted_by_chars_asc = sort_headlines_by_column(length_df, 'char_count', ascending=True)
write_sorted_df_to_file(sorted_by_chars_asc, satire_analysis_char_count_chars_path + "sorted_by_char_count_asc.txt", "char_count", ascending=True)

# Words by frequency, most to least (descending)
sorted_word_count = sort_word_count(word_count, sort_type="frequency", order="desc")
write_word_count_to_file(sorted_word_count, satire_analysis_word_by_count_path + "word_count_most_to_least.txt")

# Words by frequency, least to most (ascending)
sorted_word_count = sort_word_count(word_count, sort_type="frequency", order="asc")
write_word_count_to_file(sorted_word_count, satire_analysis_word_by_count_path + "word_count_least_to_most.txt")

# Words by length, longest to shortest (descending)
sorted_word_count = sort_word_count(word_count, sort_type="length", order="desc")
write_word_count_to_file(sorted_word_count, satire_analysis_words_by_length_path + "word_length_longest_to_shortest.txt")

# Words by length, shortest to longest (ascending)
sorted_word_count = sort_word_count(word_count, sort_type="length", order="asc")
write_word_count_to_file(sorted_word_count, satire_analysis_words_by_length_path + "word_length_shortest_to_longest.txt")


In [None]:
def process_true_csv(file_path, df):
    """
    Loads headlines from a CSV file and returns them as a DataFrame with a
    single 'headline' column.

    Column names are stripped of surrounding whitespace and a 'title' column,
    if present, is renamed to 'headline'. Progress messages are printed for
    each step. If no 'headline' column exists after renaming, the loaded
    frame is returned with all of its columns.

    Parameters:
        file_path (str): The path to the CSV file to read.
        df (DataFrame): Not used. It is accepted so the call looks like the
            other process_* loaders, but the result contains only the rows
            read from file_path.

    Returns:
        DataFrame: An independent DataFrame holding the loaded headlines.
    """
    loaded = pd.read_csv(file_path)

    # Show the raw column names so hidden characters or spaces are visible
    print("Original columns:", loaded.columns)

    # Strip any leading/trailing spaces from column names
    loaded.columns = loaded.columns.str.strip()

    if 'title' in loaded.columns:
        loaded = loaded.rename(columns={'title': 'headline'})
        print("Renamed 'title' to 'headline'")
    else:
        print("No column named 'title' to rename!")

    if 'headline' in loaded.columns:
        # .copy() detaches the result from the full frame, so columns added
        # to it later do not trigger pandas' SettingWithCopyWarning.
        loaded = loaded[['headline']].copy()
        print("Kept only 'headline' column")
    else:
        print("No 'headline' column found after renaming!")

    return loaded

def count_words_from_dataframe(df, word_count):
    """
    Counts the frequency of each word in the 'headline' column of the
    DataFrame and updates the word_count Counter object.

    Headlines are tokenized with extract_words (lowercased, punctuation
    dropped) rather than str.split(), so counts produced here are directly
    comparable with those computed for the satire headlines.

    Parameters:
        df (DataFrame): The DataFrame containing the headlines.
        word_count (Counter): The Counter object to update with word frequencies.

    Returns:
        Counter: The same Counter object that was passed in, updated in place.
    """
    # Iterate the column directly; iterrows() would build a Series per row
    for headline in df["headline"]:
        word_count.update(extract_words(headline))
    return word_count



In [93]:
# Genuine Headlines
import os

genuine_source_path = "./Data_Exploration/Genuine_HeadLines_Source/"
genuine_analysis_path = "./Data_Exploration/Genuine_Headlines_Analysis/"

# Number of headlines for each character count / headlines ordered by character count
genuine_analysis_healine_length_in_chars_path = genuine_analysis_path + "Headline_Occurances_By_Number_Of_Characters/"
genuine_analysis_char_count_chars_path = genuine_analysis_path + "Headline_By_Number_Of_Characters_In_Headline/"

# Number of headlines for each word count / headlines ordered by word count
genuine_analysis_healine_length_in_words_path = genuine_analysis_path + "Headline_Occurances_By_Number_Of_Words/"
genuine_analysis_word_count_words_path = genuine_analysis_path + "Headline_By_Number_Of_Words_In_Headline/"

# Word occurrences ordered by word length / by how often each word appears
genuine_analysis_words_by_length_path = genuine_analysis_path + "Words_Occurance_By_Word_Length/"
genuine_analysis_word_by_count_path = genuine_analysis_path + "Words_Occurance_By_Word_Count/"

# open(..., "w") does not create missing parent folders, so make sure every
# output directory exists before any report is written.
for output_dir in (
    genuine_analysis_healine_length_in_chars_path,
    genuine_analysis_char_count_chars_path,
    genuine_analysis_healine_length_in_words_path,
    genuine_analysis_word_count_words_path,
    genuine_analysis_words_by_length_path,
    genuine_analysis_word_by_count_path,
):
    os.makedirs(output_dir, exist_ok=True)

# Initialize an empty DataFrame for genuine headlines
genuine_df = pd.DataFrame()

# Initialize word count Counter
word_count = Counter()

# Process genuine headlines from the CSV file
genuine_df = process_true_csv(genuine_source_path + "True.csv", genuine_df)

# Add word count and character count columns to the DataFrame
genuine_df = add_word_count_column(genuine_df)
genuine_df = add_char_count_column(genuine_df)

# Count words across all headlines
word_count = count_words_from_dataframe(genuine_df, word_count)

# Headlines per character count, ascending then descending
write_char_count_frequency_to_file(genuine_df, genuine_analysis_healine_length_in_chars_path + "char_count_frequency_asc.txt", ascending=True)
write_char_count_frequency_to_file(genuine_df, genuine_analysis_healine_length_in_chars_path + "char_count_frequency_desc.txt", ascending=False)

# Headlines per word count, ascending then descending
write_word_count_frequency_to_file(genuine_df, genuine_analysis_healine_length_in_words_path + "headline_length_in_words_asc.txt", ascending=True)
write_word_count_frequency_to_file(genuine_df, genuine_analysis_healine_length_in_words_path + "headline_length_in_words_desc.txt", ascending=False)

# Headlines ordered by word count, descending
sorted_by_words_desc = sort_headlines_by_column(genuine_df, 'word_count', ascending=False)
write_sorted_df_to_file(sorted_by_words_desc, genuine_analysis_word_count_words_path + "sorted_by_word_count_desc.txt", "word_count", ascending=False)

# Headlines ordered by word count, ascending
sorted_by_words_asc = sort_headlines_by_column(genuine_df, 'word_count', ascending=True)
write_sorted_df_to_file(sorted_by_words_asc, genuine_analysis_word_count_words_path + "sorted_by_word_count_asc.txt", "word_count", ascending=True)

# Headlines ordered by character count, descending
sorted_by_chars_desc = sort_headlines_by_column(genuine_df, 'char_count', ascending=False)
write_sorted_df_to_file(sorted_by_chars_desc, genuine_analysis_char_count_chars_path + "sorted_by_char_count_desc.txt", "char_count", ascending=False)

# Headlines ordered by character count, ascending
sorted_by_chars_asc = sort_headlines_by_column(genuine_df, 'char_count', ascending=True)
write_sorted_df_to_file(sorted_by_chars_asc, genuine_analysis_char_count_chars_path + "sorted_by_char_count_asc.txt", "char_count", ascending=True)

# NOTE(review): write_word_count_to_file titles every file it writes
# "Word Frequency Count from Sarcastic Headlines:", so the four reports below
# carry that title even though they describe genuine headlines.

# Words by frequency, most to least (descending)
sorted_word_count = sort_word_count(word_count, sort_type="frequency", order="desc")
write_word_count_to_file(sorted_word_count, genuine_analysis_word_by_count_path + "word_count_most_to_least.txt")

# Words by frequency, least to most (ascending)
sorted_word_count = sort_word_count(word_count, sort_type="frequency", order="asc")
write_word_count_to_file(sorted_word_count, genuine_analysis_word_by_count_path + "word_count_least_to_most.txt")

# Words by length, longest to shortest (descending)
sorted_word_count = sort_word_count(word_count, sort_type="length", order="desc")
write_word_count_to_file(sorted_word_count, genuine_analysis_words_by_length_path + "word_length_longest_to_shortest.txt")

# Words by length, shortest to longest (ascending)
sorted_word_count = sort_word_count(word_count, sort_type="length", order="asc")
write_word_count_to_file(sorted_word_count, genuine_analysis_words_by_length_path + "word_length_shortest_to_longest.txt")


Original columns: Index(['title', 'text', 'subject', 'date'], dtype='object')
Renamed 'title' to 'headline'
Kept only 'headline' column


KeyError: 'title'

In [None]:
import pandas as pd
import json
import csv
from collections import Counter
import re

# Function to clean and split text into words
def extract_words(text):
    """
    Lowercases the input text and splits it into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscores). Everything else (punctuation, whitespace) only separates
    tokens, so a contraction such as "don't" produces two tokens.

    Parameters:
        text (str): The headline or text to process.

    Returns:
        list: The lowercase tokens, in order of appearance.
    """
    lowered = text.lower()
    # The pattern has no capture groups, so group(0) of every match is
    # exactly what re.findall would have returned.
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# 1) Function to separate sarcastic headlines from Sarcasm_Headlines_Dataset.json and add them into a DataFrame
def process_sarcasm_json(json_file, df):
    """
    Loads a JSON array of headline records and appends the sarcastic ones
    (is_sarcastic == 1) to the given DataFrame.

    Parameters:
        json_file (str): The path to a JSON file holding one array of objects,
            each with a 'headline' and an 'is_sarcastic' entry.
        df (DataFrame): The DataFrame to which sarcastic headlines will be added.

    Returns:
        DataFrame: A new DataFrame with the rows of df followed by the
        sarcastic headlines, re-indexed from 0. The input df is not modified.
    """
    # Decode explicitly as UTF-8 so non-ASCII headlines are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Records without an 'is_sarcastic' key are treated as not sarcastic.
    new_rows = [
        {"headline": entry["headline"]}
        for entry in data
        if entry.get("is_sarcastic") == 1
    ]

    # Naming the column keeps 'headline' present even when nothing matched.
    new_df = pd.DataFrame(new_rows, columns=["headline"])
    df = pd.concat([df, new_df], ignore_index=True)

    return df

# 2) Function to separate sarcastic headlines from OnionOrNot.csv and add them into a DataFrame
def process_onion_csv(csv_file, df):
    """
    Reads a two-column (headline, label) CSV file and appends the headlines
    labelled 1 to the given DataFrame.

    Rows that cannot be interpreted are skipped rather than aborting the
    load: blank lines, rows that do not have exactly two fields, and rows
    whose label is not an integer.

    Parameters:
        csv_file (str): The path to the CSV file containing headlines and labels.
        df (DataFrame): The DataFrame to which sarcastic headlines will be added.

    Returns:
        DataFrame: A new DataFrame with the rows of df followed by the
        selected headlines, re-indexed from 0. The input df is not modified.
    """
    new_rows = []
    # newline="" hands line-ending handling to the csv module, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_file, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row; tolerate a completely empty file

        for row in reader:
            # csv.reader yields [] for blank lines; unpacking such a row (or
            # one with extra fields) would raise, so filter on length first.
            if len(row) != 2:
                continue
            headline, label = row
            try:
                is_satire = int(label) == 1
            except ValueError:
                # Label is not an integer: ignore the row
                continue
            if is_satire:
                new_rows.append({"headline": headline})

    # Naming the column keeps 'headline' present even when nothing matched.
    new_df = pd.DataFrame(new_rows, columns=["headline"])
    df = pd.concat([df, new_df], ignore_index=True)
    return df


# 3) Function to count words from a DataFrame of headlines and update a word count object
def count_words_from_dataframe(df, word_count):
    """
    Tokenizes every entry of the 'headline' column with extract_words and
    adds the resulting tokens to the given Counter.

    Parameters:
        df (DataFrame): The DataFrame containing the headlines.
        word_count (Counter): The Counter object to update with word frequencies.

    Returns:
        Counter: The same Counter object that was passed in, updated in place.
    """
    # Lazily tokenize one headline at a time and feed each token list to the counter
    tokenized = (extract_words(text) for text in df["headline"])
    for tokens in tokenized:
        word_count.update(tokens)
    return word_count

# 4) Function to write word counts to a text file
def write_word_count_to_file(sorted_word_count, output_file,
                             header="Word Frequency Count from Sarcastic Headlines:"):
    """
    Writes the word frequency counts to a text file, one "word: count" per line.

    Parameters:
        sorted_word_count (list): A sorted list of tuples (word, count).
        output_file (str): The path to the output text file (overwritten if it exists).
        header (str): First line of the file. Defaults to the sarcastic-headline
            title so existing callers keep their output; pass a different title
            when the counts come from another corpus.
    """
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"{header}\n")
        # One batched write instead of a write call per word
        file.writelines(f"{word}: {count}\n" for word, count in sorted_word_count)
            
# 5) Function to sort word counts by frequency or by word length
def sort_word_count(word_count, sort_type="frequency", order="desc"):
    """
    Sorts the word count based on the specified sorting type and order.

    Words that compare equal under the chosen key keep the order in which
    they appear in word_count (the sort is stable in both directions).

    Parameters:
        word_count (Counter): The Counter object containing word frequencies.
        sort_type (str): The type of sorting to apply ('frequency' or 'length').
        order (str): The order of sorting ('asc' for ascending, 'desc' for descending).

    Returns:
        list: A list of tuples (word, count), sorted as specified.

    Raises:
        ValueError: If sort_type or order is not one of the supported values.
    """
    sort_keys = {
        "frequency": lambda item: item[1],     # the count
        "length": lambda item: len(item[0]),   # the word's length
    }
    if sort_type not in sort_keys:
        raise ValueError("Invalid sort_type. Choose 'frequency' or 'length'.")
    # Reject typos such as "descending" instead of silently sorting ascending
    if order not in ("asc", "desc"):
        raise ValueError("Invalid order. Choose 'asc' or 'desc'.")

    return sorted(word_count.items(), key=sort_keys[sort_type], reverse=(order == "desc"))


# 9) Function to add a column to the DataFrame that stores the number of words in each headline
def add_word_count_column(df):
    """
    Stores, in a 'word_count' column, how many tokens extract_words finds in
    each headline.

    The DataFrame is modified in place and also returned. When the frame has
    no rows or no 'headline' column, 'word_count' is filled with 0 instead.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.

    Returns:
        DataFrame: The same DataFrame, now carrying a 'word_count' column.
    """
    def _token_total(text):
        return len(extract_words(text))

    if not df.empty and 'headline' in df.columns:
        df['word_count'] = df['headline'].apply(_token_total)
    else:
        # Nothing to tokenize: fall back to a constant column
        df['word_count'] = 0
    return df

# 11) Function to add a column to the DataFrame that stores the number of characters in each headline
def add_char_count_column(df):
    """
    Stores, in a 'char_count' column, the length of each headline after
    leading and trailing whitespace has been stripped.

    The DataFrame is modified in place and also returned. When the frame has
    no rows or no 'headline' column, 'char_count' is filled with 0 instead.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.

    Returns:
        DataFrame: The same DataFrame, now carrying a 'char_count' column.
    """
    def _stripped_length(text):
        return len(text.strip())

    if not df.empty and 'headline' in df.columns:
        df['char_count'] = df['headline'].map(_stripped_length)
    else:
        # Nothing to measure: fall back to a constant column
        df['char_count'] = 0
    return df

# 13) Function to sort the DataFrame based on a specific column and order
def sort_headlines_by_column(df, column_name, ascending=True):
    """
    Returns a copy of the DataFrame ordered by one column, with a fresh
    0-based index.

    Parameters:
        df (DataFrame): The DataFrame to sort.
        column_name (str): The column name to sort by (e.g., 'word_count' or 'char_count').
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.

    Returns:
        DataFrame: The sorted DataFrame.

    Raises:
        ValueError: If column_name is not a column of df.
    """
    if column_name in df.columns:
        ordered = df.sort_values(by=column_name, ascending=ascending)
        # Drop the old row labels so positions match the new order
        return ordered.reset_index(drop=True)

    raise ValueError(f"Column '{column_name}' not found in DataFrame. Please ensure it exists.")

# 14) Function to write the headlines of a DataFrame to a text file, sorted by a given column
def write_sorted_df_to_file(df, output_file, column_name, ascending):
    """
    Sorts the DataFrame by one column and writes its headlines to a text
    file, one per line, below a title describing the sort.

    Parameters:
        df (DataFrame): The DataFrame to write; must contain 'headline' and column_name.
        output_file (str): The path to the output text file (overwritten if it exists).
        column_name (str): The column name to use for sorting (e.g., 'word_count' or 'char_count').
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    df_sorted = df.sort_values(by=column_name, ascending=ascending)
    order_title = "ascending" if ascending else "descending"

    with open(output_file, "w", encoding="utf-8") as file:
        # Title line recording which column and direction produced this order
        file.write(f"Sorted Headlines by {column_name} ({order_title}):\n\n")
        # Iterate the column directly: iterrows() would build a Series per
        # row only to read a single field from it.
        file.writelines(f"{headline}\n" for headline in df_sorted["headline"])

# 15) Function to count frequency of character counts and write to file
def write_char_count_frequency_to_file(df, output_file, ascending=True):
    """
    Writes, for every distinct value in 'char_count', how many headlines
    have that many characters. Lines are ordered by the character count
    itself (not by the frequency), ascending or descending.

    Parameters:
        df (DataFrame): The DataFrame containing the 'char_count' column.
        output_file (str): The path to the output text file.
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    frequencies = df['char_count'].value_counts()
    frequencies = frequencies.sort_index(ascending=ascending)

    direction = 'ascending' if ascending else 'descending'
    lines = [f"Character Count Frequency (sorted {direction}):\n\n"]
    for length, occurrences in frequencies.items():
        lines.append(f"{length} character Headline: {occurrences}\n")

    with open(output_file, "w", encoding="utf-8") as file:
        file.write("".join(lines))

def write_word_count_frequency_to_file(df, output_file, ascending=True):
    """
    Writes, for every distinct headline length in words, how many headlines
    have that length. Lines are ordered by the word count itself (not by the
    frequency), ascending or descending.

    The 'word_count' column is (re)computed via add_word_count_column, which
    adds it to the passed DataFrame in place.

    Parameters:
        df (DataFrame): The DataFrame containing the 'headline' column.
        output_file (str): The path to the output text file.
        ascending (bool): If True, sorts in ascending order; if False, sorts in descending order.
    """
    df = add_word_count_column(df)

    # value_counts() followed by sort_index() fully determines the output
    # order, so sorting the whole DataFrame beforehand is unnecessary.
    word_count_freq = df['word_count'].value_counts().sort_index(ascending=ascending)

    order_title = 'ascending' if ascending else 'descending'
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"Word Count Frequency (sorted {order_title}):\n\n")
        file.writelines(
            f"{count} word headline: {freq}\n" for count, freq in word_count_freq.items()
        )



In [47]:
import json

def process_sarcasm_json(json_file, df):
    """
    Reads a line-delimited JSON file (one record per line) and appends the
    headlines of sarcastic records (is_sarcastic == 1) to the given DataFrame.

    Blank lines are ignored. Lines that are not valid JSON are reported on
    stdout and skipped. Lines that parse to something other than an object,
    and sarcastic records without a 'headline' key, are skipped silently.

    Parameters:
        json_file (str): The path to the JSON file containing sarcastic headlines.
        df (DataFrame): The DataFrame to which sarcastic headlines will be added.

    Returns:
        DataFrame: A new DataFrame with the rows of df followed by the
        sarcastic headlines, re-indexed from 0. The input df is not modified.
    """
    new_rows = []
    with open(json_file, "r", encoding="utf-8") as file:
        for i, line in enumerate(file, 1):  # 1-based line numbers for error messages
            stripped = line.strip()
            if not stripped:
                # An empty line is not a record; don't report it as a parse error
                continue
            try:
                entry = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Error parsing line {i}: {e}")
                print(f"Line {i} content: {stripped}")
                continue  # Skip the problematic line
            # Only JSON objects can carry the two expected keys
            if not isinstance(entry, dict):
                continue
            if entry.get("is_sarcastic") == 1 and "headline" in entry:
                new_rows.append({"headline": entry["headline"]})

    # Naming the column keeps 'headline' present even when nothing matched.
    new_df = pd.DataFrame(new_rows, columns=["headline"])
    df = pd.concat([df, new_df], ignore_index=True)
    return df

satire_source_path = "./Data_Exploration/Satire_HeadLines_Source/"
# Seed an empty single-column frame, then load the sarcastic headlines into it
satire_df = pd.DataFrame(columns=["headline"])
satire_df = process_sarcasm_json(
    json_file=satire_source_path + "Sarcasm_Headlines_Dataset.json",
    df=satire_df,
)





FileNotFoundError: [Errno 2] No such file or directory: './Data_Exploration/Satire_HeadLines_Source/Sarcasm_Headlines_Dataset.json'

In [40]:
import json

def fix_json_format(input_file, output_file):
    """
    Wraps the non-empty lines of a text file in a JSON array.

    Each non-blank line of input_file is stripped of surrounding whitespace
    and written, comma-separated and one per line, between '[' and ']'.
    The lines themselves are copied verbatim; they are not parsed or
    validated as JSON.

    Parameters:
        input_file (str): Path to the file to read (UTF-8).
        output_file (str): Path to the file to write (UTF-8, overwritten if it exists).
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        # Strip every line once and keep only those with content left
        records = [text for text in (raw.strip() for raw in source) if text]

    array_body = ',\n'.join(records)

    with open(output_file, 'w', encoding='utf-8') as target:
        target.write('[\n' + array_body + '\n]')

# Example usage: convert the line-delimited dataset into a single JSON array file
fix_json_format(
    input_file='./Data_Exploration/Satire_HeadLines_Source/Sarcasm_Headlines_Dataset.json',
    output_file='./Data_Exploration/Satire_HeadLines_Source/Fixed_Sarcasm_Headlines_Dataset.json',
)