In [2]:

# Author: Madison Laprise
# Date: 6/18
# Description: Using Molly's cleaned dataset, create a new dataframe with the contains/part-of score for each name in the dataset.
# Struggles: When I do this, the contains_value column shows much less variance; most values are 0, and I don't yet know why.
# Future plans:


import pandas as pd
# Location of the cleaned dataset on the author's machine (absolute Windows path)
cleaned_csv_path = r"C:\Users\MadiL\CodeBase\VoxCroft\cleandata_processed.csv"
# The CSV's "Unnamed: 0" column is used as the row index
df = pd.read_csv(cleaned_csv_path, index_col="Unnamed: 0")

In [3]:
# Work on a copy so the loaded frame `df` is never modified
df_new = df.copy()

# Fall back to a synthetic 'Article_ID' when the data does not provide one:
# every block of 10 consecutive rows is treated as one article.
# The bucket is derived from the row *position*, not the index label, because
# the index is read from the CSV ("Unnamed: 0") and its labels need not be
# contiguous (or even numeric), so `index // 10` would not yield 10-row groups.
# NOTE(review): this grouping is arbitrary; scores computed from it only
# compare names that happen to sit in the same 10-row block — confirm whether
# a real article identifier is available.
if 'Article_ID' not in df_new.columns:
    df_new['Article_ID'] = [position // 10 for position in range(len(df_new))]

# Function to calculate the contains/part-of score
def calculate_contains_part_of_score(name, article_id, df):
    """
    Calculate a contains/part-of score for a given name within an article.

    The score is the negated number of *distinct* other names in the same
    article that contain `name` as a substring (case-sensitive, exact
    characters). A name that is not part of any other name scores 0.

    Parameters:
    name (str): The name to check.
    article_id (int): The ID of the article to check within.
    df (DataFrame): The dataframe containing the data; must have the
        'Article_ID' and 'Name' columns.

    Returns:
    int: 0 or a negative number; -k means `name` is contained in k distinct
        other names of the article. Non-string and blank names return 0.
    """
    # Non-string values (e.g. NaN) cannot be substring-matched.
    # Blank names are skipped too: the empty string is a substring of every
    # string, and a lone space is contained in every multi-word name, so they
    # would otherwise receive a large spurious penalty.
    if not isinstance(name, str) or not name.strip():
        return 0

    # Distinct, non-null names that appear in the same article
    in_same_article = df['Article_ID'] == article_id
    article_names = df.loc[in_same_article, 'Name'].dropna().astype(str).unique()

    # Count the other names that contain `name`; identical names are ignored
    containing_count = sum(
        1
        for other_name in article_names
        if other_name != name and name in other_name
    )

    # More containing names -> more negative score
    return -containing_count

# Score each row's name against the other names that share its article
def _score_for_row(row):
    """Return the contains/part-of score for a single row of df_new."""
    return calculate_contains_part_of_score(row['Name'], row['Article_ID'], df_new)

df_new['Contains_Part_of_Score'] = df_new.apply(_score_for_row, axis=1)

# Preview the first rows, now including the score column
print(df_new.head())

# Write the scored frame to disk; the row index is not written
df_new.to_csv('contains_score.csv', index=False)


   Article_Date_Published                                       Article_Body  \
55           6/11/24 4:00  RT @TheStar_news: Dr Allan Boesak warns: DA's ...   
56           6/11/24 4:00  RT @TheStar_news: Dr Allan Boesak warns: DA's ...   
57           6/11/24 3:58  168798: Stationary Vehicle on N3 Eastbound aft...   
81           6/11/24 2:23  168798: Stationary Vehicle on N3 Eastbound aft...   
95           6/11/24 2:00  RT @TheStar_news: Dr Allan Boesak warns: DA's ...   

            Name Identity_Type Article_Source          Voice  \
55  ALLAN BOESAK        People    twitter.com       The Star   
56     RAMAPHOSA        People    twitter.com       The Star   
57          RAMP        People    twitter.com  i-traffic KZN   
81          RAMP        People    twitter.com  i-traffic KZN   
95  ALLAN BOESAK        People    twitter.com       The Star   

                              Article_Themes_AI_Model  \
55  [Primary: Politics|97% |Secondary: Government ...   
56  [Primary: Politi