# Proj1

## Load all functions

In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from collections import Counter
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import os

In [11]:
# Text Simple Preprocessing
def simple_preprocess(text):
    """
    Perform simple preprocessing on the given text.
    - Convert text to lowercase.
    - Remove non-alphabetic characters, keeping only letters and spaces.
    - Split text into individual words.
    
    Parameters:
    - text (str): The text to be preprocessed.
    
    Returns:
    - list: A list of preprocessed words from the text.
    """
    text = text.lower()  # Convert text into lowercase
    text = ''.join(char for char in text if char.isalpha() or char.isspace())  # Remove non-alphabetic characters
    words = text.split()  # Split text into words
    return words

def process_data(data_path, age_column, text_column, gender_prefix): # proces gender-specific data for analysis
    """
     Process gender-specific data including text preprocessing, sentiment analysis, and age normalization.
     Parameters:
     - data_path (str): Path to the CSV file containing the data.
     - age_column (str): Name of the column containing age information.
     - text_column (str): Name of the column containing text data to be processed.
     - gender_prefix (str): Prefix to distinguish between male and female data.
     Returns:
     - Tuple of processed results: most_common_filtered_words, sentiment_word_freq_by_valid_age
    """

    # data loading
    data_df = pd.read_csv(data_path)
    data_df[f'{gender_prefix}_processed_text'] = data_df[text_column].apply(simple_preprocess) # Text Preprocessing by converting text to lowercase, removing non-alphabetic characters, and tokenizing

    # Calculate word frequency and identify most common words
    all_words = [word for text in data_df[f'{gender_prefix}_processed_text'] for word in text]
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [word for word in all_words if word not in english_stopwords]
    filtered_word_freq = Counter(filtered_words)
    most_common_filtered_words = filtered_word_freq.most_common(20)

    # Instantiate VADER SentimentIntensityAnalyzer: focus on sentiment words to understand emotional tone of text.
    sia = SentimentIntensityAnalyzer()
    vader_lexicon = sia.lexicon
    sentiment_words_from_filtered = [word for word in filtered_words if word in vader_lexicon]
    unique_sentiment_words = list(set(sentiment_words_from_filtered))

    # Applying age normalization
    data_df[f'normalized_age_{gender_prefix}'] = data_df[age_column].apply(normalize_age)
    valid_age_data_df = data_df.dropna(subset=[f'normalized_age_{gender_prefix}'])
    grouped_texts = valid_age_data_df.groupby(f'normalized_age_{gender_prefix}')[f'{gender_prefix}_processed_text']
    age_grouped_text_valid = grouped_texts.apply(lambda texts: ' '.join(' '.join(text) for text in texts))

    # Word frequency analysis with valid age groups
    word_freq_by_valid_age = {}

    for age, text in age_grouped_text_valid.items():
        words = text.split()
        filtered_words = [word for word in words if word not in english_stopwords]
        word_freq = Counter(filtered_words)
        word_freq_by_valid_age[age] = word_freq

    # Analyzing sentiment word frequency by age
    sentiment_word_freq_by_valid_age = pd.DataFrame({
        word: [word_freq_by_valid_age[age][word] for age in word_freq_by_valid_age] for word in unique_sentiment_words
    }, index=word_freq_by_valid_age.keys())

    return data_df, most_common_filtered_words, sentiment_word_freq_by_valid_age

# Ensure stopwords and VADER's lexicon are available
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Age Normalization
def normalize_age(age):
    """
    Function to normalize age values.
    Returns None for non-numeric or invalid age values.
    """
    try:
        return int(float(age))
    except (ValueError, TypeError):
        return None

# Combine age normalization and basic stopwords
def process_data_for_age_with_basic_stopwords_corrected(df, text_column, age_column):
    """
    Process data to calculate word frequencies by age, excluding basic stopwords, and identify top 10 words by percentage.
    Correctly handles non-numeric age values by excluding them from the analysis.
    """
    # Apply simple preprocess and normalize age with exclusion of non-numeric values
    df['processed_text'] = df[text_column].apply(simple_preprocess)
    df['normalized_age'] = df[age_column].apply(lambda x: np.nan if not str(x).isdigit() else int(float(x)))

    # Drop rows with NaN ages
    valid_data_df = df.dropna(subset=['normalized_age'])

    # Aggregate texts by age
    aggregated_texts_by_age = valid_data_df.groupby('normalized_age')['processed_text'].agg(sum)

    # Calculate word frequencies and percentages
    word_freq_percentage_by_age = {}
    for age, texts in aggregated_texts_by_age.items():
        english_stopwords = set(stopwords.words('english'))
        filtered_words = [word for word in texts if word not in english_stopwords]
        word_freq = Counter(filtered_words)
        total_words = sum(word_freq.values())
        word_freq_percentage = {word: (count / total_words) * 100 for word, count in word_freq.items()}
        # Sort words by frequency percentage and get top 10
        top_10_words = sorted(word_freq_percentage.items(), key=lambda x: x[1], reverse=True)[:10]
        word_freq_percentage_by_age[age] = top_10_words

    return word_freq_percentage_by_age

# Extract top 10 words for each age
def extract_word_values(word_freq_data, target_word):
    """
    Extracts and returns the values associated with the target word for each age group
    in the provided data structure.

    Parameters:
    - word_freq_data: A dictionary with age groups as keys and lists of (word, value) tuples as values.
    - target_word: The word for which values are to be extracted across all age groups.

    Returns:
    - results_df: A pandas DataFrame with two columns: 'Age' and '{target_word} Value', where each row corresponds
      to an age group and its value for the target word. If the target word is not present, the value will be None.
    """
    # Initialize an empty dictionary to store the results
    results = {}
    
    # Iterate over each age group in the data
    for age, word_values in word_freq_data.items():
        # Initialize the value for the target word as None for each age group
        value_for_target_word = None
        
        # Search for the target word entry
        for word, value in word_values:
            if word == target_word:
                value_for_target_word = value
                break  # Stop searching once the target word is found
        
        # Assign the found value or None to the results dictionary
        results[age] = value_for_target_word

    # Convert the dictionary to a DataFrame and dynamically name the 'Value' column based on the target word
    results_df = pd.DataFrame(list(results.items()), columns=['Age', f'{target_word} Value'])

    # Return the DataFrame
    return results_df

def merge_word_values_by_age(word_freq_data, words):
    """
    Merges the values for a list of words across age groups into a single DataFrame.

    Parameters:
    - word_freq_data: A dictionary with age groups as keys and lists of (word, value) tuples as values.
    - words: A list of words to extract and merge values for.

    Returns:
    - merged_results_df: A pandas DataFrame with age groups as rows and each word's values as columns.
    """
    # Initialize an empty DataFrame to hold the merged results
    merged_results_df = pd.DataFrame()

    # Iterate over each word to extract its values and merge the results
    for word in words:
        # Extract values for the current word
        results_df = extract_word_values(word_freq_data, word)
        
        # Rename the column to reflect the current word's values
        results_df.rename(columns={f'{word} Value': f'{word}_value'}, inplace=True)
        
        # If it's the first word, initialize the merged DataFrame with the age and word's value
        if merged_results_df.empty:
            merged_results_df = results_df
        else:
            # For subsequent words, merge on 'Age' to ensure alignment across age groups
            merged_results_df = pd.merge(merged_results_df, results_df, on='Age', how='outer')

    # Return the final merged DataFrame
    return merged_results_df


# visualize the frequency of a specific word across different ages for male and female participants
def plot_word_frequency(word, merged_df, output_directory = '../figs/word_frequency_compare/'):

    # Check if the word columns exist in the DataFrame, and if not, initialize them to 0
    if f'{word}_male' not in merged_df.columns:
        merged_df[f'{word}_male'] = 0
    if f'{word}_female' not in merged_df.columns:
        merged_df[f'{word}_female'] = 0
        
    # Filling NaN values with 0 for plotting purposes
    plot_data = merged_df[['Age', f'{word}_male', f'{word}_female']].fillna(0)
    
    # Setting the figure size for better visibility, making it wider
    plt.figure(figsize=(14, 6))  # Increased width for a wider chart
    
    # Creating the bar chart
    # Setting the position of bars on the X-axis
    bar_width = 0.35
    r1 = range(len(plot_data))
    r2 = [x + bar_width for x in r1]
    
    # Making the plot
    plt.bar(r1, plot_data[f'{word}_male'], color='b', width=bar_width, edgecolor='grey', label='Male')
    plt.bar(r2, plot_data[f'{word}_female'], color='r', width=bar_width, edgecolor='grey', label='Female')
    
    # Adding labels and title
    plt.xlabel('Age', fontweight='bold')
    plt.xticks([r + bar_width/2 for r in range(len(plot_data))], plot_data['Age'], rotation='vertical')  # Rotate x-axis labels vertically
    plt.ylabel(f'{word.capitalize()} Frequency', fontweight='bold')
    plt.title(f'Comparison of "{word.capitalize()}" Frequency by Age and Gender', fontweight='bold')  # Updated title
    
    # Creating legend & showing the plot
    plt.legend()
    plt.tight_layout()  # Adjust layout to not cut off labels

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
            
    # Define the output path for saving the graph
    output_path = os.path.join(output_directory, f'{word}.jpg')
    
    # Save the plot to the specified output path
    plt.savefig(output_path)
    plt.close()  # Close the plot to avoid displaying it

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacksonzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jacksonzhao/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
# Manipulate data and separated by gender
# Define the path for input and output data
demographic_df = pd.read_csv('../data/demographic.csv')
cleaned_hm_df = pd.read_csv('../data/cleaned_hm.csv')

# Merging the datasets on 'wid' (writer ID)
merged_data = pd.merge(cleaned_hm_df, demographic_df, on='wid', how='inner')

# Filter for participants from the US
us_data = merged_data[merged_data['country'] == 'USA']

# Create datasets based on gender
male_data = us_data[us_data['gender'] == 'm']
female_data = us_data[us_data['gender'] == 'f']

# Save to csv
male_data.to_csv('../output/male_data.csv', index=False)
female_data.to_csv('../output/female_data.csv', index=False)

# Load male and female data separately
male_data_df, male_most_common, sentiment_word_freq_by_valid_age_male = process_data('../output/male_data.csv', 'age', 'cleaned_hm', 'male')
female_data_df, female_most_common, sentiment_word_freq_by_valid_age_female = process_data('../output/female_data.csv', 'age', 'cleaned_hm', 'female')

# Change words to percentage
total_words_per_age_group_male = sentiment_word_freq_by_valid_age_male.sum(axis=1)
sentiment_word_freq_percentages_male = sentiment_word_freq_by_valid_age_male.div(total_words_per_age_group_male, axis=0) * 100

total_words_per_age_group_female = sentiment_word_freq_by_valid_age_female.sum(axis=1)
sentiment_word_freq_percentages_female = sentiment_word_freq_by_valid_age_female.div(total_words_per_age_group_female, axis=0) * 100

# List all words
words = list(set(list(sentiment_word_freq_percentages_male.columns.to_list() + sentiment_word_freq_percentages_female.columns.to_list())))

# prepare and combine sentiment word frequency data from male and female datasets for analysis and visualization.
# Step 1: Rename columns for clarity
sentiment_word_freq_percentages_male.columns = [f'{col}_male' if col != 'age' else col for col in sentiment_word_freq_percentages_male.columns]
sentiment_word_freq_percentages_female.columns = [f'{col}_female' if col != 'age' else col for col in sentiment_word_freq_percentages_female.columns]

# Step 2: Set 'age' as index for both DataFrames
sentiment_word_freq_percentages_male.index.name = 'Age'
sentiment_word_freq_percentages_female.index.name = 'Age'

# Step 3: Perform an outer merge on the index (age)
merged_df = pd.merge(sentiment_word_freq_percentages_male, sentiment_word_freq_percentages_female, left_index=True, right_index=True, how='outer')

# Step 4: Reset index to convert 'age' into a column
merged_df.reset_index(inplace=True)

# Step 5: Show the modified DataFrame
merged_df['Age'] = merged_df['Age'].astype(int)
merged_df

Unnamed: 0,Age,killers_male,resigns_male,tranquil_male,exhilarating_male,granted_male,lamented_male,frustrations_male,expand_male,fond_male,...,wins_female,stall_female,dwell_female,stops_female,friends_female,disappointment_female,heroin_female,suffered_female,unemployment_female,worthwhile_female
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,3,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,4,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,17,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.379562,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,88,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
71,95,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
72,98,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
73,227,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [12]:
# Create Word_frequency_compare for target word value and age between male and female
for word in words:
    plot_word_frequency(word, merged_df)

In [13]:
# process the merged data with corrected approach for basic stopwords
word_freq_percentage_by_age_corrected_total = process_data_for_age_with_basic_stopwords_corrected(merged_data, 'cleaned_hm', 'age')
word_freq_percentage_by_age_corrected_male = process_data_for_age_with_basic_stopwords_corrected(male_data, 'cleaned_hm', 'age')
word_freq_percentage_by_age_corrected_female = process_data_for_age_with_basic_stopwords_corrected(female_data, 'cleaned_hm', 'age')

# Extract word frequency table for total, male, and female
word_frequency_total = merge_word_values_by_age(word_freq_percentage_by_age_corrected_total, words)
word_frequency_male = merge_word_values_by_age(word_freq_percentage_by_age_corrected_male, words)
word_frequency_female = merge_word_values_by_age(word_freq_percentage_by_age_corrected_female, words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = df[text_column].apply(simple_preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['normalized_age'] = df[age_column].apply(lambda x: np.nan if not str(x).isdigit() else int(float(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = df[text_colum