In [6]:
# Cell 1: Import required libraries
import pandas as pd
import os
import re

In [29]:
# Cell 1: Import required libraries
import pandas as pd
import os
import re

def clean_trump_dataset(file_path):
    """
    Reads Donald Trump's CSV, removes rows with links and pic.twitter,
    keeps only one column, and adds a 'person' column.
    
    Args:
        file_path (str): Path to the Trump CSV file
        
    Returns:
        pd.DataFrame: Cleaned DataFrame
    """
    # Read the CSV
    df = pd.read_csv(file_path)
    
    # Remove rows containing links or pic.twitter
    mask = ~(
        df['content'].str.contains('http://|https://|pic\.twitter', 
                                 regex=True, 
                                 na=False)
    )
    df = df[mask]
    
    # Keep only the 'content' column and add 'person' column
    df = df[['content']].copy()
    df['person'] = 'Donald Trump'
    
    return df

def reorder_trump_columns(trump_df):
    """
    Renames 'content' to 'text' and reorders columns to [person, text]
    
    Args:
        trump_df (pd.DataFrame): DataFrame with [content, person] columns
    
    Returns:
        pd.DataFrame: DataFrame with renamed and reordered columns [person, text]
    """
    # Rename content to text
    trump_df = trump_df.rename(columns={'content': 'text'})
    
    # Reorder columns
    trump_df = trump_df[['person', 'text']]
    
    return trump_df

In [8]:
# Cell 2: Function to process directory of CSVs
def combine_bob_ross_csv_files(directory_path):
    """
    Combines all CSV files in a directory into a single DataFrame.
    
    Args:
        directory_path (str): Path to directory containing CSV files
        
    Returns:
        pd.DataFrame: Combined DataFrame
    """
    # List to store individual DataFrames
    dfs = []
    
    # Iterate through all CSV files in the directory
    for file in os.listdir(directory_path):
        if file.endswith('.csv'):
            file_path = os.path.join(directory_path, file)
            df = pd.read_csv(file_path)
            dfs.append(df)
    
    # Concatenate all DataFrames
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

In [9]:
# Cell 3: Function to read and process final CSV
def process_holt_csv(file_path):
    """
    Reads the final CSV and drops the index column if present.
    
    Args:
        file_path (str): Path to the CSV file
        
    Returns:
        pd.DataFrame: Processed DataFrame
    """
    # Read the CSV
    df = pd.read_csv(file_path)
    
    # Drop index column if it exists
    if 'index' in df.columns:
        df = df.drop('index', axis=1)
    elif df.columns[0].lower() in ['unnamed: 0', 'index']:
        df = df.drop(df.columns[0], axis=1)
    
    return df

In [15]:
trump_df = clean_trump_dataset('data/donald_trump/realdonaldtrump.csv')
print("First dataset shape:", trump_df.shape)

First dataset shape: (32239, 2)


In [11]:
bob_ross_df = combine_bob_ross_csv_files('data/bob_ross')
print("Combined dataset shape:", bob_ross_df.shape)

Combined dataset shape: (13120, 2)


In [12]:
holt_df = process_holt_csv('data/holt/holt.csv')
print("Final dataset shape:", holt_df.shape)

Final dataset shape: (577, 2)


In [26]:
trump_df.columns

Index(['content', 'person'], dtype='object')

In [30]:
trump_df = reorder_trump_columns(trump_df)
trump_df.columns

Index(['person', 'text'], dtype='object')

In [31]:
bob_ross_df.columns

Index(['person', 'text'], dtype='object')

In [32]:
holt_df.columns

Index(['person', 'text'], dtype='object')

In [51]:
def clean_text(text):
    """
    Cleans text by:
    1. Removing leading/trailing quotation marks
    2. Removing @mentions (both with and without spaces)
    
    Args:
        text (str): Input text to clean
        
    Returns:
        str: Cleaned text
    """
    # Remove leading and trailing quotation marks
    text = text.strip('"')
    
    # Remove @mentions:
    # 1. Handles "@someone"
    # 2. Handles "@ someone"
    text = re.sub(r'@\s*\S+\s?', '', text)
    
    # Remove any extra whitespace that might have been created
    text = ' '.join(text.split())
    
    return text

def clean_trump_text(trump_df):
    """
    Applies text cleaning to Trump's dataset.
    
    Args:
        trump_df (pd.DataFrame): DataFrame with 'text' column
        
    Returns:
        pd.DataFrame: DataFrame with cleaned text
    """
    # Create a copy to avoid modifying the original
    cleaned_df = trump_df.copy()
    
    # Apply cleaning to the text column
    cleaned_df['text'] = cleaned_df['text'].apply(clean_text)
    
    # Print some example transformations
    print("Sample of transformations (first 3 rows):")
    for old, new in zip(trump_df['text'].head(3), cleaned_df['text'].head(3)):
        print(f"\nOriginal: {old}")
        print(f"Cleaned:  {new}")
        
    return cleaned_df

# Test examples:
# print(clean_text('"@someone is here"'))  # outputs: "is here"
# print(clean_text('"@ someone is here"'))  # outputs: "is here"
# print(clean_text('""@ someone is here""'))  # outputs: "is here"

In [52]:
trump_df = clean_trump_text(trump_df)

Sample of transformations (first 3 rows):

Original: Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!
Cleaned:  Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!

Original: Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!
Cleaned:  Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!

Original: "My persona will never be that of a wallflower - I’d rather build walls than cling to them" --Donald J. Trump
Cleaned:  My persona will never be that of a wallflower - I’d rather build walls than cling to them" --Donald J. Trump


In [53]:
def balance_datasets(trump_df, bob_ross_df, holt_df):
    """
    Balances all datasets to match the size of the smallest dataset
    by random sampling without replacement.
    
    Args:
        trump_df (pd.DataFrame): Trump dataset
        bob_ross_df (pd.DataFrame): Bob Ross dataset
        holt_df (pd.DataFrame): Holt dataset
        
    Returns:
        tuple: (balanced_trump_df, balanced_bob_ross_df, balanced_holt_df)
    """
    # Get the sizes of each dataset
    sizes = {
        'Trump': len(trump_df),
        'Bob Ross': len(bob_ross_df),
        'Holt': len(holt_df)
    }
    
    # Find the smallest dataset size
    min_size = min(sizes.values())
    
    print("Original dataset sizes:")
    for name, size in sizes.items():
        print(f"{name}: {size} rows")
    print(f"\nBalancing all datasets to {min_size} rows")
    
    # Sample each dataset to the minimum size
    balanced_trump = trump_df.sample(n=min_size, random_state=42) if len(trump_df) > min_size else trump_df
    balanced_bob_ross = bob_ross_df.sample(n=min_size, random_state=42) if len(bob_ross_df) > min_size else bob_ross_df
    balanced_holt = holt_df.sample(n=min_size, random_state=42) if len(holt_df) > min_size else holt_df
    
    # Verify the balancing
    print("\nNew dataset sizes:")
    print(f"Trump: {len(balanced_trump)} rows")
    print(f"Bob Ross: {len(balanced_bob_ross)} rows")
    print(f"Holt: {len(balanced_holt)} rows")
    
    return balanced_trump, balanced_bob_ross, balanced_holt

In [54]:
balanced_trump, balanced_bob_ross, balanced_holt = balance_datasets(trump_df, bob_ross_df, holt_df)

Original dataset sizes:
Trump: 32239 rows
Bob Ross: 13120 rows
Holt: 577 rows

Balancing all datasets to 577 rows

New dataset sizes:
Trump: 577 rows
Bob Ross: 577 rows
Holt: 577 rows


In [55]:
def fuse_datasets(trump_df, bob_ross_df, holt_df):
    """
    Combines the three balanced datasets into one shuffled dataset.
    
    Args:
        trump_df (pd.DataFrame): Balanced Trump dataset
        bob_ross_df (pd.DataFrame): Balanced Bob Ross dataset
        holt_df (pd.DataFrame): Balanced Holt dataset
    
    Returns:
        pd.DataFrame: The combined and shuffled dataset
    """
    # Combine all dataframes
    combined_df = pd.concat([trump_df, bob_ross_df, holt_df], ignore_index=True)
    
    # Shuffle the rows
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"Dataset summary:")
    print(f"Total rows: {len(combined_df)}")
    print("\nRows per person:")
    print(combined_df['person'].value_counts())
    
    return combined_df

In [56]:
final_df = fuse_datasets(balanced_trump, balanced_bob_ross, balanced_holt)

Dataset summary:
Total rows: 1731

Rows per person:
person
Bob Ross        577
holt            577
Donald Trump    577
Name: count, dtype: int64


In [57]:
final_df.columns

Index(['person', 'text'], dtype='object')

In [58]:
final_df.head

<bound method NDFrame.head of         person                                               text
0     Bob Ross  there we go then with a clean dry 2in brush ve...
1         holt  Good idea. Everyone? Gather round, so I can ca...
2     Bob Ross  apart from everybody else because you you pay ...
3         holt                  Peralta, what are you doing here?
4         holt                                     Rub, rub, rub.
...        ...                                                ...
1726  Bob Ross  And we can go back and get a little white here...
1727      holt               I know a way to watch porn in Korea.
1728  Bob Ross  If this is gonna be our light source right her...
1729      holt  Okay. It's your case. But if anything goes wro...
1730  Bob Ross  There sort of pretty, they standout. Now then,...

[1731 rows x 2 columns]>

In [60]:
def clean_remaining_quotes(fused_df):
    """
    Removes any remaining leading and trailing quotation marks from the text column
    of the fused dataset.
    
    Args:
        fused_df (pd.DataFrame): Combined dataset with 'text' column
        
    Returns:
        pd.DataFrame: DataFrame with cleaned text
    """
    # Create a copy to avoid modifying the original
    cleaned_df = fused_df.copy()
    
    # Strip quotation marks from all texts
    cleaned_df['text'] = cleaned_df['text'].str.strip('"')
    
    # Print some examples of rows that were changed
    mask = fused_df['text'] != cleaned_df['text']
    if mask.any():
        print("\nSample of rows where quotes were removed:")
        for old, new in zip(fused_df[mask]['text'].head(3), cleaned_df[mask]['text'].head(3)):
            print(f"\nOriginal: {old}")
            print(f"Cleaned:  {new}")
    else:
        print("No quotation marks were found to remove.")
    
    return cleaned_df

In [61]:
final_df = clean_remaining_quotes(final_df)


Sample of rows where quotes were removed:

Original: Hmm. I'm surprised she hasn't marched in here to say "I told you so."
Cleaned:  Hmm. I'm surprised she hasn't marched in here to say "I told you so.

Original: " Ugandan mogul Ashish Thakkar says that Donald Trump is ignorant about Africa" Wrong, if anything it's worse than I say
Cleaned:   Ugandan mogul Ashish Thakkar says that Donald Trump is ignorant about Africa" Wrong, if anything it's worse than I say

Original: Well, it says here it's scheduled "after everyone leaves."
Cleaned:  Well, it says here it's scheduled "after everyone leaves.


In [62]:
final_df.to_csv('data/final_fused_data.csv', index=False)
