In [1]:
#functions created with ChatGPT

In [2]:
# Created using ChatGPT
def plot_histogram(data, bins=30, title="Histogram", xlabel="Value", ylabel="Frequency", color='skyblue', alpha=0.8, max_xticks=10):
    """
    Draw and display a histogram of ``data`` in the fivethirtyeight style.

    Parameters:
        data (list or np.ndarray): Values to bin.
        bins (int or sequence): Bin count or explicit bin edges, forwarded to ``plt.hist``.
        title (str): Figure title.
        xlabel (str): Label for the x-axis.
        ylabel (str): Label for the y-axis.
        color (str): Fill colour of the bars.
        alpha (float): Opacity of the bars.
        max_xticks (int): Number of evenly spaced x ticks placed between the
            first and the last bin edge.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(8, 6))

    # Only the bin edges are needed afterwards (to position the x ticks).
    _, bin_edges, _ = plt.hist(data, bins=bins, color=color, alpha=alpha, edgecolor='black')

    plt.title(title, fontsize=16, weight='bold')
    for set_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_label(text, fontsize=12)

    # Spread max_xticks ticks evenly over the full binned range so the axis
    # stays readable regardless of how many bins were requested.
    lowest_edge, highest_edge = bin_edges[0], bin_edges[-1]
    plt.xticks(np.linspace(lowest_edge, highest_edge, max_xticks), rotation=45, fontsize=10)

    plt.grid(visible=True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [3]:
def plot_scatter(x, y, title="Scatter Plot", xlabel="X-axis", ylabel="Y-axis", color='blue', alpha=0.8, size=50):
    """
    Plots a scatter plot using the fivethirtyeight style with one x-axis tick per year.

    Parameters:
        x (list, np.ndarray or pandas.Series): Data for the x-axis. Either datetime
            objects or strings in "%Y-%m-%d" format (strings are parsed to datetime).
        y (list or np.ndarray): Data for the y-axis.
        title (str): Title of the scatter plot.
        xlabel (str): Label for the x-axis.
        ylabel (str): Label for the y-axis.
        color (str or list): Color of the points.
        alpha (float): Transparency of the points.
        size (float or list): Size of the points.

    Returns:
        None. The figure is rendered with ``plt.show()``.

    Raises:
        ValueError: If a string in ``x`` does not match "%Y-%m-%d".
    """
    # Inspect the first element by iteration rather than x[0]: on a pandas
    # Series x[0] is a *label* lookup (fails when the index does not contain 0)
    # and on an empty sequence it raises IndexError.
    first = next(iter(x), None)

    # Ensure x is in datetime format; non-string elements are passed through as-is
    if first is not None and not isinstance(first, datetime):
        x = [datetime.strptime(date, "%Y-%m-%d") if isinstance(date, str) else date for date in x]

    # Use fivethirtyeight style
    plt.style.use('fivethirtyeight')

    # Create the scatter plot
    plt.figure(figsize=(10, 6))
    plt.scatter(x, y, c=color, alpha=alpha, s=size, edgecolor='black', linewidth=0.5)

    # Set title and labels
    plt.title(title, fontsize=16, weight='bold')
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)

    # Adjust x-axis ticks to show years
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.YearLocator())  # Major ticks at yearly intervals
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # Format ticks as years (YYYY)
    plt.xticks(rotation=45, fontsize=10)  # Rotate ticks for better readability

    # Add grid and show the plot
    plt.grid(visible=True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


In [4]:
def plot_boxplot(df, title="Boxplot", xlabel="Categories", ylabel="Values", color='skyblue'):
    """
    Draw and display one notched boxplot per DataFrame column (fivethirtyeight style).

    Parameters:
        df (pandas.DataFrame): Input frame; every column becomes one box. Missing
            values are dropped per column before plotting.
        title (str): Figure title.
        xlabel (str): Label for the x-axis.
        ylabel (str): Label for the y-axis.
        color (str or list of str): A single fill colour applied to every box, or
            one colour per box (paired with the boxes in column order).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(10, 6))

    column_names = df.columns
    samples = [df[name].dropna() for name in column_names]
    box = plt.boxplot(samples, patch_artist=True, notch=True, vert=True, labels=column_names)

    # A single colour name is broadcast to all boxes; a list is used as given.
    fill_colors = [color] * len(column_names) if isinstance(color, str) else color
    for box_patch, fill in zip(box['boxes'], fill_colors):
        box_patch.set_facecolor(fill)
        box_patch.set_edgecolor('black')
        box_patch.set_linewidth(1.2)

    # Outlier markers
    for outlier in box['fliers']:
        outlier.set(marker='o', color='red', alpha=0.5)

    # Whiskers are dashed, caps are solid; both share colour and width
    for whisker_line in box['whiskers']:
        whisker_line.set(color='black', linewidth=1.2, linestyle='--')
    for cap_line in box['caps']:
        cap_line.set(color='black', linewidth=1.2)

    plt.title(title, fontsize=16, weight='bold')
    for set_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_label(text, fontsize=12)

    plt.grid(visible=True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [5]:
def plot_scatter_subplots(df, title="Scatter Plots", figsize=(12, 16), color='blue', alpha=0.8, size=50):
    """
    Draws one scatter plot per DataFrame column against the year of the datetime
    index, stacked vertically in a single figure using the fivethirtyeight style.

    The input DataFrame is not modified.

    Parameters:
        df (pandas.DataFrame): Input DataFrame with an index exposing ``.year``
            (e.g. a DatetimeIndex). Each column is plotted in its own subplot.
        title (str): Title for the entire figure.
        figsize (tuple): Figure size (width, height).
        color (str or list): Color for the points.
        alpha (float): Transparency of the points.
        size (float): Size of the points.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Use fivethirtyeight style
    plt.style.use('fivethirtyeight')

    cols = df.columns
    nrows = len(cols)  # One subplot per column, stacked in a single column

    # Create the figure and axes
    fig, axes = plt.subplots(nrows, 1, figsize=figsize)
    fig.suptitle(title, fontsize=16, weight='bold')

    # plt.subplots returns a bare Axes (not an array) when there is one subplot
    if nrows == 1:
        axes = [axes]

    # Years are kept in local variables instead of being written to df['Year']:
    # adding a column would mutate the caller's frame and make a second call
    # plot a spurious 'Year' panel.
    year_index = df.index.year
    year_values = year_index.to_numpy()
    year_ticks = year_index.unique().to_numpy()

    for ax, col in zip(axes, cols):
        # Scatter plot for each column against the year of the index
        ax.scatter(year_values, df[col], c=color, alpha=alpha, s=size, edgecolor='black', linewidth=0.5)

        # Set axis labels and title
        ax.set_title(f"Scatter: {col}", fontsize=12)
        ax.set_xlabel("Year", fontsize=10)
        ax.set_ylabel(col, fontsize=10)
        ax.grid(visible=True, linestyle='--', alpha=0.7)

        # One tick per distinct year, rotated for readability
        ax.set_xticks(year_ticks)
        ax.set_xticklabels(year_ticks, rotation=45)

    # Adjust layout for better spacing
    plt.tight_layout(rect=[0, 0, 1, 0.95])  # Leave space for the main title
    plt.show()


In [6]:
#docstring created with chatgpt
def prophet_format(data, target_col):
    """
    Reshape an indexed DataFrame into the two-column layout used as Prophet input.

    Parameters:
    ----------
    data : pandas.DataFrame
        Input DataFrame whose index holds the dates (parsed with
        ``pd.to_datetime(..., format='%Y')``) and which contains the target column.
    target_col : str
        Name of the target column in the input DataFrame (required).

    Returns:
    -------
    pandas.DataFrame
        A new DataFrame with a fresh 0..n-1 index and two columns, in this order:
        - 'y': values from the specified target column.
        - 'ds': the input index converted to datetime.
    """
    # Parse the index once; the target keeps its values but drops the old index
    # so that both columns line up positionally on a fresh RangeIndex.
    ds_values = pd.to_datetime(data.index, format='%Y')
    y_values = data[target_col].reset_index(drop=True)

    return pd.DataFrame({'y': y_values, 'ds': ds_values})

In [1]:
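#get_tts takes in a contributions DataFrame and builds a train/test split for one party
#party: the party being examined, 'liberal', 'conservative' etc
#election_years: the years flagged with 1 in the 'election_year' column
#train_end: the year the training data finishes on
#test_end: the last year of the test data the prediction is compared against (test data starts at train_end + 1)
#scaler: transform applied to the y variable: 'log', 'log10', 'root' or 'cube'
#lags: controls how many lagged copies of the y variable are added as features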
def get_tts(data, party, election_years, train_end, test_end, scaler = 'None',  lags = 0):
    """
    Build a year-based train/test split of total contributions for one party.

    Contributions are filtered to ``party``, summed per 'year', and augmented with
    an 'election_year' flag, a 'pre_bill_c-24' flag (1 before '2004', 0 from
    '2004' on) and, optionally, lagged copies of the target.

    Parameters:
        data (pandas.DataFrame): Must contain the columns
            'political_party_of_recipient', 'year' and 'total_contribution'.
        party (str): Value of 'political_party_of_recipient' to keep.
        election_years (collection): Year labels flagged with 1 in 'election_year';
            compared against the grouped 'year' index values with ``in``.
        train_end (int): Last year of the training period. The test period starts
            at ``train_end + 1``.
        test_end (int): Last year of the test period.
        scaler (str): Transform applied to 'total_contribution': 'log', 'log10',
            'root' (square root) or 'cube' (cube root). Any other value leaves
            the target unscaled.
        lags (int): Number of lag columns ('lag_1' ... 'lag_<lags>') to add.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the X frames hold every
        column except 'total_contribution' and the y series hold that column.

    Note:
        The year slices use string labels ('2004', ``str(train_end)``, ...), so
        the grouped 'year' index must support string-label slicing.
    """
    #filter dataset into party groups and total the contributions per year
    is_party = data['political_party_of_recipient'] == party
    party_contributions = data.loc[is_party].groupby('year')[['total_contribution']].sum()

    #1 for rows in an election year, 0 otherwise
    party_contributions['election_year'] = [
        1 if year in election_years else 0 for year in party_contributions.index
    ]

    #1 for every year before 2004, 0 from 2004 onwards
    party_contributions['pre_bill_c-24'] = 1
    party_contributions.loc['2004':, 'pre_bill_c-24'] = 0

    #apply scaler to the y_variable; unrecognised scalers leave it untouched
    transforms = {'log': np.log, 'log10': np.log10, 'root': np.sqrt, 'cube': np.cbrt}
    transform = transforms.get(scaler)
    if transform is not None:
        party_contributions['total_contribution'] = transform(party_contributions['total_contribution'])

    #add lag columns lag_1 .. lag_<lags>; the upper bound is inclusive so that
    #lags=1 really produces one lag column
    for n in range(1, lags + 1):
        party_contributions[f'lag_{n}'] = party_contributions['total_contribution'].shift(n)

    #remove rows with NaN values (the first rows once lags are added)
    party_contributions = party_contributions.dropna()

    #Train_Test_split (label slices are inclusive on both ends)
    train = party_contributions.loc[: str(train_end)]
    test = party_contributions.loc[str(train_end + 1): str(test_end)]

    X_train = train.drop(columns = ['total_contribution'])
    y_train = train['total_contribution']
    X_test = test.drop(columns = ['total_contribution'])
    y_test = test['total_contribution']

    return X_train, y_train, X_test, y_test

In [5]:
#Take in a train/test split and fit a linear regression model
#input_df is accepted but is not used by the function body; the results are returned as a new one-row DataFrame
#scaler input takes in the scaler used in creating the tts and unscales the resulting predictions
#party input fills the party column in the output

def lr_results(X_train, y_train, X_test, y_test, input_df, party, scaler = None, model_name = None ):
    """
    Fit a linear regression on the training split and summarise its test error.

    Predictions and ``y_test`` are converted back to the original scale before
    the error metrics are computed.

    Parameters:
        X_train, y_train: Training features and target passed to ``LinearRegression.fit``.
        X_test, y_test: Test features and target used for prediction and scoring.
        input_df: Accepted for interface compatibility; not used by this function.
        party (str): Stored in the 'party' column of the result.
        scaler (str or None): Transform that was applied to the target when the
            split was created: None or 'None' (no scaling), 'log', 'log10',
            'root' or 'cube'. Stored unchanged in the 'scaler' column.
        model_name (str or None): Stored in the 'model_name' column of the result.

    Returns:
        pandas.DataFrame: One row with the columns 'model_name', 'model', 'party',
        'scaler', 'rmse', 'mae' and 'preds' (the unscaled prediction array).

    Raises:
        ValueError: If ``scaler`` is not one of the supported values.
    """
    #inverse of each target transform, keyed by scaler name
    inverse_transforms = {
        'log': np.exp,
        'log10': lambda values: np.power(10.0, values),
        'root': np.square,
        'cube': lambda values: np.power(values, 3),
    }

    #both None and the string 'None' mean that the target was not scaled
    unscaled = scaler is None or scaler == 'None'
    if not unscaled and scaler not in inverse_transforms:
        raise ValueError(
            f"Unsupported scaler {scaler!r}; expected None, 'None', 'log', 'log10', 'root' or 'cube'"
        )

    #instantiate model and fit on training data
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    #predict, then unscale predictions and test target if a scaler was used
    preds = lr.predict(X_test)
    if not unscaled:
        invert = inverse_transforms[scaler]
        preds = invert(preds)
        y_test = invert(y_test)

    #get rmse and mae on the original scale
    rmse = round(root_mean_squared_error(y_test, preds), 0)
    mae = round(mean_absolute_error(y_test, preds),0)

    #place all results into a dictionary
    res = {'model_name': model_name,
            'model': 'lr',
           'party': party,
           'scaler' : scaler, 
           'rmse': rmse, 
           'mae':mae, 
            'preds': preds}

    #return a one-row DataFrame of results
    return pd.DataFrame([res])
    

In [7]:
def extract_preds(df, model_name):
    """
    Return the stored predictions of the first row whose 'model_name' matches.

    Parameters:
        df (pandas.DataFrame): Results frame with 'model_name' and 'preds' columns.
        model_name: Value to look up in the 'model_name' column.

    Returns:
        The 'preds' entry of the first matching row.

    Raises:
        IndexError: If no row matches ``model_name``.
    """
    is_match = df['model_name'] == model_name
    matching_preds = df.loc[is_match, 'preds']

    return matching_preds.values[0]