In [None]:
#doing things with data
import pandas as pd
import itertools
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import math

import matplotlib.pyplot as plt
import seaborn as sns

#import
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LassoLars
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
#import r2
from sklearn.metrics import r2_score
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# Exploration

In [None]:
# load the Excel workbook used for exploration
df = pd.read_excel('SB_dataframe_adjusted.xlsx')
# preview the first rows only, rather than rendering the whole frame
df.head()

In [None]:
# quick look at the raw `eci` column before splitting
df['eci']

In [None]:
# quick look at the raw `gdp_deflated` column before splitting
df['gdp_deflated']


In [None]:
def split_data(df, test_size=0.25, random_state=123):
    '''
    Split a DataFrame into train and test DataFrames.

    Parameters
    ----------
    df : pandas DataFrame
        The data to split.
    test_size : float, default 0.25
        Fraction of rows placed in the test set (the default gives a
        75/25 train/test split).
    random_state : int, default 123
        Seed passed to train_test_split so the split is reproducible.

    Returns
    -------
    (train, test) : tuple of pandas DataFrames
    '''
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    return train, test

In [None]:
# split the data into train and test sets, then preview the train set
train, test = split_data(df)
train.head(5)

In [None]:
def plot_that_target(train):
    """
    Visualize the distribution of the target variable.

    Parameters:
    -----------
    train: pandas DataFrame
        Must contain the column 'adjusted_revenue_S'.

    Returns:
    --------
    None. Draws a histogram of 'adjusted_revenue_S' from the train data set.
    """
    sns.histplot(data=train, x='adjusted_revenue_S')
    plt.title('Starbucks revenue in Billions')
    plt.xlabel('Revenue in Billions')
    plt.ylabel('Quarters')
    # plt.show must be *called*; the bare attribute reference was a no-op
    plt.show()

    return

In [None]:
# draw the target histogram for the train set
plot_that_target(train=train)

# Check for normality
$H_{0}$ The target is normally distributed.
    
$H_{a}$ The target is not normally distributed.

In [None]:
# Shapiro-Wilk normality test on the target column of the train set
shapiro_result = stats.shapiro(train['adjusted_revenue_S'])
statistic, p_value = shapiro_result

# Report the test statistic and its p-value
print("Shapiro-Wilk Test")
print("Statistic:", statistic)
print("p-value:", p_value)

- based on the p-value being less than 0.05, we reject the null hypothesis and can infer that the target total revenue is not normally distributed.
- using this information, we will not assume normality
    - Pearson's correlation test requires that both variables be normally distributed
    - since our target is not normally distributed, we will use Spearman's rank correlation test (a non-parametric test) as our statistical test for all continuous-to-continuous variable testing.

# View Pairwise relationships

In [None]:
def plot_variable_pairs(train):
    """
    Draw a corner pairplot of every numeric column in `train`, with a red
    regression line on each off-diagonal panel.

    Parameters
    ----------
    train : pandas DataFrame

    Returns
    -------
    None. The figure is shown with plt.show().
    """
    sns.set(style="ticks")
    # No `hue`: the target is continuous, so using it as hue would create one
    # group per distinct value and try to fit a regression through each single
    # point, while also removing the target from the plotted variables.
    sns.pairplot(train, kind="reg", corner=True, plot_kws={'line_kws': {'color': 'red'}})
    plt.show()

In [None]:
# render the pairwise relationship grid for the train set
plot_variable_pairs(train=train)

# Uni-variate analysis

In [None]:
def univariate_visual(df):
    '''
    Draw one histogram per column of `df`, laid out on a grid three plots wide.

    Parameters
    ----------
    df : pandas DataFrame

    Returns
    -------
    None. The figure is shown with plt.show(); nothing is drawn for a
    DataFrame with no columns.
    '''
    plots_per_row = 3
    num_cols = df.shape[1]
    if num_cols == 0:
        # plt.subplots cannot build a grid with zero rows
        return

    num_rows = math.ceil(num_cols / plots_per_row)
    # squeeze=False keeps `axes` a 2-D array whatever the grid shape
    fig, axes = plt.subplots(num_rows, plots_per_row, figsize=(14, 4*num_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, df.columns):
        sns.histplot(df[col], ax=ax)
        ax.set_title(col)
        ax.tick_params(rotation=45)

    # hide the leftover panels when the column count is not a multiple of three
    for ax in axes[num_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# histogram every column of the train set
univariate_visual(df=train)

# Bi-variate analysis

# Brent oil price per barrel

Does brent oil price per barrel impact revenue at Starbucks?

$H_{0}$ Brent oil price per barrel does not influence revenue.

$H_{a}$ Brent oil price per barrel does influence revenue.

### Visualize

In [None]:
# Revenue on the x-axis, Brent oil price on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['brent_oil']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Brent oil price per barrel')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Brent oil price per barrel')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and brent_oil on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['brent_oil'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by brent oil price per barrel.
- ***this feature will not be moved into modeling***

# Employment Cost Index

Does the Employment Cost Index impact revenue at Starbucks?

$H_{0}$ Employment Cost Index does not influence revenue.

$H_{a}$ Employment Cost Index does influence revenue.

### Visualize

In [None]:
# Revenue on the x-axis, ECI on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['eci']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. ECI')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('ECI')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and eci on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['eci'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the Spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by ECI.
- ***this feature will be moved into modeling***

# Gross domestic product implicit price deflator

Does the Gross domestic product implicit price deflator impact revenue at Starbucks?

$H_{0}$ Gross domestic product implicit price deflator does not influence revenue.

$H_{a}$ Gross domestic product implicit price deflator does influence revenue.

In [None]:
# Revenue on the x-axis, GDP deflator on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['gdp_deflated']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. GDP deflated')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('GDP deflated')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and gdp_deflated on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['gdp_deflated'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the Spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by the Gross domestic product implicit price deflator.
- ***this feature will be used in modeling***.

# Consumer Price Index

Does the Consumer Price Index impact revenue at Starbucks?

$H_{0}$ Consumer Price Index does not influence revenue.

$H_{a}$ Consumer Price Index does influence revenue.

In [None]:
# Revenue on the x-axis, CPI on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['cpi_all_items_avg']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. CPI')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('CPI')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and cpi_all_items_avg on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['cpi_all_items_avg'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by Consumer Price Index.
- ***this feature will be used in modeling.***

# Government spending

Does Government spending impact revenue at Starbucks?

$H_{0}$ Government spending does not influence revenue.

$H_{a}$ Government spending does influence revenue.

In [None]:
# Revenue on the x-axis, government spending on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['government_spending']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Government spending')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Government spending')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and government_spending on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['government_spending'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by Government spending.
- ***this feature will be used in modeling.***

# Median household income

Does Median household income impact revenue at Starbucks?

$H_{0}$ Median household income does not influence revenue.

$H_{a}$ Median household income does influence revenue.



In [None]:
# Revenue on the x-axis, median household income on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['median_house_income']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Median household income')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Median household income')
plt.legend()
plt.show()

In [None]:
# Spearman's rank correlation between revenue and median_house_income on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['median_house_income'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

In [None]:
# Spearman's rank correlation between revenue and ease_of_doing_business on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['ease_of_doing_business'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)




- based on the p-value of the Spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by Median household income.
- ***this feature will be used in modeling***











# Federal fund rate

Does the Federal fund rate impact revenue at Starbucks?

$H_{0}$ Federal fund rate does not influence revenue.

$H_{a}$ Federal fund rate does influence revenue.

In [None]:








# Revenue on the x-axis, federal fund rate on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['federal_fund_rate']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Federal fund rate')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Federal fund rate')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and federal_fund_rate on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['federal_fund_rate'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)









- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by Federal fund rate.
- ***this feature will not be used in modeling.***





# Dow Jones Industrial Average








Does the Dow impact revenue at Starbucks?

$H_{0}$ The Dow does not influence revenue.

$H_{a}$ The Dow does influence revenue.

In [None]:
# Revenue on the x-axis, Dow Jones Industrial Average on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['dow']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Dow Jones Industrial Average')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Dow Jones Industrial Average')
plt.legend()
plt.show()















### Stats test

In [None]:

# Spearman's rank correlation between revenue and dow on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['dow'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)


- based on the p-value of the spearmans correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by the Dow.
- ***this feature will not be used in modeling***


# S&P 500 Average









Does the S&P 500 Average impact revenue at Starbucks?

$H_{0}$ S&P 500 Average does not influence revenue.

$H_{a}$ S&P 500 Average does influence revenue.

In [None]:
# Revenue on the x-axis, S&P 500 on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['s_and_p']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. S&P 500')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('S&P 500')
plt.legend()
plt.show()

### Stats test

In [None]:
# list the column labels available in the train set
train.columns

In [None]:
# reference list of feature column names (displayed as the cell output)
[
    'population', 'median_house_income', 'unemp_rate', 'home_ownership_rate',
    'government_spending', 'gdp_deflated', 'violent_crime_rate', 'cpi_all_items_avg',
    'avg_precipitation', 'palmer_drought_index', 'eci', 'dow',
    's_and_p', 'gas_perc_change', 'prime', 'gini',
    'Man_new_order', 'hdi', 'construction_res', 'cli',
    'auto_loan', 'velocity_of_money', 'loans_and_leases', 'wti',
    'soy', 'brent_oil', 'case_shiller_index', 'number_of_disaster',
    'consumer_confidence_index', 'business_confidence_index', 'c_e_s_housing', 'c_e_s_health',
    'c_e_s_entertainment', 'ease_of_doing_business', 'wars_started',
]

In [None]:
# Run Spearman's rank correlation test for the S&P 500 column, the feature
# this section is about (the cell previously tested avg_precipitation)
correlation, p_value = stats.spearmanr(train.adjusted_revenue_S, train.s_and_p)

# Print the test results
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the Spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by the S&P 500.
- ***we will not push this feature into our model***

# Gas price change percentage

Does percentage gas price change impact revenue at Starbucks?

$H_{0}$ Percentage gas price change does not influence revenue.

$H_{a}$ Percentage gas price change does influence revenue.

In [None]:
# Revenue on the x-axis, percentage gas price change on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['gas_perc_change']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Percentage gas price change')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Percentage gas price change')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and gas_perc_change on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['gas_perc_change'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by percentage gas price change.
- ***we will not push this feature into our model***

# Prime bank loan rate

Does the Prime bank loan rate impact revenue at Starbucks?

$H_{0}$ Prime bank loan rate does not influence revenue.

$H_{a}$ Prime bank loan rate does influence revenue.

In [None]:
# Revenue on the x-axis, prime bank loan rate on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['prime']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Prime bank loan rate')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Prime bank loan rate')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and prime on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['prime'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by the Prime bank loan rate.
- ***we will not push this feature into our model***

# Gini index

Does the Gini index impact revenue at Starbucks?

$H_{0}$ Gini index does not influence revenue.

$H_{a}$ Gini index does influence revenue.

In [None]:
# Revenue on the x-axis, Gini index on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['gini']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Gini index')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Gini index')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and gini on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['gini'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by the Gini index.
- ***we will not push this feature into our model***

# Manufacturer new orders

Do Manufacturer new orders impact revenue at Starbucks?

$H_{0}$ Manufacturer new orders do not influence revenue.

$H_{a}$ Manufacturer new orders do influence revenue.



In [None]:
# Revenue on the x-axis, manufacturer new orders on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['Man_new_order']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Manufacturer new orders')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Manufacturer new orders')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and Man_new_order on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['Man_new_order'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by manufacturer new orders.
- ***we will push this feature into our model***

# Human development index

Does the Human development index impact revenue at Starbucks?

$H_{0}$ Human development index does not influence revenue.

$H_{a}$ Human development index does influence revenue.

In [None]:
# Revenue on the x-axis, human development index on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['hdi']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Human development index')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Human development index')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and hdi on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['hdi'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by the human development index.
- ***we will push this feature into our model***

# Residential Construction

Does residential construction impact revenue at Starbucks?

$H_{0}$ Residential construction does not influence revenue.

$H_{a}$ Residential construction does influence revenue.

In [None]:
# Revenue on the x-axis, residential construction on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['construction_res']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Residential construction')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Residential construction')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and construction_res on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['construction_res'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by residential construction.
- ***we will not push this feature into our model***

# Composite leading indicator

Does the composite leading indicator impact revenue at Starbucks?

$H_{0}$ Composite leading indicator does not influence revenue.

$H_{a}$ Composite leading indicator does influence revenue.


In [None]:
# Revenue on the x-axis, composite leading indicator on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['cli']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Composite leading indicator')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Composite leading indicator')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and cli on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['cli'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by the composite leading indicator.
- ***we will not push this feature into our model***

# Auto loan rate

Do auto loan rates impact revenue at Starbucks?

$H_{0}$ Auto loan rates do not influence revenue.

$H_{a}$ Auto loan rates do influence revenue.

In [None]:
# Revenue on the x-axis, auto loan rates on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['auto_loan']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Auto loan rates')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Auto loan rates')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and auto_loan on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['auto_loan'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by auto loan rates.
- ***we will push this feature into our model***

# Velocity of money

Does velocity of money impact revenue at Starbucks?

$H_{0}$ Velocity of money does not influence revenue.

$H_{a}$ Velocity of money does influence revenue.

In [None]:
# Revenue on the x-axis, velocity of money on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['velocity_of_money']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Velocity of money')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Velocity of money')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and velocity_of_money on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['velocity_of_money'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by the velocity of money.
- ***we will push this feature into our model***

# Loans and leases other than vehicle and real estate

Do loans and leases other than vehicle and real estate impact revenue at Starbucks?

$H_{0}$ Loans and leases other than vehicle and real estate do not influence revenue.

$H_{a}$ Loans and leases other than vehicle and real estate do influence revenue.

In [None]:
# Revenue on the x-axis, loans and leases on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['loans_and_leases']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Loans and leases other than vehicle and real estate')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Loans and leases other than vehicle and real estate')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and loans_and_leases on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['loans_and_leases'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by loans and leases other than vehicle and real estate.
- ***we will not push this feature into our model***

# West Texas intermediate oil

Does West Texas intermediate oil impact revenue at Starbucks?

$H_{0}$ West Texas intermediate oil does not influence revenue.

$H_{a}$ West Texas intermediate oil does influence revenue.

In [None]:
# Revenue on the x-axis, West Texas intermediate oil on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['wti']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. West Texas intermediate oil')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('West Texas intermediate oil')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and wti on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['wti'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being less than 0.05, we reject the null hypothesis and can infer that Starbucks revenue is influenced by West Texas intermediate oil.
- ***we will push this feature into our model***

# Soy bean prices

Do soy bean prices impact revenue at Starbucks?

$H_{0}$ Soy bean prices do not influence revenue.

$H_{a}$ Soy bean prices do influence revenue.

In [None]:
# Revenue on the x-axis, soy bean prices on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['soy']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Soy bean prices')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Soy bean prices')
plt.legend()
plt.show()

### Stats test

In [None]:
# Spearman's rank correlation between revenue and soy on the train set
spearman_result = stats.spearmanr(train['adjusted_revenue_S'], train['soy'])
correlation, p_value = spearman_result

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by soy bean prices.
- ***we will not push this feature into our model***

# Misery index

Does the misery index impact revenue at Starbucks?

$H_{0}$ The misery index does not influence revenue.

$H_{a}$ The misery index does influence revenue.

In [None]:
# Revenue on the x-axis, misery index on the y-axis
revenue = train['adjusted_revenue_S']
feature = train['misery_index']
plt.scatter(revenue, feature, label='Data')

# Least-squares line of the feature against revenue, drawn in red over the points
slope, intercept, r_value, p_value, std_err = stats.linregress(revenue, feature)
line = intercept + slope * revenue
plt.plot(revenue, line, color='red', label='Regression Line')

# Annotate and render the figure
plt.title('Revenue vs. Misery index')
plt.xlabel('Total Revenue at Starbucks')
plt.ylabel('Misery index')
plt.legend()
plt.show()

### Stats test

In [None]:
# Run Spearman's rank correlation test on the misery index, the column
# plotted for this section (the cell previously tested unemp_rate)
# NOTE(review): assumes `misery_index` exists in the frame, as the scatter cell does - confirm
correlation, p_value = stats.spearmanr(train.adjusted_revenue_S, train.misery_index)

# Print the test results
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

- based on the p-value of the spearman's rank correlation test being greater than 0.05, we fail to reject the null hypothesis and can infer that Starbucks revenue is not influenced by the misery index.
- ***we will not push this feature into our model***

#### Exploration Summary
- visualized
- checked target for normality to determine parametric or non-parametric testing
    - target is not normally distributed, therefore spearman's rank correlation test was used for all features
- found that xxxx features tested are significant to Starbucks revenue


# Multi-variate analysis


In [None]:
def multivariate_exploration(train, target_variable):
    '''
    Plot a set of visualizations relating every column of `train` to the target.

    Produces, in order:
      * one scatter plot per float64/int64 column against the target,
      * one box plot per object-dtype column against the target,
      * a correlation heatmap of the numeric columns,
      * a pairplot of the first four numeric columns plus the target.

    Parameters:
        train (pandas DataFrame): The data to explore. Only this frame is
            plotted; no notebook-level variables are read.
        target_variable (str): Name of a float64/int64 column in `train`.

    Returns:
        None. The figures are shown with plt.show().

    Raises:
        ValueError: If `target_variable` is not a float64/int64 column of `train`.
    '''

    # Numeric variables for scatter plots
    numeric_vars = train.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Remove target variable from the list of numeric variables
    # (same exception type as list.remove, but with a readable message)
    if target_variable not in numeric_vars:
        raise ValueError(
            f"target_variable {target_variable!r} is not a float64/int64 column of train"
        )
    numeric_vars.remove(target_variable)

    # Plot scatter plots for numeric variables
    for var in numeric_vars:
        sns.scatterplot(x=var, y=target_variable, data=train)
        plt.xlabel(var)
        plt.ylabel(target_variable)
        plt.title(f'{var} vs. {target_variable}')
        plt.show()

    # Categorical variables for box plots
    categorical_vars = train.select_dtypes(include=['object']).columns.tolist()

    # Plot box plots for categorical variables
    for var in categorical_vars:
        sns.boxplot(x=var, y=target_variable, data=train)
        plt.xlabel(var)
        plt.ylabel(target_variable)
        plt.title(f'{var} vs. {target_variable}')
        plt.show()

    # Correlation heatmap.
    # Restrict to numeric columns first: DataFrame.corr() raises on object
    # columns in recent pandas releases instead of silently dropping them.
    plt.figure(figsize=(12, 10))
    corr = train.select_dtypes(include='number').corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, square=True, cbar_kws={'shrink': 0.8})
    plt.title('Correlation Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Pairplot for selected variables (first 4 numeric variables + target).
    # Uses the `train` argument rather than a global frame, so the plot
    # shows exactly the data that was passed in.
    selected_vars = numeric_vars[:4] + [target_variable]
    sns.pairplot(train[selected_vars], diag_kind='hist', plot_kws={'alpha': 0.5})
    plt.show()

In [None]:
# Explore against the target column used throughout this notebook
# (`adjusted_revenue_S`); the previous name `adjusted_revenue_B` is not
# referenced anywhere else in the analysis.
multivariate_exploration(train, 'adjusted_revenue_S')

In [None]:
# Spearman's rank correlation between revenue and the number of wars started
correlation, p_value = stats.spearmanr(
    train['adjusted_revenue_S'],
    train['wars_started'],
)

# Report the coefficient and its p-value
print("Spearman's Rank Correlation Test")
print("Correlation Coefficient:", correlation)
print("p-value:", p_value)

# prepare data for modeling

In [None]:
# Load the feature values that the final prediction is made from.
# NOTE(review): the file name refers to Ford while the rest of this notebook
# models Starbucks revenue — confirm this is the intended input file.
this_is_it = pd.read_csv('values_for_prediction_ford_adjusted.csv')
# Display the loaded frame
this_is_it

In [None]:
# Keep only the model's feature columns from the loaded prediction data
# (same columns, same order, as the feature list used by scaled_df)
starbucks_revenue_prediction = this_is_it.loc[:, [
    'population',
    'median_house_income',
    'unemp_rate',
    'home_ownership_rate',
    'government_spending',
    'gdp_deflated',
    'violent_crime_rate',
    'cpi_all_items_avg',
    'eci',
    'dow',
    's_and_p',
    'Man_new_order',
    'hdi',
    'auto_loan',
    'velocity_of_money',
    'wti',
    'brent_oil',
    'case_shiller_index',
    'number_of_disaster',
    'c_e_s_housing',
    'c_e_s_health',
    'ease_of_doing_business',
]]
starbucks_revenue_prediction

In [None]:
def scaled_df(train, test, starbucks_revenue_prediction):
    """
    Split train/test into features and target, and MinMax-scale the features.

    The scaler is fitted on the training features only and then applied to
    the test features and to the prediction data, so no information from
    test or prediction rows leaks into the scaling.

    Parameters:
    train (pandas DataFrame): The training data (features + `adjusted_revenue_S`).
    test (pandas DataFrame): The test data (features + `adjusted_revenue_S`).
    starbucks_revenue_prediction (pandas DataFrame): One or more rows of
        feature values to predict on. Must contain every feature column;
        extra columns and column order are ignored.

    Returns:
    Tuple of:
        X_train_scaled (pandas DataFrame): The scaled training features.
        X_test_scaled (pandas DataFrame): The scaled test features.
        y_train (pandas Series): The target variable for the training data.
        y_test (pandas Series): The target variable for the test data.
        X_train (pandas DataFrame): The original (unscaled) training features.
        starbucks_revenue_prediction_scaled (pandas DataFrame): The scaled prediction features.

    Raises:
        KeyError: If any frame is missing one of the feature columns.
    """
    # Single source of truth for the model's features (and their order)
    feature_cols = ['population', 'median_house_income', 'unemp_rate',
                    'home_ownership_rate', 'government_spending',
                    'gdp_deflated', 'violent_crime_rate',
                    'cpi_all_items_avg', 'eci', 'dow', 's_and_p',
                    'Man_new_order', 'hdi', 'auto_loan', 'velocity_of_money',
                    'wti', 'brent_oil', 'case_shiller_index', 'number_of_disaster',
                    'c_e_s_housing', 'c_e_s_health', 'ease_of_doing_business']

    X_train = train[feature_cols]
    X_test = test[feature_cols]

    y_train = train.adjusted_revenue_S
    y_test = test.adjusted_revenue_S

    # Making our scaler
    scaler = MinMaxScaler()

    # Fitting our scaler on train only and using it to transform train and test data
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)

    # Scaling the prediction data. Selecting by `feature_cols` guarantees the
    # column order the scaler was fitted with, and transforming the frame
    # directly (instead of reshaping its values into a single row) also
    # works when more than one prediction row is supplied.
    X_pred = starbucks_revenue_prediction[feature_cols]
    starbucks_revenue_prediction_scaled = pd.DataFrame(scaler.transform(X_pred),
                                                       columns=X_pred.columns,
                                                       index=X_pred.index)

    return X_train_scaled, X_test_scaled, y_train, y_test, X_train, starbucks_revenue_prediction_scaled


In [None]:
# Build the scaled feature frames, the targets, the unscaled training
# features and the scaled prediction data in one call
X_train_scaled, X_test_scaled, y_train, y_test, X_train, starbucks_revenue_prediction_scaled = scaled_df(train, test, starbucks_revenue_prediction)
# Preview the first rows of the scaled training features
X_train_scaled.head()

In [None]:
# Display the scaled prediction features returned by scaled_df
starbucks_revenue_prediction_scaled 

In [None]:
def metrics_reg(y, yhat):
    """
    Compute regression metrics for a set of predictions.

    Parameters:
        y (array-like): True target values.
        yhat (array-like): Predicted target values, same length as `y`.

    Returns:
        tuple: (rmse, r2) — root mean squared error and R^2 score.
    """
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated/removed in newer scikit-learn releases,
    # while this form gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(y, yhat))
    r2 = r2_score(y, yhat)
    return rmse, r2

In [None]:
def baseline_model(train, y_train):
    """
    Creates a baseline model using the mean of the target variable and evaluates its performance.

    Parameters:
        train (pandas DataFrame): The training data. Kept for backward
            compatibility with existing callers; it is not used, because the
            baseline depends only on the target.
        y_train (pandas Series): The target variable for the training data.

    Returns:
        pandas DataFrame: A one-row DataFrame with columns `model`, `rmse`
        and `r2` holding the evaluation metrics of the baseline model.

    The baseline prediction is the mean of `y_train`, rounded to two decimals.
    Because of that rounding the prediction is not exactly the mean, so the
    reported R^2 can be a tiny negative number rather than exactly 0.

    Additionally, the function prints the baseline value.
    """
    # set baseline (computed once and reused for both metrics and printing)
    baseline = round(y_train.mean(), 2)

    # One constant prediction per target value. Sized from y_train (not
    # train) so the two arrays passed to the metrics always match in length.
    baseline_array = np.full(len(y_train), baseline)

    # Evaluate the baseline rmse and r2
    rmse, r2 = metrics_reg(y_train, baseline_array)

    # add results to a dataframe for comparison
    metrics_df = pd.DataFrame(data=[
    {
        'model':'Baseline',
        'rmse':rmse,
        'r2':r2
    }
    ])

    # print baseline
    print(f' Baseline mean is : {baseline}')
    return metrics_df

In [None]:
# Start the model-comparison table with the mean-baseline row
metrics_df = baseline_model(train, y_train)
# Display the table
metrics_df

In [None]:
def multiple_regression(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df, n_features_to_select=15):
    """
    Performs multiple regression using Recursive Feature Elimination (RFE) and evaluates the model's performance.

    Parameters:
        X_train_scaled (pandas DataFrame): The scaled feature variables of the training data.
        X_test_scaled (pandas DataFrame): The scaled feature variables of the test data.
        y_train (pandas Series): The target variable for the training data.
        y_test (pandas Series): The target variable for the test data.
        metrics_df (pandas DataFrame): A DataFrame to store the evaluation metrics.
            It is modified in place: the result is written to row label 1.
        n_features_to_select (int, optional): Number of features RFE keeps.
            Defaults to 15, the value previously hard-coded here.

    Returns:
        tuple:
            metrics_df (pandas DataFrame): The same DataFrame, with the
                'Multiple Regression' row (test RMSE and R^2) added.
            selected_features (list of str): Names of the columns RFE kept,
                in the column order of `X_train_scaled`.
            model (LinearRegression): The regression fitted on the
                RFE-selected training columns only; inputs to `predict`
                must be reduced to `selected_features` first.
    """
    # Define the model
    model = LinearRegression()

    # Create the RFE object
    rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)

    # Fit the RFE object to the training data
    rfe.fit(X_train_scaled, y_train)

    # Get the selected features
    selected_features = X_train_scaled.columns[rfe.support_].tolist()

    # Transform the data using the selected features
    X_train_rfe = rfe.transform(X_train_scaled)
    X_test_rfe = rfe.transform(X_test_scaled)

    # Fit the model on the transformed training data
    model.fit(X_train_rfe, y_train)

    # Make predictions on the transformed test data
    pred_test_OLS = model.predict(X_test_rfe)

    # Evaluate the model on the test data
    rmse, r2 = metrics_reg(y_test, pred_test_OLS)

    # Add evaluation metrics to the provided metrics DataFrame
    metrics_df.loc[1] = ['Multiple Regression', rmse, r2]

    return metrics_df, selected_features, model




In [None]:
# Fit the RFE + linear regression model; keep the fitted model and the
# names of the selected features alongside the updated metrics table
metrics_df, selected_features, model = multiple_regression(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df)
# Display the table
metrics_df

In [None]:
def LassoLars_model(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df):
    """
    Performs LassoLars regression and evaluates the model's performance.

    Hyperparameters are tuned with a 5-fold cross-validated grid search on
    the training data (scored by negative mean squared error); the best
    estimator is then scored on the test data.

    Parameters:
        X_train_scaled (pandas DataFrame): The scaled feature variables of the training data.
        X_test_scaled (pandas DataFrame): The scaled feature variables of the test data.
        y_train (pandas Series): The target variable for the training data.
        y_test (pandas Series): The target variable for the test data.
        metrics_df (pandas DataFrame): A DataFrame to store the evaluation metrics.
            It is modified in place: the result is written to row label 2.

    Returns:
        pandas DataFrame: The updated metrics DataFrame with the evaluation metrics of the LassoLars model.
    """
    # Hyperparameter grid
    param_grid = {
        'alpha': [0,0.1,0.25,0.5,0.75, 1],  # Example hyperparameter values to search through
    }

    # `normalize` no longer exists on LassoLars in newer scikit-learn
    # releases, where passing it raises TypeError. Search over it only when
    # the installed estimator still exposes the parameter; otherwise rely on
    # the inputs already being scaled by the caller.
    if 'normalize' in LassoLars().get_params():
        model = LassoLars(normalize=False)
        param_grid['normalize'] = [True, False]
    else:
        model = LassoLars()

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train_scaled, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Make predictions on the test data using the best model
    pred_test_lars = best_model.predict(X_test_scaled)

    # Evaluate the model on the test data
    rmse, r2 = metrics_reg(y_test, pred_test_lars)

    # Add evaluation metrics to the provided metrics DataFrame
    metrics_df.loc[2] = ['LassoLars', rmse, r2]

    return metrics_df

In [None]:
# Tune and score LassoLars; the call writes its row into metrics_df in
# place, and the returned table is displayed as the cell output
LassoLars_model(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df)

In [None]:
def Generalized_Linear_Model(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df):
    """
    Tune a TweedieRegressor by cross-validated grid search and score it on the test data.

    Parameters:
        X_train_scaled (pandas DataFrame): Scaled training features.
        X_test_scaled (pandas DataFrame): Scaled test features.
        y_train (pandas Series): Training target.
        y_test (pandas Series): Test target.
        metrics_df (pandas DataFrame): Comparison table; modified in place
            (the result is written to row label 4).

    Returns:
        pandas DataFrame: `metrics_df` with the 'Generalized Linear Model' row added.
    """
    # Candidate values for the regularisation strength and the Tweedie power
    search_space = {
        'alpha': [0, 0.5, 1],
        'power': [0, 1, 2],
    }

    # 5-fold grid search on the training data, scored by negative MSE
    tuner = GridSearchCV(
        estimator=TweedieRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    tuner.fit(X_train_scaled, y_train)

    # Score the refitted best estimator on the held-out test data
    test_predictions = tuner.best_estimator_.predict(X_test_scaled)
    rmse, r2 = metrics_reg(y_test, test_predictions)

    # Record the result in the shared comparison table
    metrics_df.loc[4] = ['Generalized Linear Model', rmse, r2]
    return metrics_df

In [None]:
# Tune and score the Tweedie GLM; the call writes its row into metrics_df
# in place, and the returned table is displayed as the cell output
Generalized_Linear_Model(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def polynomial_regression(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df):
    """
    Tune the degree of a polynomial-features + linear-regression pipeline and score it on the test data.

    Parameters:
        X_train_scaled (pandas DataFrame): Scaled training features.
        X_test_scaled (pandas DataFrame): Scaled test features.
        y_train (pandas Series): Training target.
        y_test (pandas Series): Test target.
        metrics_df (pandas DataFrame): Comparison table; modified in place
            (the result is written to row label 3).

    Returns:
        pandas DataFrame: `metrics_df` with the 'Polynomial Regression(PR)' row added.
    """
    # Polynomial expansion followed by ordinary least squares
    poly_pipeline = Pipeline(steps=[
        ('polynomialfeatures', PolynomialFeatures()),
        ('linearregression', LinearRegression()),
    ])

    # Degrees 1 through 5 are tried for the expansion step
    degree_grid = {'polynomialfeatures__degree': list(range(1, 6))}

    # 5-fold grid search on the training data, scored by negative MSE
    tuner = GridSearchCV(
        estimator=poly_pipeline,
        param_grid=degree_grid,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    tuner.fit(X_train_scaled, y_train)

    # Score the refitted best pipeline on the held-out test data
    test_predictions = tuner.best_estimator_.predict(X_test_scaled)
    rmse, r2 = metrics_reg(y_test, test_predictions)

    # Record the result in the shared comparison table
    metrics_df.loc[3] = ['Polynomial Regression(PR)', rmse, r2]
    return metrics_df


In [None]:
# Tune and score the polynomial pipeline; the call writes its row into
# metrics_df in place, and the returned table is displayed as the cell output
polynomial_regression(X_train_scaled, X_test_scaled, y_train, y_test, metrics_df)

# Make the prediction

In [None]:
# Predict with the fitted multiple-regression model returned by the
# multiple_regression(...) call above. (`best_model` only ever exists as a
# local variable inside the tuning functions, so it is not available here
# on a fresh kernel.)
# That model was fitted on the RFE-selected columns only, so the scaled
# prediction data is reduced to the same columns, in the same order, and
# passed as a plain array like the data the model was fitted on.
pred_value = model.predict(starbucks_revenue_prediction_scaled[selected_features].values)

# Print the predicted value
print(pred_value)


# Summary 

Last quarter (Q1 2023) Ford's revenue was 41.474B.
- The goal of this project was to predict whether or not Ford's revenue would increase or decrease in the next quarter.
    - Our team wanted to see if economic, socio-economic and environmental factors could predict revenue gains and losses. 
- We first brainstormed many potential features.
    - We acquired data from over 40 separate data sets and combined them into one CSV for exploration.
    - We discovered many interesting relationships between our various independent variables.
- Through extensive feature selection and meticulous statistical testing, we selected 15 features to send into our regression models.
    - Key factors in making our models produce results that beat baseline RMSE and produced a respectable $R^2$ value
        - Lagging revenue back 1 period
            - Doing this trained the model to use the previous quarter's data to predict the next quarter's revenue
            - We also took the 2023 2nd quarter data minus Ford's revenue and set it aside 
                - Timing is everything... Ford will release 2023 Q2 revenue on 27 July. We will be able to compare our prediction to that number. 
        - Adjusting our monetary features for inflation. 
            - We used the Consumer Price Index to perform deflation
        - Used grid search to optimize our hyperparameters in concert with recursive feature elimination
- We compared the results of 4 models
| model                       | rmse      | r2              |
|-----------------------------|-----------|-----------------|
| Baseline                    | 9.458673  | -2.741374e-07   |
| Multiple Regression         | 5.331748  | 7.996339e-01    |
| LassoLars                   | 6.205169  | 7.286111e-01    |
| Generalized Linear Model    | 5.704175  | 7.706649e-01    |
| Polynomial Regression(PR)   | 5.331748  | 7.996339e-01    |


- Polynomial Regression performed similarly to Multiple linear Regression because the best performing hyperparameter was degree = 1. 

- Once we selected the best model, we ran it on the set-aside Q2 features. 
    - Our prediction is 39.99B in revenue. We are not predicting the actual number but using this figure to compare to last quarter and predict a slight decrease in revenue for Q2.


# Conclusion

# Next Steps