In [46]:
# Load packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor

In [47]:
# Load data
DATA_PATH ="../data/raw/df1.pkl"
df = pd.read_pickle(DATA_PATH)


In [48]:
# let's have some scaling
standard_scaler = StandardScaler()
df = pd.DataFrame(standard_scaler.fit_transform(df), columns=df.columns)

In [49]:
# spseparate X and Y , 'Sales' is the target variable
X = df[['Digital', 'Social_Media', 'Influencer_Marketing', 'Authenticity']]
y = df['Sales']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [50]:
# We'll start with a simple Linear Regression model as a baseline to establish a basic understanding
model = LinearRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

score = model.score(X_test, y_test)

print(f"Model Score: {score:.2f}%")

# Calculate Mean Squared Error (MSE) as the evaluation metric
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)

Model Score: 0.79%
Mean Squared Error (MSE): 0.19461105296215594


## Oh, The model score of 0.79% indicates a very low accuracy

In [52]:
# let's try to use more models 
def MLmodels(X, y):
    
    # Define a list of models
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'Support Vector Machine': SVR(),
    }

    # Train and evaluate each model
    results = {'Model': [],'Score':[], 'Mean Squared Error': []}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = model.score(X_test, y_test)
        mse = mean_squared_error(y_test, y_pred)
        results['Model'].append(name)
        results['Score'].append(score)
        results['Mean Squared Error'].append(mse)

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    return results_df

results = MLmodels(X, y)
results


Unnamed: 0,Model,Score,Mean Squared Error
0,Linear Regression,0.785371,0.194611
1,Ridge Regression,0.785479,0.194512
2,Lasso Regression,-0.022377,0.927021
3,Random Forest,0.74124,0.234626
4,Gradient Boosting,0.756397,0.220883
5,Support Vector Machine,0.749589,0.227056


In [55]:
# let's use Ensemble Methods
linear_model = LinearRegression()
ridge_model = Ridge()
lasso_model = Lasso()
tree_model = DecisionTreeRegressor()
forest_model = RandomForestRegressor()
gb_model = GradientBoostingRegressor()
svr_model = SVR()

# Create a stacking ensemble with multiple models
estimators = [
    ('Linear Regression', linear_model),
    ('Ridge Regression', ridge_model),
    ('Lasso Regression', lasso_model),
    ('Decision Tree', tree_model),
    ('Random Forest', forest_model),
    ('Gradient Boosting', gb_model),
    ('Support Vector Machine', svr_model)
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Train and evaluate the stacking model
stacking_model.fit(X_train, y_train)
stacking_score = stacking_model.score(X_test, y_test)
stacking_score

0.7858358586655032

In [56]:
# Define individual models with hyperparameters
linear_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=1.0)
tree_model = DecisionTreeRegressor(max_depth=5)
forest_model = RandomForestRegressor(n_estimators=100, max_depth=10)
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5)
svr_model = SVR(C=1.0, epsilon=0.2)

# Choose a final estimator with hyperparameters
final_estimator = LinearRegression()

stacking_model = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

# Train and evaluate the stacking model
stacking_model.fit(X_train, y_train)
stacking_score = stacking_model.score(X_test, y_test)
stacking_score

0.7858776641465458

In [57]:
# Define individual models with hyperparameters to tune
linear_model = LinearRegression()
ridge_model = Ridge()
lasso_model = Lasso()
tree_model = DecisionTreeRegressor()
forest_model = RandomForestRegressor()
gb_model = GradientBoostingRegressor()
svr_model = SVR()

# Define hyperparameter grids for each model
param_grid = {
    'Linear Regression': {},
    'Ridge Regression': {'alpha': [0.1, 1.0, 10.0]},
    'Lasso Regression': {'alpha': [0.1, 1.0, 10.0]},
    'Decision Tree': {'max_depth': [None, 5, 10, 15]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10, 15]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'Support Vector Machine': {'C': [0.1, 1.0, 10.0], 'epsilon': [0.1, 0.2, 0.5]}
}


# Choose a final estimator with hyperparameters
final_estimator = LinearRegression()

# Create a dictionary to store best parameters for each model
best_params = {}

# Grid search for each model
for name, model in estimators:
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)
    best_params[name] = grid_search.best_params_

# Use best parameters to create models
estimators_with_params = [(name, model.set_params(**best_params[name])) for (name, model) in estimators]

stacking_model = StackingRegressor(estimators=estimators_with_params, final_estimator=final_estimator)

# Train and evaluate the stacking model
stacking_model.fit(X_train, y_train)
stacking_score = stacking_model.score(X_test, y_test)
print(stacking_score)

# Display best parameters
best_params



0.7888802758925013


{'Linear Regression': {},
 'Ridge Regression': {'alpha': 1.0},
 'Lasso Regression': {'alpha': 0.1},
 'Decision Tree': {'max_depth': 5},
 'Random Forest': {'max_depth': 5, 'n_estimators': 100},
 'Gradient Boosting': {'learning_rate': 0.1,
  'max_depth': 3,
  'n_estimators': 50},
 'Support Vector Machine': {'C': 1.0, 'epsilon': 0.2}}

## The accuracy is very low (0.79) because we generate this data and there are no strong relationships between them. We'll work on improving this later, or make changes to the features.