#### Importing Required libraries

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('acq-data.csv')
df.head()

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",126.62
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",59.86
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",84.16
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
4,Breakfast Foods,Frozen Foods,Food,4.08,1.428,3.0,Double Down Sale,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,Radio,50.79


In [3]:
df.columns

Index(['food_category', 'food_department', 'food_family',
       'store_sales(in millions)', 'store_cost(in millions)',
       'unit_sales(in millions)', 'promotion_name', 'sales_country',
       'marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at home(approx)',
       'avg. yearly_income', 'num_children_at_home',
       'avg_cars_at home(approx).1', 'brand_name', 'SRP', 'gross_weight',
       'net_weight', 'recyclable_package', 'low_fat', 'units_per_case',
       'store_type', 'store_city', 'store_state', 'store_sqft', 'grocery_sqft',
       'frozen_sqft', 'meat_sqft', 'coffee_bar', 'video_store', 'salad_bar',
       'prepared_food', 'florist', 'media_type', 'cost'],
      dtype='object')

This data set doesn't contains missing values and duplicates, This data is already in clean format, To solve the problem further the only thing that required is to handle categorical data for processing. For data set click [Here](https://www.kaggle.com/code/mayurspawar/customer-acquisition-prediction/input).

##### Handling Categorical Data and Applying Multiple Regression Model.
1. Linear Regression
2. Ridge Regression
3. Lasso Regression 
4. Support Vector Regression (SVR) 
5. Decision Tree Regression
6. Random Forest Regression
7. Gradient Boosting Regression 
8. K-Nearest Neighbors (KNN) Regression 
9. Polynomial Regression

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
X = df.drop('cost', axis=1, inplace = False)
y = df['cost']

In [6]:
X.shape, y.shape

((60428, 39), (60428,))

In [7]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Categorical Encoding
# Colllecting all categorical columns
cat_cols=[cols for cols in df.columns if df[cols].dtypes=="O"]
len(cat_cols)

17

In [9]:
# Feature Scaling
# Collecting numerical columns in a list
num_cols=[cols for cols in df.columns if df[cols].dtypes!='O']
len(num_cols)

23

In [None]:
# Creating a column transformer to apply different preprocessing to categorical and non-categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('Categorical', OrdinalEncoder(), cat_cols)
    ])

In [11]:
# Defining a list of regression models to include in the pipeline
regressor_list = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0)),
    ('Lasso Regression', Lasso(alpha=1.0)),
    ('SVR', SVR(kernel='linear')),
    ('Decision Tree Regression', DecisionTreeRegressor()),
    ('Random Forest Regression', RandomForestRegressor(n_estimators=30)),
    ('Gradient Boosting Regression', GradientBoostingRegressor(n_estimators=30)),
    ('KNN Regression', KNeighborsRegressor(n_neighbors=3)),
    ('Polynomial Regression', Pipeline([
        ('poly_features', PolynomialFeatures(degree=2)),
        ('linear_regression', LinearRegression())
    ]))
]

In [12]:
# Creating a pipeline that combines preprocessing and regression
pipelines = []
for model_name, regressor in regressor_list:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])
    pipelines.append((model_name, pipeline))


In [None]:
# Iterate through the list of pipelines, fit, and evaluate each model
for model_name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    prf_score = pipeline.score(X_test,y_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} - Mean Squared Error: {mse}")
    print(f"{model_name} - Score: {prf_score}")
    print('--------------------------------------------')

Linear Regression - Mean Squared Error: 890.1486889137399
Linear Regression - Score: 0.015505205644646303
--------------------------------------------
Ridge Regression - Mean Squared Error: 890.148691297422
Ridge Regression - Score: 0.015505203008319701
--------------------------------------------
Lasso Regression - Mean Squared Error: 894.9165320689885
Lasso Regression - Score: 0.01023202283248803
--------------------------------------------


By evaluating the Models performance, Random forest reggression and Decision tree regression performed well. Therefore i am going to dump Random Forest Model for future predictions.

In [10]:
ct=ColumnTransformer([
    ('Step1',OrdinalEncoder(),cat_cols)
])
# Pipeline Building
pipeline=Pipeline([
    ('Coltf_step',ct),
    ('Decision tree',RandomForestRegressor(n_estimators=30))
])

In [11]:
pipeline.fit(X_train,y_train)

In [12]:
# Train data accuracy
pipeline.score(X_train,y_train)

0.9998033427397917

In [13]:
# Test data accuracy
pipeline.score(X_test,y_test)

0.9986583961640639

In [14]:
# Mean square Error
y_pred = pipeline.predict(X_test)
mean_squared_error(y_test, y_pred)

1.2130352567096303

In [15]:
# Analyzing outputs
y_test[0:5]

59691    111.70
54554    145.60
20440     92.57
25809     68.97
14872     86.21
Name: cost, dtype: float64

In [16]:
y_pred[0:5]

array([111.7 , 145.6 ,  92.57,  68.97,  86.21])

##### Dumping Model for making predictions im future

In [None]:
import joblib

In [None]:
# Specify the filename where you want to save the pipeline
filename = 'CAP_model.joblib'

# Use joblib to dump the pipeline to the specified file
joblib.dump(pipeline, filename)

In [None]:
# Loading the pretrained model and making predictions
Model = joblib.load('CAP_model.joblib')

In [None]:
dump_pred = Model.predict(X_test)
mean_squared_error(y_test, y_pred)