<center><h1>Problem Statement 4</h1></center>

In [61]:
import numpy as np
import pandas as pd

## 1. Data Wrangling

In [62]:
df = pd.read_csv('random_sponsorship_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sponsor ID             4000 non-null   int64  
 1   Sponsor Name           4000 non-null   object 
 2   Industry               4000 non-null   object 
 3   Company Size           4000 non-null   object 
 4   Package Type           4000 non-null   object 
 5   Sponsorship Amount     4000 non-null   int64  
 6   Duration (months)      4000 non-null   int64  
 7   Marketing Activities   4000 non-null   object 
 8   Event ID               4000 non-null   int64  
 9   Event Name             4000 non-null   object 
 10  Event Type             4000 non-null   object 
 11  Event Date             4000 non-null   object 
 12  Event Month            4000 non-null   int64  
 13  Attendee Demographics  4000 non-null   object 
 14  ROI Metrics            4000 non-null   float64
 15  Feed

In [63]:
df.head()

Unnamed: 0,Sponsor ID,Sponsor Name,Industry,Company Size,Package Type,Sponsorship Amount,Duration (months),Marketing Activities,Event ID,Event Name,Event Type,Event Date,Event Month,Attendee Demographics,ROI Metrics,Feedback
0,1,Sponsor 1,Retail,Small,Co-Sponsor,46317,11,Social media mentions,101,Event 101,Conference,2025-03-01,3,Tech enthusiasts,4.6,Neutral
1,2,Sponsor 2,Retail,Large,Title Sponsor,62925,3,Booth space,102,Event 102,Exhibition,2025-03-17,3,IT professionals,4.3,Neutral
2,3,Sponsor 3,Wellness,Small,Title Sponsor,41705,9,Logo placement,103,Event 103,Conference,2025-01-10,1,IT professionals,3.2,Neutral
3,4,Sponsor 4,Healthcare,Small,Associate Sponsor,13480,5,Logo placement,104,Event 104,Exhibition,2024-12-09,12,Finance professionals,4.6,Positive
4,5,Sponsor 5,Finance,Medium,Associate Sponsor,43839,2,Logo placement,105,Event 105,Seminar,2025-02-12,2,IT professionals,3.9,Positive


Dropping unnecessary columns

In [64]:
df.drop(['Sponsor ID', 'Sponsor Name', 'Event ID', 'Event Name', 'Event Date'], axis=1, inplace=True)

In [65]:
df.head()

Unnamed: 0,Industry,Company Size,Package Type,Sponsorship Amount,Duration (months),Marketing Activities,Event Type,Event Month,Attendee Demographics,ROI Metrics,Feedback
0,Retail,Small,Co-Sponsor,46317,11,Social media mentions,Conference,3,Tech enthusiasts,4.6,Neutral
1,Retail,Large,Title Sponsor,62925,3,Booth space,Exhibition,3,IT professionals,4.3,Neutral
2,Wellness,Small,Title Sponsor,41705,9,Logo placement,Conference,1,IT professionals,3.2,Neutral
3,Healthcare,Small,Associate Sponsor,13480,5,Logo placement,Exhibition,12,Finance professionals,4.6,Positive
4,Finance,Medium,Associate Sponsor,43839,2,Logo placement,Seminar,2,IT professionals,3.9,Positive


In [66]:
df.columns

Index(['Industry', 'Company Size', 'Package Type', 'Sponsorship Amount',
       'Duration (months)', 'Marketing Activities', 'Event Type',
       'Event Month', 'Attendee Demographics', 'ROI Metrics', 'Feedback'],
      dtype='object')

In [67]:
df['Industry'].unique()

array(['Retail', 'Wellness', 'Healthcare', 'Finance', 'Automotive',
       'Technology'], dtype=object)

In [68]:
df['Marketing Activities'].unique()

array(['Social media mentions', 'Booth space', 'Logo placement',
       'Promotional materials'], dtype=object)

Label encoding our columns

In [69]:
# Label encode every column that is of dtype object
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])
        print(le.classes_)

df.info()

['Automotive' 'Finance' 'Healthcare' 'Retail' 'Technology' 'Wellness']
['Large' 'Medium' 'Small']
['Associate Sponsor' 'Co-Sponsor' 'Title Sponsor']
['Booth space' 'Logo placement' 'Promotional materials'
 'Social media mentions']
['Conference' 'Exhibition' 'Seminar' 'Workshop']
['Finance professionals' 'General public' 'Healthcare providers'
 'IT professionals' 'Tech enthusiasts']
['Negative' 'Neutral' 'Positive']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Industry               4000 non-null   int32  
 1   Company Size           4000 non-null   int32  
 2   Package Type           4000 non-null   int32  
 3   Sponsorship Amount     4000 non-null   int64  
 4   Duration (months)      4000 non-null   int64  
 5   Marketing Activities   4000 non-null   int32  
 6   Event Type             4000 non-null   int32  
 7   E

Scaling and splitting the dataset

In [70]:
X = df.drop('ROI Metrics', axis=1)
y = df['ROI Metrics']

In [71]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [72]:
from sklearn.model_selection import train_test_split

X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3, random_state=42)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.5, random_state=42)

## 2. Modelling

### 2.1. Linear Regression

In [73]:
# Train a linear regression model
from sklearn.linear_model import LinearRegression

np.random.seed(40)

lr = LinearRegression()
lr.fit(X_train, y_train)

In [74]:
# check the performance of the model on cv set
from sklearn.metrics import mean_squared_error

y_cv_pred = lr.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8035384011977552


### 2.2. Random Forest Regressor

In [75]:
# random forest regressor
from sklearn.ensemble import RandomForestRegressor

np.random.seed(40)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [76]:
# check the performance of the model on cv set
y_cv_pred = rf.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8391049033333333


### 2.3. Gradient Boosting Regressor

In [77]:
# gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor

np.random.seed(40)

gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

In [78]:
# check the performance of the model on cv set
y_cv_pred = gb.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8114948023605183


### 2.4. Cat Boost Regressor

In [79]:
# catboost regressor
from catboost import CatBoostRegressor

np.random.seed(40)

cb = CatBoostRegressor(iterations=100, random_state=42, verbose=False)
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x2526dbf0510>

In [80]:
# check the performance of the model on cv set
y_cv_pred = cb.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8913227102463903


### 2.5. Basic Neural Network

In [81]:
# Create NN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

np.random.seed(40)

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                704       
                                                                 
 dense_7 (Dense)             (None, 64)                4160      
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4929 (19.25 KB)
Trainable params: 4929 (19.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [82]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_cv, y_cv))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x25271119990>

In [83]:
# check the performance of the model on cv set
y_cv_pred = model.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.9187186617037632


In [84]:
# test lr model on test set
y_test_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.7310859824624896


Hyperparameter tuning

In [85]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

params = {
    'alpha': [0.1, 1, 10, 100]
}

ridge = Ridge()
grid = GridSearchCV(ridge, param_grid=params, cv=5)
grid.fit(X_train, y_train)

In [86]:
# check the performance of the model on cv set
y_cv_pred = grid.predict(X_cv)
mse = mean_squared_error(y_cv, y_cv_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.8031250373215878


In [87]:
# get best params
grid.best_params_

{'alpha': 100}

In [88]:
# test ridge model on test set
y_test_pred = grid.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.7309905157485713


In [89]:
# Create final model
ridge = Ridge(alpha=100)
ridge.fit(X_train, y_train)

The reason we chose linear regression model again is the same reason as in problem statement 3 - our synthetic dataset when plotted forms a rectangle.

### Saving Model and Importing Outputs

In [90]:
# save the model
import joblib

joblib.dump(ridge, 'model.pkl')

['model.pkl']

In [91]:
# save the dataset
df.to_csv('cleaned_data.csv', index=False)

In [92]:
# predict on one test data
test_data = X_test[0].reshape(1, -1)    
y_pred = ridge.predict(test_data)
print('Predicted ROI Metrics:', y_pred[0])
print('Actual ROI Metrics:', y_test.iloc[0])

Predicted ROI Metrics: 3.543125724325859
Actual ROI Metrics: 4.9


In [93]:
# save the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']