<a href="https://colab.research.google.com/github/RavinduP/Product_Success/blob/Recommendation_System/Models_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
#Load & Clean Data
import pandas as pd
import numpy as np

# Step 1: Load the dataset
data_path = 'data.csv'  # Adjust path as needed
data = pd.read_csv(data_path)

# Step 2: Clean numeric columns
def clean_numeric_column(column):
    # Remove ',' and '%' and convert to numeric
    return pd.to_numeric(column.replace({',': '', '%': '', 'N/A': None}, regex=True), errors='coerce')

numeric_columns = [
    "Sales Revenue - Physical",
    "Online visitors",
    "Page Views",
    "Sales Revenue - Online",
    "Conversion",  # Includes percentage values
    "Reach",
    "Impressions",
    "Clicks",
    "Influencer Campaign Reach",
    "Ad Budget",
    "Discount ",  # Includes percentage values
]

for col in numeric_columns:
    if col in data.columns:
        data[col] = clean_numeric_column(data[col])

# Step 3: Handle missing values
# Fill missing numeric values with mean
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

print("Initial shape:", data.shape)

# Drop rows with missing values in critical categorical columns
categorical_columns = ["Month", "Ad Type", "Campaign", "Campaign Type"]
data.dropna(subset=categorical_columns, inplace=True)
print("After dropping rows with missing critical categorical columns:", data.shape)

# Step 4: Clean 'Month' column (Ensure it's numeric)
data['Month'] = pd.to_numeric(data['Month'], errors='coerce')

# Add temporal features (Cyclic Encoding for Month)
data['Month_sin'] = np.sin(2 * np.pi * data['Month'] / 12)
data['Month_cos'] = np.cos(2 * np.pi * data['Month'] / 12)


Initial shape: (32, 18)
After dropping rows with missing critical categorical columns: (10, 18)


In [56]:
print("Discount " in data.columns)

True


In [57]:
print(data["Discount "].isna().sum())

0


In [58]:
print(data.shape)
print(data.head())


(10, 20)
     Year  Month  Sales Revenue - Physical  Sales Quatity - Physical  \
1  2022.0    NaN               11088040.83                     636.0   
2  2022.0    NaN                8652693.61                     596.0   
5  2022.0    NaN               11853049.67                     967.0   
6  2022.0    NaN                5759712.37                     334.0   
7  2022.0    NaN               12831055.37                     678.0   

   Online visitors  Page Views  Sales Revenue - Online  \
1          10408.0     28796.0               8759219.0   
2           8609.0     18496.0               7455614.0   
5          14342.0     32262.0               4028649.0   
6          13167.0     29799.0               4035447.0   
7          23582.0     66166.0              11720494.0   

   Sales Quantity - Online  Conversion                     Ad Type      Reach  \
1                    379.0        3.63  Daraz, Facebook, Instagram  1460995.0   
2                     75.0        0.87  Daraz, 

In [59]:
#Encode Categorical Features
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical columns
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_cats = encoder.fit_transform(data[categorical_columns])
encoded_cat_columns = encoder.get_feature_names_out(categorical_columns)

# Combine encoded columns with the original data
data = data.drop(categorical_columns, axis=1)
data[encoded_cat_columns] = encoded_cats


In [60]:
print(data.columns.tolist())

['Year', 'Sales Revenue - Physical', 'Sales Quatity - Physical', 'Online visitors', 'Page Views', 'Sales Revenue - Online', 'Sales Quantity - Online', 'Conversion', 'Reach', 'Impressions', 'Clicks', 'Influencer Campaign Reach', 'Ad Budget', 'Discount ', 'Month_sin', 'Month_cos', 'Ad Type_Daraz, Facebook, Instagram, Influencer', 'Campaign Type_Mega']


In [61]:
#Prepare Features and Targets

# Separate features and targets
X = data.drop(["Ad Type_Daraz, Facebook, Instagram, Influencer", "Campaign Type_Mega", "Discount "], axis=1, errors="ignore")
y_ad_type = data["Ad Type_Daraz, Facebook, Instagram, Influencer"]
y_campaign_type = data["Campaign Type_Mega"]
y_discount = data["Discount "]


In [62]:
#Train-Test Split

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

X_train = train_data.drop(["Ad Type_Daraz, Facebook, Instagram, Influencer", "Campaign Type_Mega", "Discount "], axis=1, errors="ignore")
y_ad_train = train_data["Ad Type_Daraz, Facebook, Instagram, Influencer"]
y_campaign_train = train_data["Campaign Type_Mega"]
y_discount_train = train_data["Discount "]

X_test = test_data.drop(["Ad Type_Daraz, Facebook, Instagram, Influencer", "Campaign Type_Mega", "Discount "], axis=1, errors="ignore")
y_ad_test = test_data["Ad Type_Daraz, Facebook, Instagram, Influencer"]
y_campaign_test = test_data["Campaign Type_Mega"]
y_discount_test = test_data["Discount "]


In [63]:
#To check whether data is available for training and testing
print(data['Year'].unique())
print(train_data.shape)
print(test_data.shape)


[2022. 2023.]
(8, 18)
(2, 18)


In [64]:
#Model Training and Evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train Random Forest for Campaign Type
campaign_model = RandomForestClassifier(random_state=42)
campaign_model.fit(X_train, y_campaign_train)

# Evaluate
y_campaign_pred = campaign_model.predict(X_test)
print("Campaign Type Prediction Report:")
print(classification_report(y_campaign_test, y_campaign_pred))


Campaign Type Prediction Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.50      0.67         2

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [65]:
# Train Random Forest for Ad Type
ad_model = RandomForestClassifier(random_state=42)
ad_model.fit(X_train, y_ad_train)

# Evaluate
y_ad_pred = ad_model.predict(X_test)
print("Ad Type Prediction Report:")
print(classification_report(y_ad_test, y_ad_pred))


Ad Type Prediction Report:
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67         1
         1.0       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
print(X_train.isnull().sum())


Year                         0
Sales Revenue - Physical     0
Sales Quatity - Physical     0
Online visitors              0
Page Views                   0
Sales Revenue - Online       0
Sales Quantity - Online      0
Conversion                   0
Reach                        0
Impressions                  0
Clicks                       0
Influencer Campaign Reach    0
Ad Budget                    0
Month_sin                    8
Month_cos                    8
dtype: int64


In [67]:
from sklearn.impute import SimpleImputer

# Fill NaN in X_train/X_test after recalculating
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Impute missing values using the mean
imputer = SimpleImputer(strategy='mean')  # Use 'median' or 'most_frequent' if more appropriate
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)


In [68]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Train Gradient Boosting for Discount
discount_model = GradientBoostingRegressor(random_state=42)
discount_model.fit(X_train, y_discount_train)

# Evaluate
y_discount_pred = discount_model.predict(X_test)
print("Discount Prediction Mean Squared Error:", mean_squared_error(y_discount_test, y_discount_pred))


Discount Prediction Mean Squared Error: 116.27889684498186


In [69]:
#Cross-Validation
from sklearn.model_selection import cross_val_score

# Cross-validation for Campaign Type
campaign_cv_scores = cross_val_score(campaign_model, X_train, y_campaign_train, cv=5)
print("Cross-validation scores for Campaign Type:", campaign_cv_scores)

# Cross-validation for Ad Type
ad_cv_scores = cross_val_score(ad_model, X_train, y_ad_train, cv=5)
print("Cross-validation scores for Ad Type:", ad_cv_scores)

# Cross-validation for Discount
discount_cv_scores = cross_val_score(discount_model, X_train, y_discount_train, cv=5, scoring='neg_mean_squared_error')
print("Cross-validation scores for Discount:", -discount_cv_scores)


ValueError: n_splits=5 cannot be greater than the number of members in each class.