<a href="https://colab.research.google.com/github/RavinduP/Product_Success/blob/Recommendation_System/DSGP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Step 1: Read the raw dataset into a DataFrame
data_path = 'data.csv'  # Relative path to the CSV; change it if the file lives elsewhere
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Step 2: Clean numeric columns
def clean_numeric_column(column):
    """Convert a text-formatted numeric column into numbers.

    Occurrences of ',' and '%' are stripped, 'N/A' is replaced with None, and
    the result is parsed with ``pd.to_numeric``; anything that still cannot be
    parsed is coerced to NaN.
    """
    substitutions = {',': '', '%': '', 'N/A': None}
    stripped = column.replace(substitutions, regex=True)
    return pd.to_numeric(stripped, errors='coerce')

numeric_columns = [
    "Sales Revenue - Physical",
    "Online visitors",
    "Page Views",
    "Sales Revenue - Online",
    "Conversion",  # Includes percentage values
    "Reach",
    "Impressions",
    "Clicks",
    "Influencer Campaign Reach",
    "Ad Budget",
    "Discount ",  # Includes percentage values; the trailing space is part of the column name
]

# Keep only the columns that actually exist in the loaded data. The cleaning,
# imputation and scaling steps below all index with this list, so filtering it
# once prevents a KeyError when an expected column is missing.
numeric_columns = [col for col in numeric_columns if col in data.columns]

for col in numeric_columns:
    data[col] = clean_numeric_column(data[col])

# Step 3: Handle missing values
# Fill missing numeric values with the column mean.
# NOTE(review): the means (and the scaler below) are computed on the full
# dataset, before the train/test splits done in later cells.
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Drop rows with missing values in critical categorical columns (if any).
# Reassigning instead of using inplace=True keeps the step explicit.
categorical_columns = ["Month", "Ad Type", "Campaign", "Campaign Type"]
data = data.dropna(subset=categorical_columns)

# Step 4: Encode categorical columns
# drop="first" removes the first category of every feature, so a column with a
# single category produces no encoded column at all.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_cats = encoder.fit_transform(data[categorical_columns])
encoded_cat_columns = encoder.get_feature_names_out(categorical_columns)

# Combine encoded columns with the remaining original data. The encoded frame
# reuses data's index so rows stay aligned after the dropna above.
encoded_frame = pd.DataFrame(encoded_cats, columns=encoded_cat_columns, index=data.index)
data = pd.concat([data.drop(categorical_columns, axis=1), encoded_frame], axis=1)

# Step 5: Feature scaling (min-max over the cleaned numeric columns)
scaler = MinMaxScaler()
scaled_columns = numeric_columns
if scaled_columns:  # the scaler cannot be fitted on zero columns
    data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Display preprocessed data
data.head()

Unnamed: 0,Year,Sales Revenue - Physical,Sales Quatity - Physical,Online visitors,Page Views,Sales Revenue - Online,Sales Quantity - Online,Conversion,Reach,Impressions,...,Ad Budget,Discount,Month_January,Month_June,Month_May,Month_November,Month_October,Month_September,"Ad Type_Daraz, Facebook, Instagram, Influencer",Campaign Type_Mega
1,2022.0,0.763552,636.0,0.050861,0.143058,0.692264,379.0,1.0,1.0,1.0,...,0.737031,0.6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2022.0,0.433186,596.0,0.0,0.0,0.556793,75.0,0.086093,0.822106,0.542682,...,0.718083,0.6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,2022.0,0.867329,967.0,0.162082,0.191197,0.200662,288.0,0.364238,0.234138,0.316993,...,0.587392,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,2022.0,0.04074,334.0,0.128863,0.156988,0.201369,216.0,0.274834,0.303457,0.383908,...,0.682149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,2022.0,1.0,678.0,0.423313,0.662093,1.0,639.0,0.665563,0.203765,0.238329,...,0.71258,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# Show the column names of the preprocessed frame (the original categorical
# columns have been replaced by their one-hot encoded columns).
print(data.columns)

Index(['Year', 'Sales Revenue - Physical', 'Sales Quatity - Physical',
       'Online visitors', 'Page Views', 'Sales Revenue - Online',
       'Sales Quantity - Online', 'Conversion', 'Reach', 'Impressions',
       'Clicks', 'Influencer Campaign Reach', 'Ad Budget', 'Discount ',
       'Month_January', 'Month_June', 'Month_May', 'Month_November',
       'Month_October', 'Month_September',
       'Ad Type_Daraz, Facebook, Instagram, Influencer', 'Campaign Type_Mega'],
      dtype='object')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate features (X) and target variables (y)
X = data.drop(["Campaign Type_Mega", "Discount "], axis=1)  # Features
y1 = data["Campaign Type_Mega"]  # Target for campaign type
y2 = data["Discount "]  # Target for discount

# Encode the target variable for campaign type
campaign_type_encoder = LabelEncoder()
y1_encoded = campaign_type_encoder.fit_transform(y1)

# Split features and both targets in a single call so that every train/test
# array is guaranteed to refer to the same rows (previously this relied on two
# separate calls sharing the same random_state).
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1_encoded, y2, test_size=0.2, random_state=42
)

# Model for Campaign type prediction
campaign_model = RandomForestClassifier(random_state=42)
campaign_model.fit(X_train, y1_train)

# Evaluate the classifier. zero_division=0 reports 0 for metrics that are
# undefined because a class is absent from the test labels or predictions,
# instead of emitting a warning for each one.
y1_pred = campaign_model.predict(X_test)
print(classification_report(y1_test, y1_pred, zero_division=0))

# Model for Discount Prediction
discount_model = LinearRegression()
discount_model.fit(X_train, y2_train)

# Evaluate the regressor
y2_pred = discount_model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y2_test, y2_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67         2

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2

Mean Squared Error: 0.5825543307744645


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder

# Separate features (X) and target variables (y)
X = data.drop(["Campaign Type_Mega", "Discount ", "Ad Type_Daraz, Facebook, Instagram, Influencer"], axis=1)  # Features
y1 = data["Campaign Type_Mega"]  # Target for campaign type
y2 = data["Discount "]  # Target for discount
y3 = data["Ad Type_Daraz, Facebook, Instagram, Influencer"]  # Target for ad type

# Encode the classification targets
campaign_type_encoder = LabelEncoder()
y1_encoded = campaign_type_encoder.fit_transform(y1)

ad_type_encoder = LabelEncoder()
y3_encoded = ad_type_encoder.fit_transform(y3)

# --- Train/test splits -------------------------------------------------------
# Every target gets its own split stored under its own names. Sharing a single
# X_train/X_test pair across differently-parameterised splits paired features
# with labels from another split and caused
# "Found input variables with inconsistent numbers of samples".

# Stratified split for campaign type prediction
X_train_campaign, X_test_campaign, y1_train, y1_test = train_test_split(
    X, y1_encoded, test_size=0.3, random_state=42, stratify=y1_encoded
)

# Regular split for discount prediction
X_train_discount, X_test_discount, y2_train, y2_test = train_test_split(
    X, y2, test_size=0.2, random_state=42
)

# Stratified split for ad type prediction
X_train_ad, X_test_ad, y3_train, y3_test = train_test_split(
    X, y3_encoded, test_size=0.3, random_state=42, stratify=y3_encoded
)

# Oversample the campaign-type training rows only; the test rows stay untouched.
smote = SMOTE(random_state=42, k_neighbors=2)
X_train_resampled, y1_train_resampled = smote.fit_resample(X_train_campaign, y1_train)

# --- Model for Campaign type prediction --------------------------------------
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search on the oversampled training data.
# NOTE(review): resampling happens before the grid search's internal CV, so
# synthetic rows can land in validation folds and the search scores may be
# optimistic.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
grid_search.fit(X_train_resampled, y1_train_resampled)

# Keep the tuned model. With the default refit=True it is already refitted on
# the whole training set, so it must not be replaced by a fresh default forest.
campaign_model = grid_search.best_estimator_
print("Best parameters for Campaign Model:", grid_search.best_params_)

# Evaluate the model for campaign type
y1_pred = campaign_model.predict(X_test_campaign)
print("Campaign Type Prediction Report:")
print(classification_report(y1_test, y1_pred, zero_division=0))

# Define stratified K-fold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation for campaign type (on the original, non-resampled data)
cv_scores = cross_val_score(campaign_model, X, y1_encoded, cv=stratified_kfold)
print("Stratified Cross-validation scores for Campaign Type:", cv_scores)
print("Mean Stratified CV score for Campaign Type:", cv_scores.mean())

# --- Model for Discount Prediction -------------------------------------------
# Train a regressor for discount on the split that matches y2_train
discount_model = LinearRegression()
discount_model.fit(X_train_discount, y2_train)

# Evaluate the model for discount
y2_pred = discount_model.predict(X_test_discount)
print("Mean Squared Error for Discount:", mean_squared_error(y2_test, y2_pred))

# Cross-validation for discount prediction (scores are negated MSE, so flip the sign)
cv_scores = cross_val_score(discount_model, X, y2, cv=5, scoring='neg_mean_squared_error')
print("Cross-validation scores for Discount:", -cv_scores)
print("Mean cross-validation score for Discount:", -cv_scores.mean())

# --- Model for Ad Type Prediction --------------------------------------------
# Train a classifier for ad type
ad_type_model = RandomForestClassifier(random_state=42)
ad_type_model.fit(X_train_ad, y3_train)

# Evaluate the model for ad type
y3_pred = ad_type_model.predict(X_test_ad)
print("Ad Type Prediction Report:")
print(classification_report(y3_test, y3_pred, zero_division=0))

# Cross-validation for ad type
cv_scores = cross_val_score(ad_type_model, X, y3_encoded, cv=5)
print("Cross-validation scores for Ad Type:", cv_scores)
print("Mean cross-validation score for Ad Type:", cv_scores.mean())


Best parameters for Campaign Model: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Campaign Type Prediction Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3





Stratified Cross-validation scores for Campaign Type: [0.5 0.5 0.5 1.  0.5]
Mean Stratified CV score for for Campaign Type: 0.6


ValueError: Found input variables with inconsistent numbers of samples: [7, 8]