In [3]:
import pandas as pd
import numpy as np

class DataProcessing:
    """
    This class performs basic preprocessing steps, like columns arrangement and renaming as required in the algorithm.

    Attributes:
        csv_path (str): Path of the CSV file.
    """
    def __init__(self, csv_path):
        self.csv_path = csv_path

    def process_data(self):
        """
        Arranging the received csv file into the format best suited for a specific algorithm.

        Returns:
            Preprocessed dataframe.
        """
        # Read the CSV file into a DataFrame
        data = pd.read_csv(self.csv_path)
        
        # Reorder columns and rename them
        data = data[['visitorEmail', 'ad_id', 'clickedOrNot']]
        
        # Replace 'Clicked' and 'Not Clicked' with 1 and 0, respectively
        data['clickedOrNot'] = data['clickedOrNot'].replace({'Clicked': 1, 'Not Clicked': 0})
        
        # Handle non-finite values (NaN or inf) by replacing them with a default value (0)
        data['clickedOrNot'] = data['clickedOrNot'].replace([np.inf, -np.inf, np.nan], 0)
        
        # Convert the column to integers
        data['clickedOrNot'] = data['clickedOrNot'].astype(int)
        
        data = pd.DataFrame(data)
        print(data.head())
        print("Data processed successfully.")
      
        return data

# Example usage:
csv_file_path = "advertisement_final.csv"


# Create an instance of DataProcessing to preprocess the data
data_processor = DataProcessing(csv_file_path)
processed_data = data_processor.process_data()


             visitorEmail                     ad_id  clickedOrNot
0      promod@yopmail.com  64d4e9ff3709c058de9809ec             1
1  kalipopup1@yopmail.com  64d4e66e3709c058de9809cb             0
2  kalipopup1@yopmail.com  64d9af723709c058de980f8d             0
3  kalipopup1@yopmail.com  64d9af9d3709c058de980f94             0
4  kalipopup1@yopmail.com  64d9af4d3709c058de980f86             0
Data processed successfully.


In [4]:

from sklearn.model_selection import train_test_split
from Build_Pipeline.data_preprocessing import DataProcessing

class DataSplitter:
    @staticmethod
    def split_data(csv_file_path, test_size=0.2, random_state=None):
        """
        Splits the dataset into training and testing sets.

        Parameters:
            csv_file_path (str): Path to the CSV file containing the dataset.
            test_size (float): The proportion of the dataset to include in the test split.
            random_state (int): Controls the shuffling applied to the data before splitting.

        Returns:
            train_set (DataFrame): The training set.
            test_set (DataFrame): The testing set.
        """
        # Process the CSV file to get a DataFrame
        data_processor = DataProcessing(csv_file_path)
        data = data_processor.process_data()
        
        print("Before calling split_data method")
 
        # Select features (X) and target variable (y)
        X = data[['visitorEmail', 'ad_id']]  # Features
        y = data['clickedOrNot']  # Target variable

        # Split dataset into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        print("After calling split_data method")

        # Concatenate features and target variable to create train_set and test_set
        train_set = pd.concat([X_train, y_train], axis=1)
        test_set = pd.concat([X_test, y_test], axis=1)

        print("Data split successful.")  # Print statement to confirm data split
        
        return train_set, test_set

    
        

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from Build_Pipeline.data_preprocessing import DataProcessing

# Path to the CSV file containing the dataset
csv_file_path = "advertisement_final.csv"

# Process the CSV file to get a DataFrame
data_processor = DataProcessing(csv_file_path)
data = data_processor.process_data()

# Select features (X) and target variable (y)
X = data[['visitorEmail', 'ad_id']]  # Features
y = data['clickedOrNot']  # Target variable

# Split dataset into training set and test set
test_size = 0.2
random_state = 42  # Set to None if you don't want to use a random seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Concatenate features and target variable to create train_set and test_set
train_set = pd.concat([X_train, y_train], axis=1)
test_set = pd.concat([X_test, y_test], axis=1)

print("Data split successful.")  # Print statement to confirm data split


             visitorEmail                     ad_id  clickedOrNot
0      promod@yopmail.com  64d4e9ff3709c058de9809ec             1
1  kalipopup1@yopmail.com  64d4e66e3709c058de9809cb             0
2  kalipopup1@yopmail.com  64d9af723709c058de980f8d             0
3  kalipopup1@yopmail.com  64d9af9d3709c058de980f94             0
4  kalipopup1@yopmail.com  64d9af4d3709c058de980f86             0
Data processed successfully.
Data split successful.


In [27]:
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k
from Build_Pipeline.data_splitting import train_set, test_set
import pandas as pd


# Define model parameters
num_factors = 3
num_epochs = 150
learning_rate = 0.05
loss = 'warp'
item_alpha = 0.0001
user_alpha = 0.0001

# Initialize the LightFM model
model = LightFM(no_components=num_factors, loss=loss, learning_rate=learning_rate, item_alpha=item_alpha, user_alpha=user_alpha)

# Create a Dataset object
dataset = Dataset()

# Fit the dataset on the train_set DataFrame to create the user and item indices
dataset.fit((user for user in train_set['visitorEmail']),
            (item for item in train_set['ad_id']))

# Build the interaction matrix
(interactions, weights) = dataset.build_interactions(((row['visitorEmail'], row['ad_id']) for index, row in train_set.iterrows()))

# Train the model
model.fit(interactions, epochs=num_epochs)
print("Model trained successfully.")

k = 10
# Filter test set to include only user IDs and item IDs present in the training set
filtered_test_set = test_set[(test_set['visitorEmail'].isin(train_set['visitorEmail'])) & (test_set['ad_id'].isin(train_set['ad_id']))]

# Convert the filtered test set to interactions
test_interactions, _ = dataset.build_interactions(((row['visitorEmail'], row['ad_id']) for index, row in filtered_test_set.iterrows()))

# Calculate precision at k
precision = precision_at_k(model, test_interactions, k=k).mean()

# Calculate recall at k
recall = recall_at_k(model, test_interactions, k=k).mean()

print(f"Precision at {k}: {precision}")
print(f"Recall at {k}: {recall}")



Model trained successfully.
Precision at 10: 0.13333334028720856
Recall at 10: 0.4107188743497779
