In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Exploratory Data Analysis
---

In [None]:
# Load the dataset
file_path = '/path/to/your/dataset.csv'  # Update this with your actual file path
data = pd.read_csv(file_path)

# ----- Exploratory Data Analysis (EDA) -----

# Basic overview of the dataset.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print(data.describe())

# Missing values per column
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Distribution of 'Quantity' and 'Price', side by side
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(data['Quantity'], kde=True)
plt.title('Quantity Distribution')
plt.subplot(1, 2, 2)
sns.histplot(data['Price'], kde=True)
plt.title('Price Distribution')
plt.show()

# Unique customers and products
unique_customers = data['Customer ID'].nunique()
unique_products = data['StockCode'].nunique()
print(f"Number of unique customers: {unique_customers}")
print(f"Number of unique products: {unique_products}")

# Top selling products: the ten descriptions that appear on the most rows
top_products = data['Description'].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_products, y=top_products.index)
plt.title('Top 10 Selling Products')
plt.xlabel('Frequency')
plt.show()

# Purchase patterns across countries.
# An order can span many rows (one per product line), so orders are counted as
# distinct 'Invoice' values per country rather than as raw row counts.
country_order_counts = (
    data.groupby('Country')['Invoice']
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)
plt.figure(figsize=(10, 6))
sns.barplot(x=country_order_counts.values, y=country_order_counts.index)
plt.title('Top 10 Countries by Number of Orders')
plt.xlabel('Number of Orders')
plt.show()

## Collaborative Filtering Model

---

In [None]:
# Data Preprocessing for Collaborative Filtering
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
# Rows without a customer get the sentinel ID 0 so the column can be cast to int;
# the sentinel rows are removed by the validity filter below.
data['Customer ID'] = data['Customer ID'].fillna(0).astype(int)

# Flag invoices whose number contains 'C' (presumably cancellations - confirm against
# the dataset description). The column is cast to str first: when every invoice
# number is purely numeric pandas parses the column as integers, and the .str
# accessor raises AttributeError on non-string columns.
is_c_invoice = data['Invoice'].astype(str).str.contains('C', na=False)
is_valid_purchase = (data['Quantity'] > 0) & (data['Price'] > 0) & (data['Customer ID'] != 0)

# .copy() detaches the result from the original frame, so column assignments made
# when this cell is re-run act on an independent frame instead of a slice
# (which would trigger SettingWithCopyWarning).
data = data[~is_c_invoice & is_valid_purchase].copy()

# Customer x product matrix; each cell is the number of rows (purchase lines) in
# which that customer bought that product, 0 where there is none.
interaction_matrix = data.groupby(['Customer ID', 'StockCode']).size().unstack(fill_value=0)

# Function to calculate RMSE
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the non-zero entries of ``ground_truth``.

    Zero cells in ``ground_truth`` are treated as "no observation" and ignored;
    only positions where ``ground_truth`` is non-zero contribute to the error.

    Parameters
    ----------
    prediction : array-like
        Predicted values. It must be indexable at every position where
        ``ground_truth`` is non-zero; normally it has the same shape.
        Shapes are not otherwise checked.
    ground_truth : array-like
        Observed values, with 0 marking unobserved cells.

    Returns
    -------
    float
        ``sqrt(mean((prediction - ground_truth) ** 2))`` over the observed cells.

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero entry (the error would be undefined).
    IndexError
        If a non-zero position of ``ground_truth`` lies outside ``prediction``.
    """
    # asarray lets callers pass nested lists as well as ndarrays.
    prediction = np.asarray(prediction, dtype=float)
    ground_truth = np.asarray(ground_truth, dtype=float)

    observed = ground_truth.nonzero()
    if observed[0].size == 0:
        raise ValueError("rmse() requires at least one non-zero entry in ground_truth")

    errors = prediction[observed] - ground_truth[observed]
    # float() yields a plain Python float, which prints cleanly inside containers.
    return float(np.sqrt(np.mean(errors ** 2)))

# Convert the interaction matrix to a floating-point data type.
# NOTE: despite its name this is a dense ndarray, not a scipy.sparse matrix; the
# name is kept because a later cell passes it to cross_validate_cf.
interaction_matrix_sparse = interaction_matrix.values.astype(float)

# Split the *observed interactions* (non-zero cells) into train and test, not the
# customer rows. A row-wise split factorizes only the training customers, so the
# reconstruction has no rows for the test customers and comparing it with their
# rows pairs up unrelated customers. Holding out cells keeps every customer and
# product in the factorization and scores predictions for the hidden cells.
observed_rows, observed_cols = interaction_matrix_sparse.nonzero()
train_positions, test_positions = train_test_split(
    np.arange(observed_rows.size), test_size=0.2, random_state=42
)
held_out = (observed_rows[test_positions], observed_cols[test_positions])

# Training matrix: the full matrix with the held-out cells hidden (set to 0).
train_data = interaction_matrix_sparse.copy()
train_data[held_out] = 0.0
# Test matrix: same shape, containing only the held-out cells.
test_data = np.zeros_like(interaction_matrix_sparse)
test_data[held_out] = interaction_matrix_sparse[held_out]

# Perform matrix factorization using truncated SVD with 50 latent factors
u, s, vt = svds(train_data, k=50)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Evaluate the model: rmse() only scores the non-zero cells of its second argument,
# so the train score covers the kept cells and the test score the held-out ones.
train_rmse = rmse(X_pred, train_data)
test_rmse = rmse(X_pred, test_data)

print(f"Collaborative Filtering - Train RMSE: {train_rmse}, Test RMSE: {test_rmse}")

### Cross-Validation for Collaborative Filtering

---

In [None]:
# Cross Validation for Collaborative Filtering
def cross_validate_cf(data, num_factors_list, num_folds=5):
    """K-fold cross-validation of truncated-SVD collaborative filtering.

    The folds partition the *observed* (non-zero) cells of ``data``. For each
    fold the held-out cells are hidden (set to 0) in a copy of the matrix, the
    copy is factorized, and the reconstruction is scored on the hidden cells.
    Splitting cells rather than rows keeps every row and column in the
    factorization, so each prediction is compared with the value of the same
    row/column pair.

    Parameters
    ----------
    data : array-like, 2-D
        Interaction matrix (rows x columns) with 0 marking "no interaction".
    num_factors_list : iterable of int
        Numbers of singular values/vectors to evaluate. Each is passed to
        ``svds`` as ``k``.
    num_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    dict
        Maps each entry of ``num_factors_list`` to a list holding one RMSE per
        fold, in fold order.
    """
    data = np.asarray(data, dtype=float)
    observed_rows, observed_cols = data.nonzero()

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    rmse_scores = {factor: [] for factor in num_factors_list}

    # KFold only needs the number of samples; here a "sample" is one observed cell.
    for _, test_positions in kf.split(observed_rows):
        held_out = (observed_rows[test_positions], observed_cols[test_positions])

        train_data = data.copy()
        train_data[held_out] = 0.0
        test_data = np.zeros_like(data)
        test_data[held_out] = data[held_out]

        for factor in num_factors_list:
            u, s, vt = svds(train_data, k=factor)
            # (u * s) scales column j of u by s[j], i.e. u @ diag(s), without
            # materialising the diagonal matrix.
            X_pred = (u * s) @ vt
            score = rmse(X_pred, test_data)
            rmse_scores[factor].append(score)

    return rmse_scores


# Hyperparameter tuning: cross-validate several latent-factor counts and print
# the per-fold RMSE lists keyed by factor count.
num_factors_list = [20, 50, 100]
cv_scores = cross_validate_cf(
    data=interaction_matrix_sparse,
    num_factors_list=num_factors_list,
)
print(cv_scores)

## Content-Based Filtering

---

In [None]:
# Extracting product descriptions: one description per StockCode.
# Incomplete rows are removed BEFORE de-duplicating, so a product whose first row
# has no description keeps the description from one of its later rows instead of
# being dropped altogether.
product_descriptions = (
    data[['StockCode', 'Description']]
    .dropna()
    .drop_duplicates('StockCode')
    .set_index('StockCode')
)

# Split the product descriptions into training and testing sets
train_descriptions, test_descriptions = train_test_split(
    product_descriptions, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary on the training descriptions only
tfidf_train = TfidfVectorizer(stop_words='english')
tfidf_train_matrix = tfidf_train.fit_transform(train_descriptions['Description'])

# Pairwise similarity between all training products (dot product of TF-IDF rows)
cosine_sim_train = linear_kernel(tfidf_train_matrix, tfidf_train_matrix)

# Reverse map: StockCode -> row position in train_descriptions / cosine_sim_train
indices_train = pd.Series(range(len(train_descriptions)), index=train_descriptions.index)

# Function to get recommendations based on the training set similarity
def get_recommendations_train(product_id, cosine_sim=cosine_sim_train, top_n=10):
    """Return the training products most similar to ``product_id``.

    Parameters
    ----------
    product_id :
        A key of ``indices_train`` (an index label of ``train_descriptions``).
    cosine_sim : 2-D array, default ``cosine_sim_train``
        Pairwise similarity matrix whose row/column order matches
        ``train_descriptions``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``train_descriptions``, most similar first.
        The query product itself is never included.

    Raises
    ------
    KeyError
        If ``product_id`` is not in ``indices_train``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices_train[product_id]
    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort: equally similar products keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query product by position rather than assuming it sorts first:
    # another product with an identical description ties with it, and a product
    # with an all-zero TF-IDF row has self-similarity 0.
    ranked = ranked[ranked != idx][:top_n]
    return train_descriptions.iloc[ranked]

# Example: look up recommendations for the first product of the training set
example_product_id_train = train_descriptions.index[0]
recommendations_train = get_recommendations_train(product_id=example_product_id_train)
print("Content-Based Filtering - Recommendations from Training Set:", recommendations_train)

# Note: evaluation of content-based filtering is qualitative here.
# For a given product, fetch its recommendations and check by hand
# whether they are sensible and relevant.

### Hyperparameter Experimentation for Content-Based Filtering

---

In [None]:
# Experimentation Function for Content-Based Filtering
def experiment_cbf_params(data, ngram_range=(1,1), max_features=None):
    """Build a TF-IDF similarity matrix for one vectorizer configuration.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'Description' column of text without missing values.
    ngram_range : tuple of (int, int), default (1, 1)
        Passed to ``TfidfVectorizer`` as ``ngram_range``.
    max_features : int or None, default None
        Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    2-D array
        Pairwise similarity between the rows of ``data``, in the same order as
        ``data``. Returned so that different configurations can be compared.
    """
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=ngram_range, max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(data['Description'])

    # Compute the cosine similarity matrix. TfidfVectorizer L2-normalises rows by
    # default, so the plain dot product from linear_kernel is the cosine similarity.
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # TODO: [Include the rest of your CBF implementation here] - e.g. rank
    # neighbours from cosine_sim and score them against a relevance criterion.
    return cosine_sim

# Example usage: unigrams + bigrams, vocabulary capped at 500 terms.
# The trailing semicolon stops the notebook from echoing any value the call returns.
experiment_cbf_params(
    data=train_descriptions,
    ngram_range=(1, 2),
    max_features=500,
);