## Import packages

In [None]:
import pandas as pd
import configparser
import os
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import unidecode
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings

In [None]:
# Fetch the NLTK stopword corpus and build the French stopword set used by the
# token filter.
nltk.download('stopwords')

french_stopwords = set(stopwords.words('french'))

# Extra words filtered on top of the NLTK list (brand names, number words,
# frequent filler terms).
# NOTE(review): "modle" looks like "modèle" with the accented character dropped;
# "modele" is added because tokenize() transliterates accents (è -> e) instead
# of dropping them, so "modle" alone can no longer match. Confirm the intent.
custom_stopwords = {
    "des", "dun", "cette", "pour", "dune", "un", "deux", "trois", "quatre",
    "louis", "vuitton", "lv", "modle", "modele", "fermeture",
}

french_stopwords.update(custom_stopwords)

# tokenize() strips accents *before* comparing against this set, so accented
# entries from NLTK ("été", "même", ...) would never match. Add the unaccented
# form of every stopword so the filter actually applies to them.
french_stopwords.update({unidecode.unidecode(stopword) for stopword in french_stopwords})

In [None]:
# Dealing with special characters
def remove_accents(text):
    """
    Transliterate ``text`` to plain ASCII with ``unidecode``.

    Accented letters are replaced by their closest unaccented spelling,
    e.g. 'é' -> 'e', 'à' -> 'a', 'œ' -> 'oe'.
    """
    ascii_text = unidecode.unidecode(text)
    return ascii_text

## Loading configuration and data

In [None]:
# Load the notebook configuration and derive the three input file paths.
config_path = '../cfg/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did read, so an empty result means the configuration is missing.
if not config.read(config_path):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {config_path}")

# Index the options directly (instead of .get, which returns None when an
# option is absent) so a missing entry raises a KeyError naming the option
# rather than an opaque TypeError inside os.path.join.
data_dir = config['DEFAULT']['data_dir']

product_df_path = os.path.join(data_dir, config['DEFAULT']['combined_product'])
client_df_path = os.path.join(data_dir, config['DEFAULT']['combined_client'])
transac_df_path = os.path.join(data_dir, config['DEFAULT']['combined_transac'])

In [None]:
# Load the three input tables. The product file is comma-separated while the
# client and transaction files are semicolon-separated.
product, client, transac = (
    pd.read_csv(csv_path, sep=delimiter)
    for csv_path, delimiter in (
        (product_df_path, ','),
        (client_df_path, ';'),
        (transac_df_path, ';'),
    )
)

## Preprocessing product descriptions

In [None]:
# Clean HTML tags from sku_description using BeautifulSoup
def clean_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Build the HTML-free description column; missing descriptions become ''.
product['clean_description'] = product['sku_description'].apply(
    lambda raw: '' if pd.isnull(raw) else clean_html(raw)
)

# Tokenization and cleaning
def tokenize(text):
    """
    Split a description into filtered, normalised word tokens.

    The text is lower-cased, transliterated with ``remove_accents``, has its
    apostrophes turned into spaces and every character other than a-z, 0-9 and
    whitespace removed. Words of two characters or fewer and words present in
    ``french_stopwords`` are discarded. Because accents are stripped before the
    stopword comparison, only unaccented entries of that set can match.
    """
    normalized = remove_accents(text.lower())
    normalized = normalized.replace("'", " ")
    normalized = re.sub(r"[^a-z0-9\s]", "", normalized)

    tokens = []
    for candidate in normalized.split():
        # Skip stopwords and very short words
        if candidate in french_stopwords or len(candidate) <= 2:
            continue
        tokens.append(candidate)
    return tokens

# One token list per product description
product['tokens'] = product['clean_description'].apply(tokenize)

## Compute word frequencies

In [None]:
# Unweighted word frequency across all product descriptions:
# flatten every token list into one sequence, then count occurrences.
all_words = []
for tokens in product['tokens']:
    all_words.extend(tokens)

word_counts = Counter(all_words)
top_words = word_counts.most_common(20)
print("Top words overall in product descriptions:", top_words, sep="\n")

In [None]:
# Word frequency weighted by sales volume.
# Join each transaction line to the token list of the product it refers to;
# the inner join keeps only products that appear in both tables.
product_transac = pd.merge(
    product[['product_id', 'tokens']],
    transac[['product_id', 'product_quantity']],
    on='product_id',
    how='inner',
)

# Each word is credited with the quantity sold on every transaction line of a
# product whose description contains it. Zipping the two columns avoids
# DataFrame.iterrows(), which builds a full Series per row just to read two
# fields; rows are visited in the same order, so the counts are unchanged.
weighted_counter = Counter()
for tokens, qty in zip(product_transac['tokens'], product_transac['product_quantity']):
    for word in tokens:
        weighted_counter[word] += qty

top_weighted_words = weighted_counter.most_common(20)
print("Top weighted words (by sales volume):")
print(top_weighted_words)

## Trend analysis and forecasting

In [None]:
# Convert week strings (e.g., "W202310") to a datetime.

def week_to_date(week_str):
    """
    Return the Monday (as a ``datetime``) of the week encoded in ``week_str``.

    The first character is skipped, characters 1-4 are read as the year and
    the remainder as the week number (e.g. "W202310"). Parsing relies on the
    ``%W`` directive: weeks start on Monday and the days before the year's
    first Monday belong to week 0, which is not the ISO-8601 numbering.
    """
    year, week = int(week_str[1:5]), int(week_str[5:])
    monday_label = f'{year}-W{week}-1'
    return datetime.strptime(monday_label, "%Y-W%W-%w")

# Attach a datetime to every transaction, derived from its week label.
transac['week_date'] = transac['week'].apply(week_to_date)

# Restrict the weekly trend analysis to the five highest-weighted words.
top_words_list = [pair[0] for pair in top_weighted_words[:5]]
trend_data = {}

for word in top_words_list:
    # Products whose token list contains the current word
    contains_word = product['tokens'].apply(lambda tokens: word in tokens)
    product_ids_with_word = product[contains_word]['product_id'].unique()
    # Transactions restricted to those products
    df_word = transac[transac['product_id'].isin(product_ids_with_word)]
    # Total quantity per week, in chronological order
    weekly_totals = df_word.groupby('week_date')['product_quantity'].sum()
    sales_by_week = weekly_totals.sort_index()
    trend_data[word] = sales_by_week

In [None]:
# Simple linear forecasting for the next 4 weeks based on historical data.
# For each word, a straight line is fitted to its weekly sales series and
# extrapolated; the observed series and the forecast are then plotted.

forecast_horizon = 4  # number of future weeks to predict
# The series is aggregated per week, so forecast points are always one week
# apart. Deriving the step from the first two observations would be wrong
# whenever a week is missing between them.
interval = pd.Timedelta(weeks=1)

forecast_results = {}
for word, series in trend_data.items():
    if len(series) < 5:
        continue  # Skip if there isn't enough data to forecast
    # Ordinal time index: observation i is treated as time step i.
    # NOTE(review): this assumes the observed weeks are contiguous; weeks
    # without any transaction are absent from the series, not zero-filled.
    x = np.arange(len(series))
    y = series.values
    # Least-squares fit of a degree-1 polynomial (highest power first)
    slope, intercept = np.polyfit(x, y, 1)
    forecast_x = np.arange(len(series), len(series) + forecast_horizon)
    forecast_y = intercept + slope * forecast_x
    forecast_results[word] = forecast_y

    # Plot actual sales and forecast
    plt.figure(figsize=(10, 4))
    plt.plot(series.index, y, marker='o', label='Actual Sales')
    # Forecast dates continue weekly from the last observed week
    forecast_dates = [series.index[-1] + interval * (i + 1) for i in range(forecast_horizon)]
    plt.plot(forecast_dates, forecast_y, marker='x', linestyle='--', label='Forecast')
    plt.title(f"Sales Trend and Forecast for '{word}'")
    plt.xlabel("Week")
    plt.ylabel("Total Quantity Sold")
    plt.legend()
    plt.show()

## Insights and forecast summary

In [None]:
# Print the extrapolated weekly quantities for every word that was forecast.
print("Forecast predictions for the top words:")
for word in forecast_results:
    forecast = forecast_results[word]
    print(f"Word: '{word}', Forecast for next 4 weeks: {forecast}")