# Understanding the dataset

In [None]:
import pandas as pd
# Load the event log CSV into a DataFrame and preview its first three rows.
df = pd.read_csv("2019-Dec.csv")
df.head(3)

In [None]:
# Summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Number of rows for each distinct event_type value.
df['event_type'].value_counts()

In [None]:
# Number of rows for each distinct brand value.
df['brand'].value_counts()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Designing the Retrieval Model with LangChain

In [None]:
!conda install elasticsearch

In [None]:
from elasticsearch import Elasticsearch

# Client for the local Elasticsearch node; the ping below fails fast if the
# node cannot be reached, so later cells do not run against a dead connection.
es = Elasticsearch(["http://localhost:9200"])

if es.ping():
    print("Connected to Elasticsearch!")
else:
    raise ValueError("Connection failed")


In [None]:
def _text_with_keyword():
    """Return a fresh ``text`` field definition with a ``keyword`` sub-field.

    The sub-field is declared with ``ignore_above: 256``. A new dict is built
    on every call so the fields using it do not share one object.
    """
    return {
        "type": "text",
        "fields": {
            "keyword": {"type": "keyword", "ignore_above": 256},
        },
    }


# Index mapping for the event documents: one entry per document field.
# category_code and brand are the two fields declared as text + keyword.
_properties = {
    "event_time": {"type": "date"},
    "event_type": {"type": "keyword"},
    "product_id": {"type": "integer"},
    "category_id": {"type": "long"},
    "category_code": _text_with_keyword(),
    "brand": _text_with_keyword(),
    "price": {"type": "float"},
    "user_id": {"type": "long"},
    "user_session": {"type": "keyword"},
}

mapping = {"mappings": {"properties": _properties}}


In [None]:
# Create the 'ecommerce_data' index using the `mapping` dict as the request body.
# ignore=400 is passed so an HTTP 400 response does not raise — presumably to
# tolerate "index already exists" when the cell is re-run; confirm.
es.indices.create(index='ecommerce_data', body=mapping, ignore=400)

In [None]:
from elasticsearch import helpers
from tqdm import tqdm

def generate_data(df):
    """Yield one bulk-indexing action per row of ``df``.

    Each action is a dict with ``_index`` set to ``"ecommerce_data"`` and a
    ``_source`` document built from the row. Missing values in ``event_time``,
    ``category_code``, ``brand`` and ``user_session`` become ``None``;
    ``product_id``, ``category_id`` and ``user_id`` are cast to ``int`` and
    ``price`` to ``float``. Progress is reported through ``tqdm``.

    Args:
        df: DataFrame providing the columns ``event_time``, ``event_type``,
            ``product_id``, ``category_id``, ``category_code``, ``brand``,
            ``price``, ``user_id`` and ``user_session``.

    Yields:
        dict: one action per row, in row order.
    """
    # Parse the timestamp column once for the whole frame instead of calling
    # pd.to_datetime on every row, and walk the columns in lockstep instead of
    # using iterrows(), which builds a new Series object for every row.
    event_times = pd.to_datetime(df['event_time'])
    rows = zip(
        event_times,
        df['event_type'],
        df['product_id'],
        df['category_id'],
        df['category_code'],
        df['brand'],
        df['price'],
        df['user_id'],
        df['user_session'],
    )

    for (event_time, event_type, product_id, category_id, category_code,
         brand, price, user_id, user_session) in tqdm(rows, total=len(df), desc="Indexing documents"):
        yield {
            "_index": "ecommerce_data",
            "_source": {
                "event_time": event_time.isoformat() if pd.notna(event_time) else None,
                "event_type": event_type,
                "product_id": int(product_id),
                "category_id": int(category_id),
                "category_code": category_code if pd.notna(category_code) else None,
                "brand": brand if pd.notna(brand) else None,
                "price": float(price),
                "user_id": int(user_id),
                "user_session": user_session if pd.notna(user_session) else None,
            },
        }

# Stream the generated actions to Elasticsearch through the bulk helper; the
# first element of its return value is reported below as the number of
# documents indexed, and the second element is discarded.
success, _ = helpers.bulk(es, generate_data(df))
print(f"Indexed {success} documents successfully.")


In [None]:
import os

from langchain_openai import ChatOpenAI

# The API key is taken from the OPENAI_API_KEY environment variable so it does
# not have to be pasted into the notebook; 'XXX' stays as the placeholder
# fallback (replace it with your API key if you do not set the variable).
# ChatOpenAI uses the chat API, so `model` must name a chat model —
# "text-davinci-004" is not an OpenAI model.
llm = ChatOpenAI(
    openai_api_key=os.environ.get("OPENAI_API_KEY", "XXX"),
    model="gpt-4o-mini",
)

In [None]:
def retrieve_data_from_es(query, size=10, es_client=None):
    """Run a ``match`` query against the ``ecommerce_data`` index.

    Args:
        query: dict placed under ``{"query": {"match": ...}}``, e.g.
            ``{"user_id": "576802932"}``.
        size: maximum number of hits to request. The default of 10 equals
            the page size Elasticsearch applies when none is given, so
            existing calls return the same results; pass a larger value to
            see more of a user's history.
        es_client: client object exposing ``search``; when ``None`` the
            notebook-level ``es`` client is used.

    Returns:
        The list found at ``response['hits']['hits']``.
    """
    client = es if es_client is None else es_client
    response = client.search(
        index="ecommerce_data",
        body={"query": {"match": query}, "size": size},
    )
    return response['hits']['hits']

In [None]:
# Fetch the stored events of one example user.
# NOTE(review): the ID is passed as a string while the index mapping declares
# user_id as long — presumably Elasticsearch coerces it; confirm.
query = {"user_id": "576802932"}
data = retrieve_data_from_es(query)

In [None]:
# Pick the first retrieved hit of each event type. next() is given a None
# default so that a result set without such an event prints None instead of
# aborting the cell with StopIteration.
removal_example = next(
    (item for item in data if item['_source']['event_type'] == 'remove_from_cart'),
    None,
)
view_example = next(
    (item for item in data if item['_source']['event_type'] == 'view'),
    None,
)
print("Removal Example:\n", removal_example)
print("\nView Example:\n", view_example)

# Determining the Campaign Strategy

## Timing

In [None]:
import matplotlib.pyplot as plt

# Parse the timestamps and derive the hour of day of every event.
df['event_time'] = pd.to_datetime(df['event_time'])
df['time_of_day'] = df['event_time'].dt.hour

# Rows: hour of day; columns: event type; cells: number of events.
time_of_day_data = (
    df.groupby(['time_of_day', 'event_type'])
    .size()
    .unstack()
)

# Plot the per-hour counts of every event type on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
time_of_day_data.plot(ax=ax, title='User Interactions by Time of Day')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Events')
ax.set_xticks(range(0, 24))
ax.grid(True)
plt.show()


In [None]:
# Take the five most frequent brands, then build a brand x event_type table of
# event counts restricted to those brands.
top_brands = df['brand'].value_counts().nlargest(5).index
brand_event_type_counts = df[df['brand'].isin(top_brands)].groupby(['brand', 'event_type']).size().unstack()
brand_event_type_counts


In [None]:
# Ratio of remove_from_cart events to cart events for bpw.style, reported
# below as that brand's cart abandonment rate.
# NOTE(review): .loc raises KeyError if 'bpw.style' is not a row of
# brand_event_type_counts — confirm it is among the brands kept in that table.
abandon_rate_bpw = brand_event_type_counts.loc['bpw.style', 'remove_from_cart'] / brand_event_type_counts.loc['bpw.style', 'cart']
print(f"Cart Abandonment Rate for bpw.style: {abandon_rate_bpw:.2f}")


## Example 1: Product discounts

In [None]:
def generate_content(data):
    """Ask the LLM for a discount-focused marketing message for a user's events.

    Args:
        data: list of search hits; each hit's ``_source`` must provide
            ``event_type``, ``brand``, ``price`` and ``event_time``.

    Returns:
        str: the text of the model's reply, a fixed notice when ``data`` is
        empty, or an ``"Error generating content: ..."`` message when the
        LLM call raises.
    """
    if not data:
        return "No data available to generate content."

    messages = [("system", "You are an assistant that generates marketing strategies based on user activities.")]
    # One human message per event, describing what the user did and when.
    for item in data:
        source = item['_source']
        product_description = f"{source['event_type']} the product {source['brand']} priced at ${source['price']} on {source['event_time']}."
        messages.append(("human", product_description))

    messages.append(("human", "Based on these interactions, suggest a targeted marketing message to improve engagement that focuses on product discounts."))

    try:
        response = llm.invoke(messages)
        # Return the reply text rather than the response object, so every
        # path of this function (and the sibling generators) yields a string.
        return response.content
    except Exception as e:
        return f"Error generating content: {str(e)}"


In [None]:
# Fetch one example user's events and generate a discount-focused marketing
# message from them; the result is shown as the cell's output.
query = {"user_id": "576802932"}
data = retrieve_data_from_es(query)
generate_content(data)

## Example 2: Upsell

In [None]:
def generate_upsell_content(data):
    """Ask the LLM for an upselling strategy based on a user's events.

    Args:
        data: list of search hits; each hit's ``_source`` must provide
            ``brand``, ``price`` and ``event_time``.

    Returns:
        str: the text of the model's reply, a fixed notice when ``data`` is
        empty, or an ``"Error generating content: ..."`` message when the
        LLM call (or reading its ``content``) raises.
    """
    if not data:
        return "No data available to generate content."

    # System instruction first, then one request per event, then the final ask.
    prompt = [("system", "You are an assistant that generates upsell opportunities based on user purchase history.")]
    prompt.extend(
        (
            "human",
            f"Identify complementary products for {hit['_source']['brand']} priced at ${hit['_source']['price']} that were viewed but not purchased on {hit['_source']['event_time']}.",
        )
        for hit in data
    )
    prompt.append(("human", "Suggest an upselling strategy that could be included in a follow-up marketing email."))

    try:
        return llm.invoke(prompt).content
    except Exception as exc:
        return f"Error generating content: {exc!s}"


In [None]:
# Fetch one example user's events, generate upsell content from them and print it.
query = {"user_id": "576802932"}
data = retrieve_data_from_es(query)
output = generate_upsell_content(data)
print(output)


## Example 3: bpw.style brand targeting

In [None]:
def retrieve_bpw_style_data(es_client):
    """Fetch up to 100 cart-related events for the brand ``bpw.style``.

    Args:
        es_client: client object exposing ``search``.

    Returns:
        The list found at ``response['hits']['hits']`` for a query requiring
        both a ``brand`` match on ``"bpw.style"`` and an ``event_type`` of
        ``cart`` or ``remove_from_cart``.
    """
    brand_clause = {"match": {"brand": "bpw.style"}}
    cart_events_clause = {"terms": {"event_type": ["cart", "remove_from_cart"]}}

    # Both clauses sit under "must", so a hit has to satisfy each of them.
    search_body = {
        "query": {"bool": {"must": [brand_clause, cart_events_clause]}},
        "size": 100,
    }

    result = es_client.search(index="ecommerce_data", body=search_body)
    return result['hits']['hits']


In [None]:
def generate_reengagement_content(es_client):
    """Ask the LLM for a re-engagement email based on bpw.style cart activity.

    The events come from ``retrieve_bpw_style_data(es_client)``, which returns
    both ``cart`` and ``remove_from_cart`` events, so each event is described
    to the model according to its own type.

    Args:
        es_client: client object passed through to ``retrieve_bpw_style_data``.

    Returns:
        str: the text of the model's reply, a fixed notice when no events are
        found, or an ``"Error generating content: ..."`` message when the
        LLM call raises.
    """
    data = retrieve_bpw_style_data(es_client)
    if not data:
        return "No data available to generate content."

    messages = [
        ("system", "You are an assistant that creates re-engagement strategies for users who have shown interest in bpw.style products but abandoned their carts.")
    ]

    for item in data:
        source = item['_source']
        if source['event_type'] == 'remove_from_cart':
            interaction_desc = f"User showed interest in {source['brand']} priced at ${source['price']} but abandoned the cart on {source['event_time']}."
        else:
            # A plain cart addition is not an abandonment; reporting it as one
            # would hand the model a false history.
            interaction_desc = f"User added {source['brand']} priced at ${source['price']} to the cart on {source['event_time']}."
        messages.append(("human", interaction_desc))

    messages.append(("human", "Generate a personalized email to re-engage the user and encourage them to complete their purchase."))

    try:
        response = llm.invoke(messages)
        return response.content
    except Exception as e:
        return f"Error generating content: {str(e)}"


In [None]:
# Generate the re-engagement message using the notebook's `es` client and print it.
marketing_message = generate_reengagement_content(es)
print(marketing_message)