## Preprocessing Tasks

In [1]:
import pandas as pd

# Read the sales export that sits one directory up, then show its first
# five rows (head() defaults to n=5).
df = pd.read_csv(filepath_or_buffer="../new_sales_data.csv")
df.head()

In [4]:
# Keep the first 50,000 rows as a smaller working set and save it.
# to_csv is left at its default index=True, so the row index is written
# as the first column of the file.
sales = df.iloc[:50000]
sales.to_csv(path_or_buf="reduced.csv")

In [3]:
# Keep an independent (deep) copy of df under the name `x`.
x = df.copy(deep=True)


In [4]:

# Parse the invoice timestamps so the .dt accessor can be used on them.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Source year -> replacement year. Processed in this order; each mask is
# computed on the current state of the column, exactly as two separate
# statements would do (2010 becomes 2022, then 2011 becomes 2024).
year_remap = {2010: 2022, 2011: 2024}
for old_year, new_year in year_remap.items():
    in_old_year = df['InvoiceDate'].dt.year == old_year
    shifted = df.loc[in_old_year, 'InvoiceDate'].map(
        lambda stamp, target=new_year: stamp.replace(year=target)
    )
    df.loc[in_old_year, 'InvoiceDate'] = shifted
df.head()

In [7]:
# Save the current df; the row index is included (to_csv default index=True).
df.to_csv(path_or_buf="new_customer_data.csv")

In [4]:
import numpy as np
import pandas as pd

# Bounds of the synthetic series: every calendar day of 2024.
start_date = '2024-01-01'
end_date = '2024-12-31'

# Daily timestamps from start_date to end_date.
date_range = pd.date_range(start_date, end_date, freq='D')

# Single-column frame of those dates with a default integer index.
# NOTE: this rebinds `df`.
df = date_range.to_frame(index=False, name='InvoiceDate')

# One random integer quantity per day, drawn from [150, 40000): the upper
# bound of randint is exclusive. No seed is set, so values differ per run.
df['Quantity'] = np.random.randint(low=150, high=40000, size=df.shape[0])
df.to_csv("forecast.csv")

# Print the first five rows as a left-aligned markdown table.
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

In [5]:
# Load the segment table (this rebinds `df`) and show its column labels.
df = pd.read_csv(filepath_or_buffer="segment_data.csv")
df.columns

In [12]:
# Function to create marketing data
def create_marketing_data(df):
    """Build a marketing table with status labels and contact emails.

    The input frame is copied first, so the caller's DataFrame is left
    untouched (no added columns, no overwritten ``Segment`` values).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``probability_alive``, ``CustomerID``,
        ``Segment`` and ``Subsegment``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``Email``, ``Segment``, ``Subsegment`` and
        ``Status``, one row per input row.

    Notes
    -----
    * ``Status`` is ``'inactive'`` below 0.4, ``'regular'`` from 0.4 up to
      (not including) 0.6, and ``'active'`` otherwise. A NaN probability
      fails both comparisons and is therefore labelled ``'active'``.
    * The first rows receive the hard-coded test addresses; every other row
      gets a generated address, so ``Email`` differs between runs.
    * ``Faker`` is not imported in this cell; it must already be available
      in the session (presumably ``from faker import Faker`` - confirm).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    marketing = df.copy()

    def status_for(probability):
        # Bucket the alive-probability into the three status labels.
        if probability < 0.4:
            return 'inactive'
        if probability < 0.6:
            return 'regular'
        return 'active'

    marketing['Status'] = marketing['probability_alive'].apply(status_for)

    # Create a Faker instance
    fake = Faker()

    # List of emails to include
    specific_emails = ['davidtgondo@gmail.com', 'd.gondo@alustudent.com']

    # The specific addresses go first (truncated if the frame is shorter
    # than the list); the remaining rows get generated addresses.
    row_count = len(marketing)
    emails = specific_emails[:row_count]
    emails.extend(fake.email() for _ in range(row_count - len(emails)))
    marketing['Email'] = emails

    # Set segment to 'Test' for specific emails
    is_test_row = marketing['Email'].isin(specific_emails)
    marketing.loc[is_test_row, 'Segment'] = 'Test'

    # Select the required columns
    return marketing[['CustomerID', 'Email', 'Segment', 'Subsegment', 'Status']]

In [13]:
# Build the marketing table from the current df and preview five rows.
xc = create_marketing_data(df)
xc.head()

In [2]:
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes import BetaGeoFitter, GammaGammaFitter


def prepare_data(df, customer_id_col, datetime_col, monetary_value_col, observation_period_end):
    """Summarise transactions per customer and keep positive spenders.

    Forwards the column names and the observation end to
    ``summary_data_from_transaction_data`` and returns only the rows of
    its result whose ``monetary_value`` is strictly greater than zero.
    """
    summary_options = {
        "customer_id_col": customer_id_col,
        "datetime_col": datetime_col,
        "monetary_value_col": monetary_value_col,
        "observation_period_end": observation_period_end,
    }
    per_customer = summary_data_from_transaction_data(df, **summary_options)

    # Drop customers whose summarised monetary value is zero or negative.
    has_positive_value = per_customer["monetary_value"] > 0
    return per_customer[has_positive_value]





def predict_variables(summary, bgf, ggf, threshold):
    """Add model predictions to ``summary`` in place and return it.

    Four columns are written, in this order: ``probability_alive``,
    ``predicted_purchases`` (``bgf.predict`` called with 30 as its first
    argument), ``predicted_clv`` and ``estimated_monetary_value``.

    ``summary`` must provide the columns ``frequency``, ``recency``, ``T``
    and ``monetary_value``. ``bgf`` and ``ggf`` are the fitted models whose
    methods are called below. ``threshold`` is accepted but never used.
    """
    frequency = summary['frequency']
    recency = summary['recency']
    age = summary['T']

    summary['probability_alive'] = bgf.conditional_probability_alive(frequency, recency, age)
    summary['predicted_purchases'] = bgf.predict(30, frequency, recency, age)

    clv_settings = {
        'time': 1,  # Lifetime expected for the user in months
        'freq': 'D',
        'discount_rate': 0.01,
    }
    summary['predicted_clv'] = ggf.customer_lifetime_value(
        bgf, frequency, recency, age, summary['monetary_value'], **clv_settings
    )

    summary["estimated_monetary_value"] = ggf.conditional_expected_average_profit(
        frequency, summary['monetary_value']
    )
    return summary

In [22]:
# Take the first 5,000 rows of df as the working sample `d`.
d = df.iloc[:5000]

In [11]:
def fit_models(summary):
    """Fit a BetaGeoFitter and a GammaGammaFitter on ``summary``.

    The BetaGeoFitter (penalizer_coef=0.5) is fitted on the ``frequency``,
    ``recency`` and ``T`` columns; the GammaGammaFitter (penalizer_coef=0.0)
    on ``frequency`` and ``monetary_value``.

    Returns the pair ``(bgf, ggf)`` in that order.
    """
    frequency = summary['frequency']

    purchase_model = BetaGeoFitter(penalizer_coef=0.5)
    purchase_model.fit(frequency, summary['recency'], summary['T'])

    spend_model = GammaGammaFitter(penalizer_coef=0.0)
    spend_model.fit(frequency, summary['monetary_value'])

    return purchase_model, spend_model

In [23]:
# Summarise the sample `d` per customer. The observation period ends at the
# latest InvoiceDate found in `df` (not in `d`). Series.max() replaces the
# builtin max(): it reduces the column in one vectorised call and skips
# missing dates instead of comparing against them element by element.
summary = prepare_data(
    d,
    customer_id_col="CustomerID",
    datetime_col='InvoiceDate',
    monetary_value_col='TotalPrice',
    observation_period_end=df["InvoiceDate"].max(),
)
summary.head(5)

In [12]:
# Fit both models on the summary table; fit_models returns them as (bgf, ggf).
bgf, ggf = fit_models(summary)

In [24]:
# Add the prediction columns to the summary table and preview five rows.
summary = predict_variables(summary, bgf, ggf, threshold=0.5)
summary.head()

In [25]:
# Save the sample `d` without writing its row index.
d.to_csv(path_or_buf="reduce.csv", index=False)