In [24]:
# Import libraries

## Loading and exploring data
import pandas as pd
import numpy as np
import csv

## Topological data analysis
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from sklearn.decomposition import PCA

In [25]:
## Loading the data
sales=pd.read_csv('sales.csv')

# Remove clients with less than 24 registered sales
customers = sales['customer_id'].value_counts()
customers = customers[customers >= 24].index
sales2 = sales[sales['customer_id'].isin(customers)]

In [26]:
# Remove missing values in 'churn_next_month'
sales2=sales2.dropna(subset='churn_next_month')

# Rename column 'churn_next_month' to 'target'
sales2.rename(columns={'churn_next_month': 'target'}, inplace=True)

# Sort values by 'customer_id' and 'month'
sales2 = sales2.sort_values(by=['customer_id', 'month'])
df = sales2

# Join all sales of a customer in a single row, as a pseudo-time series
pivot_df = df.pivot_table(index='customer_id', columns='month', values='amount', aggfunc='sum')
target_sum = df.groupby('customer_id')['target'].sum()
pivot_df = pivot_df.join(target_sum)

# Rename columns as order of observation
pivot_df = pivot_df.rename(columns=lambda x: f'month_{x}' if isinstance(x, int) else x)
pivot_df.reset_index(inplace=True)
df = pivot_df

# Displace all non-null values to the end of the row
def move_non_nulls_to_end(row):
    '''
    This functuon moves all non-null values to the end of the row.

    Args: a row of a DataFrame (pd.Series)

    Output: the input row with non-null values moved to the end (pd.Series)
    '''
    non_nulls = row.dropna().tolist()
    nulls = [np.nan] * (len(row) - len(non_nulls))
    return pd.Series(nulls + non_nulls)


df_shifted = df.copy()

# In the neew DataFrame, move all non-null values to the end of the row
df_shifted.iloc[:, 1:] = df.iloc[:, 1:].apply(move_non_nulls_to_end, axis=1)

# Replace null values with 0
df_shifted = df_shifted.fillna(0)
df=df_shifted

# Remove first and last columns
columns = df.columns[1:-1]

# Calculate the percentage change of each column with respect to the previous one
df_percent_change = df[columns].pct_change(axis=1) * 100

# Replace infinite values with 100
df_percent_change.replace([np.inf, -np.inf], 100, inplace=True)

# Replace NaN values with 0
df_percent_change.fillna(0, inplace=True)

# Add the first and last columns (customer_id and target)
df_percent_change.insert(0, 'ID', df.iloc[:, 0])
df_percent_change['target'] = df['target']

In [28]:
id_homology = df_percent_change['ID']

filename = 'birthdeath_id.csv'
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(id_homology)

Error: iterable expected, not int

In [None]:
# Convert time series to point cloud
persistence1 = SingleTakensEmbedding(parameters_type="search", n_jobs=-1)

# Calculate the persistence diagram
homology = VietorisRipsPersistence(metric="euclidean", homology_dimensions=(0, 1), n_jobs=-1)
pca = PCA(n_components=2)

In [None]:
# Generate the persistence diagram for each row (client)
homo_diags = []
for i in range(len(df_percent_change)):
    ts = pd.Series(df_percent_change.iloc[i, 1:-1])         # convert row to pd series
    SOI = ts.values.reshape(-1)                             # keep pnly values
    diagram = persistence1.fit_transform(SOI)               # fit Takens embedding to data
    diagram = diagram.reshape(1,*diagram.shape)             # Adjust shape
    diag = diagram[0]
    comps_pure = pca.fit_transform(diag)                    # Apply PCA to data
    comps = comps_pure.reshape(1, *comps_pure.shape)        # adjust sizes
    homology_diagram = homology.fit_transform(comps)        # fit Vietoris- Rips embedding
    homo_diags.append(homology_diagram)
    #print(diagram.shape)

In [None]:
# Computing Bett numbers 1

b1 = []
for i in range(len(homo_diags)):
    diagramas = homo_diags[i][0]                                    # recover data of H0 and H1 (birth, death, tyoe)
    b_1 = []
    for j in range(len(diagramas)):
        if diagramas[j][2] == 1:                                    # Compute lifespan in relation to H0 and H1
            b_1.append(np.abs(diagramas[j][0] - diagramas[j][1]))   # Join previous result to Beta_1
    b_1.sort()
    b_1.reverse()                                                   # Order from largest to smallest lifespan
    b_1 = b_1[:3]
    b1.append(b_1)

In [None]:
# Load filename
filename = 'homology_output.csv'

# Write CSV for homology
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(homo_diags)

# Write CSV for homology summary :)
filename = 'birthdeath_output.csv'

with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(b1)