# <center><b> <em> Feature Engineering </em> </b></center>

### <font color="green"> | - Import des Biblio et Dataframe</font>

In [182]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt

In [206]:
# Définir une fonction de conversion pour la colonne datetime
def parse_datetime(date_string):
    return pd.to_datetime(date_string)

# Spécifier les types de données et les convertisseurs
dtype = {'invoice': str, 'object': int, 'stock_code': object, 'description': object, 'quantity': int, 'unit_price': float, 'customer_id': object, 'country': object}
converters = {'invoice_date': parse_datetime}

# Charger le fichier CSV en utilisant les types de données et les convertisseurs
data = pd.read_csv("../data/silver.csv", dtype=dtype, converters=converters)

In [184]:
data.tail()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1033031,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
1033032,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1033033,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1033034,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France
1033035,581587,POST,POSTAGE,1,2011-12-09 12:50:00,18.0,12680.0,France


In [207]:
data.describe()

Unnamed: 0,quantity,unit_price
count,1033036.0,1033036.0
mean,10.07688,4.61398
std,175.1976,122.3975
min,-80995.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.15
max,80995.0,38970.0


### <font color="green"> || - Ajout column Total Price </font>

In [214]:
data["total_price"] = data["quantity"] * data["unit_price"]

In [235]:
data.head()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,unit_price,customer_id,country,total_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


In [236]:
data = data[(data['stock_code'] != 'TEST001') & (data['stock_code'] != 'D') & (data['stock_code'] != 'M')]

### <font color="green"> ||| - RFM Analysis </font>

La métrique RFM (Récence, Fréquence, Montant) est une méthode d'analyse utilisée dans le domaine du marketing et de la gestion de la relation client. Elle permet de segmenter et d'évaluer la valeur des clients en fonction de trois critères principaux :

    * Récence (Recency) : Cette mesure évalue depuis combien de temps un client n'a pas effectué d'activité ou d'achat. Plus la récence est faible, c'est-à-dire plus le client a interagi récemment avec l'entreprise, plus il est considéré comme actif et potentiellement précieux.

    * Fréquence (Frequency) : Cette mesure évalue la fréquence à laquelle un client effectue des activités ou des achats. Les clients qui interagissent plus souvent avec l'entreprise sont généralement considérés comme plus engagés et fidèles.

    * Montant (Monetary) : Cette mesure évalue la valeur monétaire des activités ou des achats effectués par un client. Elle prend en compte le montant total dépensé par le client sur une période donnée. Les clients qui dépensent davantage sont souvent considérés comme plus précieux pour l'entreprise.

En combinant ces trois mesures, on peut créer des segments de clients plus pertinents et ciblés. Par exemple, un segment "VIP" pourrait être composé de clients récents, fréquents et à haut montant, tandis qu'un segment "à réactiver" pourrait inclure des clients moins actifs sur une longue période.

La métrique RFM permet aux entreprises de mieux comprendre le comportement et la valeur de leurs clients, ce qui leur permet de personnaliser leurs stratégies de marketing, de fidélisation et de service client. Elle est souvent utilisée en conjonction avec des techniques de segmentation et d'analyse de données pour prendre des décisions plus éclairées et optimiser les efforts de marketing et de gestion de la relation client.

## <font color="purple"> 
 - <b> Recency </b> : the number of days between today_date and the last purchase date of this customer  

 - <b> Frequency </b> : the number of purchase of this customer  

 - <b> Monetary </b> : sum of TotalPrice of this customer 

 
 </font>


In [188]:
today_date = dt.datetime(2011, 12, 11)

In [237]:
RFM_data = data.groupby("customer_id").agg({"invoice_date": lambda InvoiceDate: (today_date - InvoiceDate.max()).days,
                                     "invoice": lambda x: x.astype(str)[~x.astype(str).str.startswith('C')].nunique(),
                                     "total_price": lambda TotalPrice: TotalPrice.sum()})
RFM_data.columns = ["recency", "frequency", "monetary"]


RFM_data.head(10)

Unnamed: 0_level_0,recency,frequency,monetary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,4,66.86
12347.0,3,8,4921.53
12348.0,76,5,2019.4
12349.0,19,4,4404.54
12350.0,311,1,334.4
12351.0,376,1,300.93
12352.0,37,9,1889.21
12353.0,205,2,406.76
12354.0,233,1,1079.4
12355.0,215,2,947.61


In [239]:
RFM_data.head()
#utiliser ACP pour visualiser les variables 4 (hue par cluster)

Unnamed: 0_level_0,recency,frequency,monetary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,4,66.86
12347.0,3,8,4921.53
12348.0,76,5,2019.4
12349.0,19,4,4404.54
12350.0,311,1,334.4


In [240]:
df_cancelled = data[data["invoice"].str.contains("C", na=False)]
df_cancelled.head(6)

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,unit_price,customer_id,country,total_price
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01 10:33:00,2.95,16321.0,Australia,-35.4
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01 10:33:00,1.65,16321.0,Australia,-9.9
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01 10:33:00,4.25,16321.0,Australia,-17.0
181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01 10:33:00,2.1,16321.0,Australia,-12.6
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01 10:33:00,2.95,16321.0,Australia,-35.4
183,C489449,21871,SAVE THE PLANET MUG,-12,2009-12-01 10:33:00,1.25,16321.0,Australia,-15.0


In [252]:
df_churn_custmer = data.groupby("customer_id").agg({
    "total_price": [
        lambda x: x[x > 0].sum(),
        lambda x: x[x < 0].sum(),
        lambda x: abs(x[x < 0].sum() * 100 / x[x > 0].sum())
    ]
})

df_churn_custmer.columns = ['payed_amount', 'refund_amount', 'pourcentage']
df_churn_custmer = df_churn_custmer.reset_index()

  lambda x: abs(x[x < 0].sum() * 100 / x[x > 0].sum())
  lambda x: abs(x[x < 0].sum() * 100 / x[x > 0].sum())


In [253]:
df_churn_custmer.head(10)

Unnamed: 0,customer_id,payed_amount,refund_amount,pourcentage
0,12346.0,77353.96,-77287.1,99.913566
1,12347.0,4921.53,0.0,0.0
2,12348.0,2019.4,0.0,0.0
3,12349.0,4428.69,-24.15,0.545308
4,12350.0,334.4,0.0,0.0
5,12351.0,300.93,0.0,0.0
6,12352.0,2009.54,-120.33,5.987938
7,12353.0,406.76,0.0,0.0
8,12354.0,1079.4,0.0,0.0
9,12355.0,947.61,0.0,0.0


In [296]:
rfm_data =pd.merge(RFM_data, df_churn_custmer, on='customer_id')

In [297]:
rfm_data.head()

Unnamed: 0,customer_id,recency,frequency,monetary,payed_amount,refund_amount,pourcentage
0,12346.0,326,4,66.86,77353.96,-77287.1,99.913566
1,12347.0,3,8,4921.53,4921.53,0.0,0.0
2,12348.0,76,5,2019.4,2019.4,0.0,0.0
3,12349.0,19,4,4404.54,4428.69,-24.15,0.545308
4,12350.0,311,1,334.4,334.4,0.0,0.0


In [256]:
rfm_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5899 entries, 0 to 5898
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customer_id    5899 non-null   object 
 1   recency        5899 non-null   int64  
 2   frequency      5899 non-null   int64  
 3   monetary       5899 non-null   float64
 4   payed_amount   5899 non-null   float64
 5   refund_amount  5899 non-null   float64
 6   pourcentage    5898 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 368.7+ KB


In [298]:
rfm_data = rfm_data[['customer_id' , 'recency', 'frequency' , 'monetary', 'pourcentage']]

In [299]:
rfm_data.head()

Unnamed: 0,customer_id,recency,frequency,monetary,pourcentage
0,12346.0,326,4,66.86,99.913566
1,12347.0,3,8,4921.53,0.0
2,12348.0,76,5,2019.4,0.0
3,12349.0,19,4,4404.54,0.545308
4,12350.0,311,1,334.4,0.0


### * Data num & catég for analyse

In [285]:
unique_customers = data[['customer_id', 'country']].drop_duplicates()

In [287]:
unique_customers.shape

(5927, 2)

In [300]:
mixed_data = pd.merge(rfm_data,unique_customers , on='customer_id')

In [303]:
mixed_data.head()

Unnamed: 0,customer_id,recency,frequency,monetary,pourcentage,country
0,12346.0,326,4,66.86,99.913566,United Kingdom
1,12347.0,3,8,4921.53,0.0,Iceland
2,12348.0,76,5,2019.4,0.0,Finland
3,12349.0,19,4,4404.54,0.545308,Italy
4,12350.0,311,1,334.4,0.0,Norway


### <font color="green"> V - Export Data </font>

In [264]:
#EXporter la data dans un Csv 
df_churn_custmer.to_csv('../data/df_churn_custmer.csv', index=False)

In [265]:
#EXporter la data dans un Csv 
rfm_data.to_csv('../data/rfm.csv', index=False)

In [304]:
#EXporter la data dans un Csv 
mixed_data.to_csv('../data/mixed_data.csv', index=False)