# <center><b> <em> Feature Engineering </em> </b></center>

### <font color="green"> | - Import des bibliothèques et du DataFrame</font>

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt

In [13]:
# Conversion helper for the datetime column
def parse_datetime(date_string):
    """Convert a raw date value into a pandas datetime.

    Parameters
    ----------
    date_string
        Value to convert; it is handed to ``pd.to_datetime`` unchanged.

    Returns
    -------
    Whatever ``pd.to_datetime`` produces for that input.
    """
    parsed = pd.to_datetime(date_string)
    return parsed

# Column dtypes enforced while reading the CSV.
# The former 'object': int entry was dropped: no column of the file carries that
# name (see the dtypes listing printed below), so it had no effect.
# invoice_date is deliberately absent: it is read as raw text and converted afterwards.
dtype = {'invoice': str, 'stock_code': object, 'description': object, 'quantity': int, 'unit_price': float, 'customer_id': object, 'country': object}

# Load the CSV file with the dtypes above
data = pd.read_csv("../data/silver.csv", dtype=dtype)

# Convert the whole invoice_date column with a single vectorised call rather than
# through a read_csv converter, which invokes the helper once per row.
data['invoice_date'] = parse_datetime(data['invoice_date'])

# Check the resulting dtypes
print(data.dtypes)

invoice                 object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
customer_id             object
country                 object
dtype: object


In [14]:
# Preview the last rows of the loaded data
data.tail()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1033031,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
1033032,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1033033,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1033034,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France
1033035,581587,POST,POSTAGE,1,2011-12-09 12:50:00,18.0,12680.0,France


In [15]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded output shows negative minima for both quantity and
# unit_price — presumably cancellations/adjustments; confirm they are meant to
# flow into total_price and the monetary value computed later.
data.describe()

Unnamed: 0,quantity,unit_price
count,1033036.0,1033036.0
mean,10.07688,4.61398
std,175.1976,122.3975
min,-80995.0,-53594.36
25%,1.0,1.25
50%,3.0,2.1
75%,10.0,4.15
max,80995.0,38970.0


### <font color="green"> || - Ajout de la colonne Total Price </font>

In [16]:
# Line total: quantity multiplied element-wise by the unit price
data["total_price"] = data["quantity"].mul(data["unit_price"])

In [17]:
# Preview the first rows, now including the total_price column
data.head()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,unit_price,customer_id,country,total_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


### <font color="green"> ||| - RFM Analysis </font>

<font color="purple">

 - <b> Recency </b> : the number of days between <code>today_date</code> and this customer's last purchase date

 - <b> Frequency </b> : the number of distinct invoices (purchases) made by this customer

 - <b> Monetary </b> : the sum of <code>total_price</code> over all of this customer's purchases

</font>


In [18]:
# Fixed reference date from which recency is measured
today_date = dt.datetime(year=2011, month=12, day=11)

In [19]:
def _days_since_last_purchase(invoice_dates):
    """Whole days between today_date and the latest date in the group."""
    return (today_date - invoice_dates.max()).days


def _distinct_invoice_count(invoices):
    """Number of distinct invoice identifiers in the group."""
    return invoices.nunique()


def _total_spend(line_totals):
    """Sum of the group's total_price values."""
    return line_totals.sum()


# One row per customer_id; each output column keeps the name of its source
# column (invoice_date -> recency, invoice -> frequency, total_price -> monetary).
rfm_aggregations = {
    'invoice_date': _days_since_last_purchase,
    'invoice': _distinct_invoice_count,
    'total_price': _total_spend,
}
rfm = data.groupby('customer_id').agg(rfm_aggregations)

In [20]:
rfm.head()
# TODO: add the percentage of cancelled invoices
# TODO: k-means on the numeric features and k-prototypes on the mixed features
# TODO: go from 800 to 500 thousand (original note: "800 à 500 mill"; exact meaning unclear — confirm), then group by cluster to see what characterises each one (mean, describe, ...) and use that alongside the elbow method to justify removing or adding a cluster
# TODO: use PCA to visualise the 4 variables (hue by cluster)

Unnamed: 0_level_0,invoice_date,invoice,total_price
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,17,-51.74
12347.0,3,8,4921.53
12348.0,76,5,2019.4
12349.0,19,5,4404.54
12350.0,311,1,334.4


In [21]:
# Check the column dtypes and non-null counts of the RFM table
rfm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5942 entries, 12346.0 to 18287.0
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   invoice_date  5942 non-null   int64  
 1   invoice       5942 non-null   int64  
 2   total_price   5942 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 185.7+ KB


### <font color="green"> IV - Export gold Data </font>

In [24]:
# Display the RFM table before exporting it.
# NOTE(review): the recorded output shows customer_id as a regular column with a
# 0..N index, whereas the groupby aggregation leaves customer_id in the index, and
# the execution counts jump from 21 to 24 — presumably a reset_index() cell was
# run and later deleted; confirm and restore it so Restart & Run All reproduces this.
rfm

Unnamed: 0,customer_id,invoice_date,invoice,total_price
0,12346.0,326,17,-51.74
1,12347.0,3,8,4921.53
2,12348.0,76,5,2019.40
3,12349.0,19,5,4404.54
4,12350.0,311,1,334.40
...,...,...,...,...
5937,18283.0,4,22,2664.90
5938,18284.0,430,2,436.68
5939,18285.0,661,1,427.00
5940,18286.0,477,3,1188.43


In [29]:
# Export the RFM table to a CSV file.
# groupby('customer_id') leaves customer_id in the index, and to_csv(columns=...)
# can only select regular columns, so on a fresh kernel the export would raise a
# KeyError. Turn the index into a column here when that has not been done yet.
rfm_export = rfm if 'customer_id' in rfm.columns else rfm.reset_index()

# NOTE(review): the input is read from ../data/ and an earlier version of this
# cell wrote to '../data/rfm_data.csv'; confirm that the working directory is
# the intended destination.
rfm_export.to_csv('rfm_data.csv', columns=['customer_id', 'invoice_date', 'invoice', 'total_price'], index=False)
