<a href="https://colab.research.google.com/github/SALMA55ASHRAF/AI_instant_tasks/blob/master/RFM_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import important libraries

In [None]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
# Render every plotly figure in this notebook with the white theme.
pio.templates.default = "plotly_white"

# Location of the transactions CSV (a Colab `/content` path).
DATA_PATH = "/content/rfm_data (1).csv"

# Read the raw transactions and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location
0,8814,2023-04-11,943.31,Product C,890075,Tokyo
1,2188,2023-04-11,463.7,Product A,176819,London
2,4608,2023-04-11,80.28,Product A,340062,New York
3,2559,2023-04-11,221.29,Product A,239145,London
4,9482,2023-04-11,739.56,Product A,194545,Paris


##### Now let's explore our DataFrame, which consists of 6 columns:


1. CustomerID: identifier of the customer
2. PurchaseDate: date on which the customer made the purchase
3. TransactionAmount: amount of money the customer paid in the transaction
4. ProductInformation: name of the product purchased
5. OrderID: identifier of the order
6. Location: location from which the customer placed the order


In [None]:
# Size of the raw data as a (rows, columns) tuple.
df.shape

(1000, 6)

In [None]:
# Count missing values per column in the raw data.
df.isna().sum()

CustomerID            0
PurchaseDate          0
TransactionAmount     0
ProductInformation    0
OrderID               0
Location              0
dtype: int64

#### Now I will calculate the recency, frequency and monetary values

In [17]:
from datetime import datetime

# Parse the purchase dates so that date arithmetic is possible.
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# Recency is measured against today's date, so the absolute values grow by one
# for every day that passes between runs. Assign a fixed pd.Timestamp here
# instead if the numbers need to be reproducible.
snapshot_date = pd.Timestamp(datetime.now().date())

# Whole days between each purchase and the snapshot date. Subtracting
# datetime64 values stays vectorised (no per-row lambda over Python objects);
# normalize() drops any time-of-day component before subtracting.
df['Recency'] = (snapshot_date - df['PurchaseDate'].dt.normalize()).dt.days
df['Recency']

0      420
1      420
2      420
3      420
4      420
      ... 
995    360
996    360
997    360
998    360
999    360
Name: Recency, Length: 1000, dtype: int64

In [19]:
# Calculate Frequency: number of order rows recorded for each customer.
frequency_data = (
    df.groupby('CustomerID')['OrderID']
    .count()
    .rename('Frequency')
    .reset_index()
)

# Calculate Monetary Value: total amount spent by each customer.
monetary_data = (
    df.groupby('CustomerID')['TransactionAmount']
    .sum()
    .rename('MonetaryValue')
    .reset_index()
)

# Attach both per-customer metrics to every transaction row.
# Any Frequency/MonetaryValue columns left over from a previous run of this
# cell are dropped first; otherwise a second run would merge them again and
# produce Frequency_x/Frequency_y instead of the expected column names.
df = (
    df.drop(columns=['Frequency', 'MonetaryValue'], errors='ignore')
    .merge(frequency_data, on='CustomerID', how='left')
    .merge(monetary_data, on='CustomerID', how='left')
)
df

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue
0,8814,2023-04-11,943.31,Product C,890075,Tokyo,420,1,943.31
1,2188,2023-04-11,463.70,Product A,176819,London,420,1,463.70
2,4608,2023-04-11,80.28,Product A,340062,New York,420,1,80.28
3,2559,2023-04-11,221.29,Product A,239145,London,420,1,221.29
4,9482,2023-04-11,739.56,Product A,194545,Paris,420,1,739.56
...,...,...,...,...,...,...,...,...,...
995,2970,2023-06-10,759.62,Product B,275284,London,360,1,759.62
996,6669,2023-06-10,941.50,Product C,987025,New York,360,1,941.50
997,8836,2023-06-10,545.36,Product C,512842,London,360,1,545.36
998,1440,2023-06-10,729.94,Product B,559753,Paris,360,1,729.94


In [20]:
# Re-check for missing values now that the R, F and M columns have been added.
df.isna().sum()

CustomerID            0
PurchaseDate          0
TransactionAmount     0
ProductInformation    0
OrderID               0
Location              0
Recency               0
Frequency             0
MonetaryValue         0
dtype: int64

In [21]:
# Score labels, listed in the order of the bins (lowest raw value first).
recency_scores = list(range(5, 0, -1))  # fewer days since purchase -> higher score
frequency_scores = list(range(1, 6))    # more orders -> higher score
monetary_scores = list(range(1, 6))     # more money spent -> higher score

# Each metric's observed range is cut into 5 equal-width bins and every row
# receives the label of the bin its value falls into.
score_specs = {
    'RecencyScore': ('Recency', recency_scores),
    'FrequencyScore': ('Frequency', frequency_scores),
    'MonetaryScore': ('MonetaryValue', monetary_scores),
}
for score_col, (metric_col, score_labels) in score_specs.items():
    df[score_col] = pd.cut(df[metric_col], bins=5, labels=score_labels)

In [22]:
# Preview the first rows, now including the three score columns.
df.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue,RecencyScore,FrequencyScore,MonetaryScore
0,8814,2023-04-11,943.31,Product C,890075,Tokyo,420,1,943.31,1,1,2
1,2188,2023-04-11,463.7,Product A,176819,London,420,1,463.7,1,1,1
2,4608,2023-04-11,80.28,Product A,340062,New York,420,1,80.28,1,1,1
3,2559,2023-04-11,221.29,Product A,239145,London,420,1,221.29,1,1,1
4,9482,2023-04-11,739.56,Product A,194545,Paris,420,1,739.56,1,1,2


In [23]:
# Inspect column dtypes; the score columns produced by pd.cut are categorical here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   CustomerID          1000 non-null   int64         
 1   PurchaseDate        1000 non-null   datetime64[ns]
 2   TransactionAmount   1000 non-null   float64       
 3   ProductInformation  1000 non-null   object        
 4   OrderID             1000 non-null   int64         
 5   Location            1000 non-null   object        
 6   Recency             1000 non-null   int64         
 7   Frequency           1000 non-null   int64         
 8   MonetaryValue       1000 non-null   float64       
 9   RecencyScore        1000 non-null   category      
 10  FrequencyScore      1000 non-null   category      
 11  MonetaryScore       1000 non-null   category      
dtypes: category(3), datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 74.0+ KB


In [24]:
# Cast the categorical score columns to integers so they can be summed.
for score_column in ('RecencyScore', 'FrequencyScore', 'MonetaryScore'):
    df[score_column] = df[score_column].astype(int)

In [25]:
# Confirm that the three score columns are now integer typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   CustomerID          1000 non-null   int64         
 1   PurchaseDate        1000 non-null   datetime64[ns]
 2   TransactionAmount   1000 non-null   float64       
 3   ProductInformation  1000 non-null   object        
 4   OrderID             1000 non-null   int64         
 5   Location            1000 non-null   object        
 6   Recency             1000 non-null   int64         
 7   Frequency           1000 non-null   int64         
 8   MonetaryValue       1000 non-null   float64       
 9   RecencyScore        1000 non-null   int64         
 10  FrequencyScore      1000 non-null   int64         
 11  MonetaryScore       1000 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(7), object(2)
memory usage: 93.9+ KB


In [26]:
# Overall RFM score: row-wise sum of the three individual scores.
score_columns = ['RecencyScore', 'FrequencyScore', 'MonetaryScore']
df['RFM_Score'] = df[score_columns].sum(axis=1)

# Split the RFM score into three quantile-based value segments.
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
df['Value Segment'] = pd.qcut(df['RFM_Score'], q=3, labels=segment_labels)

In [27]:
# Number of rows in each value segment, as a two-column frame for plotting.
segment_counts = (
    df['Value Segment']
    .value_counts()
    .rename_axis('Value Segment')
    .reset_index(name='Count')
)

pastel_colors = px.colors.qualitative.Pastel

# One bar per value segment, each with its own pastel colour.
fig_segment_dist = px.bar(
    segment_counts,
    x='Value Segment',
    y='Count',
    color='Value Segment',
    color_discrete_sequence=pastel_colors,
    title='RFM Value Segment Distribution',
)

# Axis titles; the legend is hidden because the x axis already names the segments.
fig_segment_dist.update_xaxes(title_text='RFM Value Segment')
fig_segment_dist.update_yaxes(title_text='Count')
fig_segment_dist.update_layout(showlegend=False)

fig_segment_dist.show()

In [28]:
# Start with an empty label; rows whose score matches no rule keep it.
df['RFM Customer Segments'] = ''

# Segment rules as (minimum score inclusive, maximum score exclusive, label).
segment_rules = [
    (9, float('inf'), 'Champions'),
    (6, 9, 'Potential Loyalists'),
    (5, 6, 'At Risk Customers'),
    (4, 5, "Can't Lose"),
    (3, 4, 'Lost'),
]

# Label every row according to the score band its RFM score falls into.
for low, high, segment in segment_rules:
    in_band = (df['RFM_Score'] >= low) & (df['RFM_Score'] < high)
    df.loc[in_band, 'RFM Customer Segments'] = segment

# Preview the data with the new segment column.
df.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score,Value Segment,RFM Customer Segments
0,8814,2023-04-11,943.31,Product C,890075,Tokyo,420,1,943.31,1,1,2,4,Low-Value,Can't Lose
1,2188,2023-04-11,463.7,Product A,176819,London,420,1,463.7,1,1,1,3,Low-Value,Lost
2,4608,2023-04-11,80.28,Product A,340062,New York,420,1,80.28,1,1,1,3,Low-Value,Lost
3,2559,2023-04-11,221.29,Product A,239145,London,420,1,221.29,1,1,1,3,Low-Value,Lost
4,9482,2023-04-11,739.56,Product A,194545,Paris,420,1,739.56,1,1,2,4,Low-Value,Can't Lose


In [30]:
# Number of rows in each customer segment, as a two-column frame for plotting.
RFM_Customer_Segments = (
    df['RFM Customer Segments']
    .value_counts()
    .rename_axis('customer Segment')
    .reset_index(name='Count')
)

pastel_colors = px.colors.qualitative.Pastel

# One bar per customer segment, each with its own pastel colour.
bar_options = dict(
    x='customer Segment',
    y='Count',
    color='customer Segment',
    color_discrete_sequence=pastel_colors,
    title='RFM customer Segment Distribution',
)
fig_segment_dist = px.bar(RFM_Customer_Segments, **bar_options)

# Axis titles; the legend is hidden because the x axis already names the segments.
layout_options = {
    'xaxis_title': 'RFM customer Segment',
    'yaxis_title': 'Count',
    'showlegend': False,
}
fig_segment_dist.update_layout(**layout_options)

fig_segment_dist.show()

In [31]:
# Row counts for every (value segment, customer segment) pair, largest first.
segment_product_counts = (
    df.groupby(['Value Segment', 'RFM Customer Segments'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

# Treemap with value segments as the outer level and customer segments nested inside.
treemap_options = dict(
    path=['Value Segment', 'RFM Customer Segments'],
    values='Count',
    color='Value Segment',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='RFM Customer Segments by Value',
)
fig_treemap_segment_product = px.treemap(segment_product_counts, **treemap_options)
fig_treemap_segment_product.show()

In [32]:
# Keep only the rows labelled as Champions.
is_champion = df['RFM Customer Segments'] == 'Champions'
champions_segment = df.loc[is_champion]

# One box per score column, added in Recency, Frequency, Monetary order.
fig = go.Figure()
for score_col, trace_name in (
    ('RecencyScore', 'Recency'),
    ('FrequencyScore', 'Frequency'),
    ('MonetaryScore', 'Monetary'),
):
    fig.add_trace(go.Box(y=champions_segment[score_col], name=trace_name))

fig.update_layout(
    title='Distribution of RFM Values within Champions Segment',
    yaxis_title='RFM Value',
    showlegend=True,
)

fig.show()

In [33]:
# Number of rows (transactions) assigned to each customer segment.
df['RFM Customer Segments'].value_counts()

RFM Customer Segments
Potential Loyalists    503
At Risk Customers      180
Can't Lose             173
Lost                    82
Champions               62
Name: count, dtype: int64

In [35]:
# Keep only the rows labelled "Can't Lose". The subset gets its own name so it
# is not confused with (or overwritten by) the separate At Risk subset.
cant_lose_segment = df[df['RFM Customer Segments'] == "Can't Lose"]

# One box per score column for this segment.
fig = go.Figure()
fig.add_trace(go.Box(y=cant_lose_segment['RecencyScore'], name='Recency'))
fig.add_trace(go.Box(y=cant_lose_segment['FrequencyScore'], name='Frequency'))
fig.add_trace(go.Box(y=cant_lose_segment['MonetaryScore'], name='Monetary'))

fig.update_layout(title='Distribution of RFM Values within  Cant Lose Customer  Segment',
                  yaxis_title='RFM Value',
                  showlegend=True)

fig.show()

In [36]:
# Keep only the rows labelled as At Risk Customers.
at_risk_segment = df.loc[df['RFM Customer Segments'].eq('At Risk Customers')]

# Build the three score box traces up front, then hand them to the figure.
box_traces = [
    go.Box(y=at_risk_segment['RecencyScore'], name='Recency'),
    go.Box(y=at_risk_segment['FrequencyScore'], name='Frequency'),
    go.Box(y=at_risk_segment['MonetaryScore'], name='Monetary'),
]
fig = go.Figure(data=box_traces)

fig.update_layout(
    title='Distribution of RFM Values within  At Risk Customer  Segment',
    yaxis_title='RFM Value',
    showlegend=True,
)

fig.show()

In [34]:
# Keep only the rows labelled as Potential Loyalists.
segment_mask = df['RFM Customer Segments'] == 'Potential Loyalists'
PotentialLoyalists_segment = df[segment_mask]

# Score column -> legend name, in the order the boxes are drawn.
box_names = {
    'RecencyScore': 'Recency',
    'FrequencyScore': 'Frequency',
    'MonetaryScore': 'Monetary',
}

fig = go.Figure()
for score_col, legend_name in box_names.items():
    fig.add_trace(go.Box(y=PotentialLoyalists_segment[score_col], name=legend_name))

fig.update_layout(
    title='Distribution of RFM Values within Potential Loyalists Segment',
    yaxis_title='RFM Value',
    showlegend=True,
)

fig.show()

In [37]:
# Pairwise correlation of the three scores among Champions rows.
score_columns = ['RecencyScore', 'FrequencyScore', 'MonetaryScore']
correlation_matrix = champions_segment[score_columns].corr()

# Draw the matrix as a heatmap, with the score names on both axes.
heatmap_trace = go.Heatmap(
    z=correlation_matrix.to_numpy(),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    colorbar={'title': 'Correlation'},
)
fig_heatmap = go.Figure(data=heatmap_trace)

fig_heatmap.update_layout(title='Correlation Matrix of RFM Values within Champions Segment')

fig_heatmap.show()

In [40]:
# Mean Recency, Frequency and Monetary score per customer segment.
segment_scores = df.groupby('RFM Customer Segments', as_index=False)[
    ['RecencyScore', 'FrequencyScore', 'MonetaryScore']
].mean()

# (score column, legend name, bar colour) for each bar series, in drawing order.
bar_series = [
    ('RecencyScore', 'Recency Score', 'rgb(158,202,225)'),
    ('FrequencyScore', 'Frequency Score', 'rgb(94,158,217)'),
    ('MonetaryScore', 'Monetary Score', 'rgb(32,102,148)'),
]

# Grouped bar chart: one cluster per segment, one bar per score.
fig = go.Figure()
for score_col, legend_name, bar_color in bar_series:
    fig.add_trace(
        go.Bar(
            x=segment_scores['RFM Customer Segments'],
            y=segment_scores[score_col],
            name=legend_name,
            marker_color=bar_color,
        )
    )

# Titles and side-by-side bar layout.
layout_options = {
    'title': 'Comparison of RFM Segments based on Recency, Frequency, and Monetary Scores',
    'xaxis_title': 'RFM Segments',
    'yaxis_title': 'Score',
    'barmode': 'group',
    'showlegend': True,
}
fig.update_layout(**layout_options)

fig.show()