# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [1]:
import pandas as pd
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt



In [7]:
# Load the dataset
try:
    df = pd.read_csv('online_retail.csv', encoding='latin1')
except FileNotFoundError:
    print("Error: online_retail.csv not found.")
    df = None

if df is not None:
    # Drop rows with missing CustomerID
    df.dropna(subset=['CustomerID'], inplace=True)

    # Filter out canceled orders, which have an 'InvoiceNo' starting with 'C'
    df = df[~df['InvoiceNo'].astype(str).str.contains('C', na=False)]

    # Convert 'InvoiceDate' to datetime format
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

    # Create a 'TotalPrice' column
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
    
    print("Data cleaning complete.")

Data cleaning complete.


In [9]:
# ==================================================================================================
# 2. RFM Analysis and Customer Segmentation
# ==================================================================================================

print("Step 2: Performing RFM Analysis...")

# Calculate a reference date as one day after the last invoice date
reference_date = df['InvoiceDate'].max() + timedelta(days=1)

# Group by CustomerID and calculate RFM values
rfm_df = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda x: (reference_date - x.max()).days),
    Frequency=('InvoiceNo', lambda x: x.nunique()),
    Monetary=('TotalPrice', 'sum')
).reset_index()

# Assign scores (1-4) to RFM values using quartiles
rfm_df['R_score'] = pd.qcut(rfm_df['Recency'], q=4, labels=[4, 3, 2, 1])
rfm_df['F_score'] = pd.qcut(rfm_df['Frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4])
rfm_df['M_score'] = pd.qcut(rfm_df['Monetary'], q=4, labels=[1, 2, 3, 4])

# Concatenate the RFM scores to create an RFM_Score
rfm_df['RFM_Score'] = rfm_df['R_score'].astype(str) + rfm_df['F_score'].astype(str) + rfm_df['M_score'].astype(str)

# Define a function to map RFM_Score to meaningful customer segments.
def rfm_segment(score):
    if score in ['444', '443', '434', '344', '343']: 
        return 'Champions'
    elif score in ['421', '411', '321', '311', '221', '211', '121', '111']: 
        return 'At Risk'
    elif score in ['441', '431', '341', '331']: 
        return 'Loyal Customers'
    elif score in ['244', '243', '234', '233', '144', '143']: 
        return 'Promising'
    elif score in ['432', '332', '232', '222', '132', '122']: 
        return 'Potential Loyalist'
    elif score in ['414', '314', '413', '313']: 
        return 'New Customers'
    else: 
        return 'Others'

# Apply the segmentation function
rfm_df['Segment'] = rfm_df['RFM_Score'].apply(rfm_segment)

print("RFM analysis and segmentation complete. Displaying the count of customers in each segment:")
print(rfm_df['Segment'].value_counts())
print("-" * 50)

Step 2: Performing RFM Analysis...
RFM analysis and segmentation complete. Displaying the count of customers in each segment:
Segment
Others                1399
At Risk                999
Champions              961
Potential Loyalist     542
Promising              383
Loyal Customers         37
New Customers           18
Name: count, dtype: int64
--------------------------------------------------


In [12]:
# ================================================================================================== 
# 3. Unsupervised Machine Learning – K-Means Clustering
# ==================================================================================================

print("\nPerforming K-Means Clustering...")
    
# Prepare data for clustering by standardizing the RFM values
X = rfm_df[['Recency', 'Frequency', 'Monetary']].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Use a fixed number of clusters (k=4) for demonstration
k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
rfm_df['Cluster'] = kmeans.fit_predict(X_scaled)
    
print("K-Means clustering complete.")



Performing K-Means Clustering...
K-Means clustering complete.


In [13]:
# ==================================================================================================
# 4. Collaborative Filtering – Recommendation System
# ==================================================================================================

# Load the dataset from the uploaded CSV file
try:
    df = pd.read_csv('online_retail.csv', encoding='latin1')

    # Drop rows with missing 'CustomerID' and filter out cancelled orders.
    df.dropna(subset=['CustomerID'], inplace=True)
    df = df[~df['InvoiceNo'].astype(str).str.contains('C', na=False)]

    # Convert 'InvoiceDate' to datetime and calculate 'TotalPrice'.
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
    
    print("\nBuilding Collaborative Filtering Recommendation System...")

    # Create a user-item matrix where rows are customers and columns are products.
    # The values represent the quantity of each item purchased.
    user_item_matrix = df.pivot_table(
        index='CustomerID', columns='StockCode', values='Quantity', aggfunc='sum'
    ).fillna(0)

    # Calculate the cosine similarity between all users.
    # This metric measures the similarity in their purchase patterns.
    user_similarity = cosine_similarity(user_item_matrix)
    user_similarity_df = pd.DataFrame(
        user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index
    )

    # Define a function to get product recommendations for a given customer.
    def get_user_recommendations(customer_id, num_recommendations=5):
        if customer_id not in user_similarity_df.index:
            print(f"Customer ID {customer_id} not found in the dataset.")
            return

        # Find the most similar users based on their purchase history.
        user_scores = user_similarity_df[customer_id]
        most_similar_users = user_scores.sort_values(ascending=False).drop(customer_id)

        # Get the items already purchased by the target customer.
        customer_purchased_items = user_item_matrix.loc[customer_id][user_item_matrix.loc[customer_id] > 0]
        
        recommended_items = pd.Series(dtype='float64')

        # Iterate through similar users to find new items to recommend.
        for similar_user_id in most_similar_users.index:
            similar_user_items = user_item_matrix.loc[similar_user_id][user_item_matrix.loc[similar_user_id] > 0]
            new_items_to_recommend = similar_user_items.index.difference(customer_purchased_items.index)

            if not new_items_to_recommend.empty:
                recommended_items = pd.concat([recommended_items, pd.Series(new_items_to_recommend)])
            
            # Stop once enough recommendations have been gathered.
            if len(recommended_items) >= num_recommendations:
                break

        # Get product descriptions for the recommended items.
        item_descriptions = df[['StockCode', 'Description']].drop_duplicates()
        recommended_items_with_desc = item_descriptions[
            item_descriptions['StockCode'].isin(recommended_items.unique()[:num_recommendations])
        ]
        
        if not recommended_items_with_desc.empty:
            print(f"\nRecommendations for Customer {customer_id}:")
            for index, row in recommended_items_with_desc.iterrows():
                print(f" - {row['Description']} (StockCode: {row['StockCode']})")
        else:
            print(f"\nCould not find any new recommendations for Customer {customer_id}.")

    # Example usage with a sample customer ID
    example_customer_id = 12347.0
    print(f"\nGetting recommendations for example customer {example_customer_id}...")
    get_user_recommendations(example_customer_id)

except FileNotFoundError:
    print("Error: online_retail.csv not found. Please ensure the file is in the correct directory.")


Building Collaborative Filtering Recommendation System...

Getting recommendations for example customer 12347.0...

Recommendations for Customer 12347.0:
 - ENAMEL FLOWER JUG CREAM (StockCode: 22427)
 - GROW YOUR OWN BASIL IN ENAMEL MUG (StockCode: 22441)
 - FELTCRAFT 6 FLOWER FRIENDS (StockCode: 22149)
 - ANTIQUE SILVER TEA GLASS ETCHED (StockCode: 84946)
 - SET/3 OCEAN SCENT CANDLE JEWEL BOX (StockCode: 72807B)
 - ANTIQUE SILVER T-LIGHT GLASS (StockCode: 84946)
