In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

file_path = r"C:\Users\Staphon Smith\Downloads\Megastore_Dataset_Task_3 3 (1).csv"

df = pd.read_csv(file_path)

In [2]:
df_cleaned = df.dropna()
df_cleaned = df_cleaned.drop_duplicates()

In [3]:
# Ordinal Encoding - Order Priority and Expedited Shipping
df['OrderPriority_Encoded'] = df['OrderPriority'].map({'Medium': 1, 'High': 2})
customer_satisfaction_mapping = {
    'Prefer to not respond': 0,
    'Dissatisfied': 1,
    'Very dissatisfied': 2,
    'Satisfied': 3,
    'Very Satisfied': 4
}
df['CustomerOrderSatisfaction_Encoded'] = df['CustomerOrderSatisfaction'].map(customer_satisfaction_mapping)


# One-hot Encoding - Segment and Payment Method
df_encoded = pd.get_dummies(df, columns=['Segment', 'PaymentMethod'])

# Add ordinal Encoded columns
df_encoded['OrderPriority_Encoded'] = df['OrderPriority_Encoded']
df_encoded['CustomerOrderSatisfaction_Encoded'] = df['CustomerOrderSatisfaction_Encoded']

In [4]:
# Encoded Variables visual print
print(df_encoded[['OrderPriority_Encoded', 'CustomerOrderSatisfaction_Encoded',
                  'Segment_Consumer', 'Segment_Corporate',
                  'PaymentMethod_Credit Card', 'PaymentMethod_PayPal']])

      OrderPriority_Encoded  CustomerOrderSatisfaction_Encoded  \
0                         2                                3.0   
1                         2                                3.0   
2                         2                                3.0   
3                         2                                3.0   
4                         2                                3.0   
...                     ...                                ...   
8229                      2                                1.0   
8230                      2                                1.0   
8231                      2                                4.0   
8232                      2                                4.0   
8233                      2                                4.0   

      Segment_Consumer  Segment_Corporate  PaymentMethod_Credit Card  \
0                False               True                       True   
1                False               True                      

In [5]:
# Transactionalize Data
basket = df.groupby(['OrderID', 'ProductName'])['Quantity'].sum().unstack().fillna(0)
basket = basket.gt(0).astype(bool)

# Transactional basket visual
print(basket)



ProductName   50S CHRISTMAS GIFT BAG LARGE   DOLLY GIRL BEAKER  \
OrderID                                                          
536370                               False               False   
536852                               False               False   
536974                               False               False   
537065                               False               False   
537463                               False               False   
...                                    ...                 ...   
581001                               False               False   
581171                               False               False   
581279                               False               False   
581316                               False               False   
581587                               False               False   

ProductName   I LOVE LONDON MINI BACKPACK   NINE DRAWER OFFICE TIDY  \
OrderID                                                               

In [6]:
import pandas as pd  # Make sure pandas is imported

# 1. Create DataFrame with Encoded Variables
df_cleaned_encoded = df_encoded[[
    'OrderID',
    'OrderPriority_Encoded',
    'CustomerOrderSatisfaction_Encoded',
    'Segment_Consumer',
    'Segment_Corporate',  # Assuming you need both, adjust as needed
    'PaymentMethod_Credit Card',
    'PaymentMethod_PayPal' # Assuming you need both, adjust as needed
]]

# 2. Merge with Transactionalized Data ("basket")
df_cleaned_final = pd.merge(
    df_cleaned_encoded,
    basket,
    left_on='OrderID',
    right_index=True,
    how='left'  # Or 'inner' depending on how you want to handle missing baskets
)

# 3. Display and Save
print("First few rows of cleaned and merged data:")
print(df_cleaned_final.head())

print("\nColumn information:")
print(df_cleaned_final.info())  # Very helpful for debugging

df_cleaned_final.to_csv("cleaned_data_for_apriori.csv", index=False)
print("\nCleaned data saved to cleaned_data_for_apriori.csv")

First few rows of cleaned and merged data:
   OrderID  OrderPriority_Encoded  CustomerOrderSatisfaction_Encoded  \
0   536370                      2                                3.0   
1   536370                      2                                3.0   
2   536370                      2                                3.0   
3   536370                      2                                3.0   
4   536370                      2                                3.0   

   Segment_Consumer  Segment_Corporate  PaymentMethod_Credit Card  \
0             False               True                       True   
1             False               True                       True   
2             False               True                       True   
3             False               True                       True   
4             False               True                       True   

   PaymentMethod_PayPal   50S CHRISTMAS GIFT BAG LARGE   DOLLY GIRL BEAKER  \
0                 False        

In [7]:
cleaned_file_path = r"C:\\Users\\Staphon Smith\\Downloads\\Megastore_Dataset_Task_3_cleaned.csv"  # Choose a filename
df.to_csv(cleaned_file_path, index=False)

In [8]:
#Start of Market Basket Analysis
from mlxtend.frequent_patterns import apriori, association_rules

# apply the Apriori algorithm
frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)

# Display the itemsets
print(frequent_itemsets)


       support                                           itemsets
0     0.020408                               ( DOLLY GIRL BEAKER)
1     0.011338                     ( I LOVE LONDON MINI BACKPACK)
2     0.013605                 ( SET 2 TEA TOWELS I LOVE LONDON )
3     0.036281                          ( SPACEBOY BABY GIFT SET)
4     0.027211                           (10 COLOUR SPACEBOY PEN)
...        ...                                                ...
8335  0.011338  (PACK OF 20 SKULL PAPER NAPKINS, SET6 RED SPOT...
8336  0.011338  (PACK OF 20 SKULL PAPER NAPKINS, SET6 RED SPOT...
8337  0.011338  (SET6 RED SPOTTY PAPER CUPS, SET6 RED SPOTTY P...
8338  0.011338  (PLASTERS IN TIN SPACEBOY, SET6 RED SPOTTY PAP...
8339  0.011338  (PACK OF 20 SKULL PAPER NAPKINS, SET6 RED SPOT...

[8340 rows x 2 columns]


In [9]:
# Generate association rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Display the first few rules
print(rules.head())



                         antecedents                        consequents  \
0               ( DOLLY GIRL BEAKER)  (CHARLOTTE BAG DOLLY GIRL DESIGN)   
1  (CHARLOTTE BAG DOLLY GIRL DESIGN)               ( DOLLY GIRL BEAKER)   
2        (DOLLY GIRL CHILDRENS BOWL)               ( DOLLY GIRL BEAKER)   
3               ( DOLLY GIRL BEAKER)        (DOLLY GIRL CHILDRENS BOWL)   
4               ( DOLLY GIRL BEAKER)         (DOLLY GIRL CHILDRENS CUP)   

   antecedent support  consequent support   support  confidence       lift  \
0            0.020408            0.058957  0.011338    0.555556   9.423077   
1            0.058957            0.020408  0.011338    0.192308   9.423077   
2            0.040816            0.020408  0.015873    0.388889  19.055556   
3            0.020408            0.040816  0.015873    0.777778  19.055556   
4            0.020408            0.036281  0.013605    0.666667  18.375000   

   representativity  leverage  conviction  zhangs_metric   jaccard  certainty  \

In [10]:
top_rules = rules.sort_values(by='lift', ascending=False).head(3)
top_rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
82681,"(PLASTERS IN TIN SPACEBOY, SET6 RED SPOTTY PAP...","(ALARM CLOCK BAKELIKE GREEN, SET6 RED SPOTTY P...",0.011338,0.011338,0.011338,1.0,88.2,1.0,0.011209,inf,1.0,1.0,1.0,1.0
82978,"(CARD DOLLY GIRL , CHILDRENS CUTLERY SPACEBOY ...","(ROUND SNACK BOXES SET OF4 WOODLAND , CHILDREN...",0.011338,0.011338,0.011338,1.0,88.2,1.0,0.011209,inf,1.0,1.0,1.0,1.0
53651,"(ALARM CLOCK BAKELIKE IVORY, PLASTERS IN TIN S...","(CHARLOTTE BAG DOLLY GIRL DESIGN, ALARM CLOCK ...",0.011338,0.011338,0.011338,1.0,88.2,1.0,0.011209,inf,1.0,1.0,1.0,1.0
