In [35]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): with no category/module argument this also hides pandas and
# mlxtend deprecation/copy warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

In [36]:
# Load the raw Megastore dataset from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='Megastore_Dataset_Task_3 3.csv')

In [37]:
# Select relevant columns for analysis.
# Take an explicit copy: later cells add encoded columns to `selected_data`, and
# writing into a slice of `df` triggers SettingWithCopyWarning and is not
# guaranteed to behave the same way across pandas versions.
selected_columns = ['OrderPriority', 'CustomerOrderSatisfaction', 'ProductName', 'Region']
selected_data = df[selected_columns].copy()

In [38]:
# Map OrderPriority onto an ordered numeric scale; codes follow the position
# in the category list (Low -> 0, Medium -> 1, High -> 2).
priority_scale = [['Low', 'Medium', 'High']]
ordinal_encoder = OrdinalEncoder(categories=priority_scale)
priority_codes = ordinal_encoder.fit_transform(selected_data[['OrderPriority']])
selected_data.loc[:, 'OrderPriority_Encoded'] = priority_codes

In [39]:
# Ordinal encoding for CustomerOrderSatisfaction.
# OrdinalEncoder assigns codes by position in the category list, so the levels
# must run from worst to best: 'Very Dissatisfied' has to come before
# 'Dissatisfied'. 'Prefer not to answer' stays in the middle as a neutral value.
satisfaction_levels = [
    'Very Dissatisfied',
    'Dissatisfied',
    'Prefer not to answer',
    'Satisfied',
    'Very Satisfied',
]
ordinal_encoder = OrdinalEncoder(categories=[satisfaction_levels])
selected_data.loc[:, 'CustomerOrderSatisfaction_Encoded'] = ordinal_encoder.fit_transform(selected_data[['CustomerOrderSatisfaction']])

In [40]:
# Encode each distinct Region value as an integer label
label_encoder = LabelEncoder()
region_codes = label_encoder.fit_transform(selected_data['Region'])
selected_data.loc[:, 'Region_Encoded'] = region_codes

In [41]:
# Expand ProductName into one indicator column per product ('Product_<name>')
one_hot_encoded_products = pd.get_dummies(
    data=selected_data['ProductName'],
    prefix='Product',
)

In [42]:
# Place the product indicator columns to the right of the selected/encoded columns
frames_to_join = [selected_data, one_hot_encoded_products]
encoded_data = pd.concat(frames_to_join, axis='columns')

In [43]:
# Persist the encoded dataset as CSV, leaving out the DataFrame index
encoded_data.to_csv(path_or_buf='Cleaned_Megastore_Data.csv', index=False)

In [44]:
# Build one transaction per order: the list of product names sharing an OrderID
products_by_order = df.groupby('OrderID')['ProductName']
transaction_data = products_by_order.apply(list)

In [45]:
# Convert transactions into a matrix with one row per OrderID and one column
# per product, counting how often each product occurs in the order.
# Series.explode() yields one row per (order, product) and keeps OrderID as the
# index; it replaces apply(pd.Series).stack(), which builds a Series per order
# and is much slower on large inputs. dropna() mirrors stack(), which discarded
# missing product names.
order_items = transaction_data.explode().dropna()
transactional_df = pd.get_dummies(order_items).groupby(level=0).sum()

In [46]:
# Ensure binary encoding in the transactional data.
# A vectorized comparison replaces DataFrame.applymap (deprecated since
# pandas 2.1, and one Python call per cell). It yields boolean columns, the
# dtype mlxtend's apriori recommends for its input; re-running the cell is a no-op.
transactional_df = transactional_df > 0

In [47]:
# Mine frequent itemsets: keep product combinations whose support across the
# rows of transactional_df is at least 0.01, labelled by product name
frequent_itemsets = apriori(
    df=transactional_df,
    min_support=0.01,
    use_colnames=True,
)

In [48]:
# Generate association rules from the frequent itemsets, keeping rules whose
# lift is at least 1.0.
# mlxtend documents num_itemsets as the number of transactions in the original
# input data, i.e. the rows of transactional_df — not the number of frequent
# itemsets found by apriori.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
    num_itemsets=len(transactional_df),
)

In [49]:
# Preview the first few rules together with their key metrics
preview_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_preview = rules[preview_columns].head()
print("Association Rules:\n", rules_preview)

Association Rules:
                          antecedents                        consequents  \
0               ( DOLLY GIRL BEAKER)  (CHARLOTTE BAG DOLLY GIRL DESIGN)   
1  (CHARLOTTE BAG DOLLY GIRL DESIGN)               ( DOLLY GIRL BEAKER)   
2               ( DOLLY GIRL BEAKER)        (DOLLY GIRL CHILDRENS BOWL)   
3        (DOLLY GIRL CHILDRENS BOWL)               ( DOLLY GIRL BEAKER)   
4               ( DOLLY GIRL BEAKER)         (DOLLY GIRL CHILDRENS CUP)   

    support  confidence       lift  
0  0.011338    0.555556   9.423077  
1  0.011338    0.192308   9.423077  
2  0.015873    0.777778  19.055556  
3  0.015873    0.388889  19.055556  
4  0.013605    0.666667  18.375000  


In [50]:
# Rank the rules by descending lift and keep the first three
rules_by_lift = rules.sort_values(by='lift', ascending=False)
top_rules = rules_by_lift.head(3)

In [51]:
# Show the three highest-lift rules with their key metrics
top_rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_rules_view = top_rules[top_rule_columns]
print("Top 3 Rules:\n", top_rules_view)

Top 3 Rules:
                                              antecedents  \
75153  (CHILDRENS CUTLERY DOLLY GIRL , ALARM CLOCK BA...   
82089  (ALARM CLOCK BAKELIKE RED , SET6 RED SPOTTY PA...   
84835  (ALARM CLOCK BAKELIKE RED , SET6 RED SPOTTY PA...   

                                             consequents   support  \
75153  (ALARM CLOCK BAKELIKE RED , ROUND SNACK BOXES ...  0.011338   
82089  (ALARM CLOCK BAKELIKE PINK, ROUND SNACK BOXES ...  0.011338   
84835  (ROUND SNACK BOXES SET OF4 WOODLAND , ALARM CL...  0.011338   

       confidence  lift  
75153         1.0  88.2  
82089         1.0  88.2  
84835         1.0  88.2  
