In [None]:
# This assignment focuses on Association Rule Mining, specifically applying the Apriori algorithm for market basket analysis using the Online Retail dataset.
# Below is a step-by-step guide to achieve the objective of discovering interesting relationships between products purchased together and interpreting the results.
# Data Preprocessing
# Step 1: Import Libraries
# We will use Pandas, NumPy, and Mlxtend for the analysis.
# Matplotlib is only needed if you want to visualize the resulting rules.

In [2]:
import pandas as pd

# Location of the transactions CSV file; replace with the path on your machine.
file_path = r'D:\HI448116_Santosh_Karpe\FY25\DOCS\III\Ass\ASA - SK\Online retail.csv'

# Load the CSV file into a pandas DataFrame.
# read_csv is called with its defaults, so the first line of the file is used as
# the column header.
# NOTE(review): if the file has no header line, the first transaction becomes the
# column name instead of row 0 - check the preview printed below.
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())

  shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0                             burgers,meatballs,eggs                                                                                                                                                                             
1                                            chutney                                                                                                                                                                             
2                                     turkey,avocado                                                                                                                                                                             
3  mineral water,milk,energy bar,whole wheat rice...                                            

In [None]:
# Filtering Data for Association Rule Mining: For association rule mining, you'll want to focus on the transactions and products.
# A typical preprocessing step is to create a binary matrix where each row represents a transaction, and each column represents whether a product was purchased in that transaction.


In [9]:
import pandas as pd

# Read the CSV file
csv_path = r'D:\HI448116_Santosh_Karpe\FY25\DOCS\III\Ass\ASA - SK\Online retail.csv'  # Replace with your actual CSV path
df_csv = pd.read_csv(csv_path)

# The cleaning steps below address the baskets through a 'Transactions' column.
# If the file was saved without a header row, read_csv has consumed the first
# basket as the column name; re-read it as headerless data so that no basket is
# lost and the expected column name exists.
if 'Transactions' not in df_csv.columns:
    if df_csv.shape[1] != 1:
        # More than one column and none called 'Transactions': the layout is
        # not what this notebook expects, so fail with a clear message.
        raise KeyError(
            "Expected a single 'Transactions' column, found: " + repr(list(df_csv.columns))
        )
    df_csv = pd.read_csv(csv_path, header=None, names=['Transactions'])

# Step 1: Inspect the raw data to check its structure and for any obvious issues
print("Raw Data:")
print(df_csv.head())

# Step 2: Clean the data
def _split_basket(raw):
    """Split one comma-separated basket string into a list of item names.

    Every item is stripped of surrounding whitespace and lower-cased so the same
    product is always spelled the same way. Pieces that are empty after
    stripping (produced by "a,,b", a trailing comma, or a blank row) are
    discarded so they cannot turn into a bogus '' item.
    """
    pieces = (piece.strip().lower() for piece in raw.split(','))
    return [piece for piece in pieces if piece]


# a. Remove any rows with missing values in the 'Transactions' column.
#    .copy() yields an independent frame, so the column assignment below does not
#    trigger pandas' SettingWithCopyWarning.
df_csv = df_csv.dropna(subset=['Transactions']).copy()

# b. Split each transaction on commas, trimming and lower-casing every item
#    (this assumes the items in the 'Transactions' column are comma-separated).
df_csv['Transactions'] = df_csv['Transactions'].map(_split_basket)

# c. Remove transactions that contain no items at all, then renumber the rows
#    0..n-1 so row positions stay contiguous after the removals.
df_csv = df_csv[df_csv['Transactions'].map(len) > 0].reset_index(drop=True)

# Step 3: Inspect the cleaned data
print("\nCleaned Data:")
print(df_csv.head())

# Step 4: One-hot encode the cleaned transactions with TransactionEncoder
from mlxtend.preprocessing import TransactionEncoder

# The encoder first learns the set of distinct items, then maps every basket to
# one row of an array with one column per learned item.
baskets = df_csv['Transactions']
te = TransactionEncoder()
te.fit(baskets)
transformed_data = te.transform(baskets)

# Step 5: Wrap the encoded array in a DataFrame labelled with the item names,
# which is the input format used for the apriori analysis
df = pd.DataFrame(data=transformed_data, columns=te.columns_)

# Preview the encoded frame as a sanity check
print("\nTransformed Data (Boolean DataFrame for Apriori):")
print(df.head())

# Now you can proceed with applying the Apriori algorithm as usual
from mlxtend.frequent_patterns import apriori, association_rules

# Tunable thresholds, named so they are easy to find and adjust.
# NOTE(review): a support of 0.5 requires an itemset to occur in half of all
# baskets; if the cell reports no frequent itemsets, try a much lower value.
MIN_SUPPORT = 0.5
MIN_CONFIDENCE = 0.7

# Columns of the rules table that are shown in the printed report
RULE_COLUMNS = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=MIN_SUPPORT, use_colnames=True)

# Generate association rules (if frequent itemsets are found)
if not frequent_itemsets.empty:
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)
    print("\nAssociation Rules:")
    print(rules[RULE_COLUMNS])
else:
    # Bind `rules` in this branch too, so that code using it later sees an empty
    # table instead of failing with a NameError.
    rules = pd.DataFrame(columns=RULE_COLUMNS)
    print("\nNo frequent itemsets found with the given support threshold.")


Raw Data:
                                        Transactions
0  shrimp,almonds,avocado,vegetables mix,green gr...
1                             burgers,meatballs,eggs
2                                            chutney
3                                     turkey,avocado
4  mineral water,milk,energy bar,whole wheat rice...

Cleaned Data:
                                        Transactions
0  [shrimp, almonds, avocado, vegetables mix, gre...
1                         [burgers, meatballs, eggs]
2                                          [chutney]
3                                  [turkey, avocado]
4  [mineral water, milk, energy bar, whole wheat ...

Transformed Data (Boolean DataFrame for Apriori):
   almonds  antioxydant juice  asparagus  avocado  babies food  bacon  \
0     True               True      False     True        False  False   
1    False              False      False    False        False  False   
2    False              False      False    False        False  False

In [None]:
#