In [None]:
import pandas as pd
from itertools import combinations

def load_dataset(file_path):
    """Read a semicolon-delimited CSV file and drop every incomplete row.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The DataFrame with all rows containing a missing value removed.
        The strings ``'NaN'``, ``'NA'``, ``''`` and ``' '`` are treated as
        missing while parsing.
    """
    missing_markers = ['NaN', 'NA', '', ' ']
    data = pd.read_csv(
        file_path,
        delimiter=';',
        decimal='.',
        na_values=missing_markers,
    )
    # Discard rows with at least one missing cell; original index labels remain.
    data.dropna(inplace=True)

    print("Dataset loaded successfully.")
    print(f"Dataset shape: {data.shape}")
    print("Sample data:\n", data.head())
    return data

def preprocess_data(data):
    """Cast ``Set`` to string and keep only the columns from position 7 onward.

    The caller's DataFrame is not modified: the cast is applied to a new
    frame, so the input keeps its original ``Set`` dtype.

    Args:
        data: DataFrame that contains a ``Set`` column.

    Returns:
        A new DataFrame holding the columns at positional index 7 and beyond,
        with ``Set`` (if it is among them) converted to ``str``.

    Raises:
        KeyError: If ``data`` has no ``Set`` column.
    """
    # assign() returns a new frame and keeps 'Set' at its existing position,
    # so the positional slice below selects the same columns as before.
    data = data.assign(Set=data['Set'].astype(str)).iloc[:, 7:]
    print(f"Shape after preprocessing: {data.shape}")
    print(f"Remaining columns: {data.columns.tolist()}")
    print("Sample preprocessed data:\n", data.head())
    return data

def count_transactions_items(data):
    """Report the row and column counts of ``data``.

    Rows are reported as transactions and columns as items.

    Args:
        data: Object exposing a two-element (or longer) ``shape``.

    Returns:
        A ``(num_transactions, num_items)`` tuple taken from ``data.shape``.
    """
    dimensions = data.shape
    row_count = dimensions[0]
    column_count = dimensions[1]
    print(f"Number of Transactions: {row_count}")
    print(f"Number of Items: {column_count}")
    return row_count, column_count

def create_transactional_format(data):
    """Turn each row into a ``(Label, Category, Set)`` tuple.

    Rows in which any of the three fields is missing are left out.

    Args:
        data: DataFrame with ``Label``, ``Category`` and ``Set`` columns.

    Returns:
        A list of 3-tuples, one per retained row, in row order.
    """
    transactions = []
    for _, record in data.iterrows():
        # Skip the row as soon as one of the three fields is missing.
        if pd.isna(record['Label']) or pd.isna(record['Category']) or pd.isna(record['Set']):
            continue
        transactions.append((record['Label'], record['Category'], record['Set']))
    print(f"Sample Transactions: {transactions[:5]}")
    return transactions

def generate_k_itemsets(transactions, k):
    """Collect every distinct k-item combination found in the transactions.

    Args:
        transactions: Iterable of item sequences.
        k: Number of items per candidate.

    Returns:
        A set of tuples; each tuple holds ``k`` items in sorted order so the
        same combination drawn from different transactions collapses to one
        entry.
    """
    itemsets = set()
    for basket in transactions:
        for combo in combinations(basket, k):
            itemsets.add(tuple(sorted(combo)))
    print(f"Candidate {k}-itemsets: {itemsets}")
    return itemsets

def calculate_support(transactions, itemsets):
    """Compute the support of each candidate itemset.

    Support is the fraction of transactions that contain every item of the
    itemset.

    Args:
        transactions: Sized collection of item sequences.
        itemsets: Iterable of hashable itemsets (e.g. tuples of items).

    Returns:
        A dict mapping each itemset to its support as a float in ``[0, 1]``.
        When ``transactions`` is empty every itemset gets support ``0.0``
        instead of triggering a division by zero.
    """
    total_transactions = len(transactions)
    if total_transactions == 0:
        # No transaction can contain anything; avoid dividing by zero.
        return {itemset: 0.0 for itemset in itemsets}

    # Convert each transaction once rather than once per candidate itemset.
    transaction_sets = [frozenset(transaction) for transaction in transactions]

    support_counts = {}
    for itemset in itemsets:
        required = frozenset(itemset)
        matches = sum(1 for items in transaction_sets if required <= items)
        support_counts[itemset] = matches / total_transactions
    return support_counts

def generate_frequent_itemsets(transactions, k, min_support):
    """Return the k-itemsets whose support reaches ``min_support``.

    Candidates come from ``generate_k_itemsets`` and are scored with
    ``calculate_support``.

    Args:
        transactions: Collection of item sequences.
        k: Size of the itemsets to evaluate.
        min_support: Lowest support (inclusive) an itemset must have to be kept.

    Returns:
        A dict mapping each retained itemset to its support.
    """
    candidate_pool = generate_k_itemsets(transactions, k)
    measured = calculate_support(transactions, candidate_pool)

    frequent_itemsets = {}
    for candidate, score in measured.items():
        if score >= min_support:
            frequent_itemsets[candidate] = score

    print(f"Frequent {k}-itemsets: {frequent_itemsets}")
    return frequent_itemsets

def generate_association_rules(frequent_itemsets):
    """Enumerate antecedent => consequent rules for each frequent itemset.

    For every itemset, each non-empty proper sub-combination is used as the
    antecedent and the remaining items form the consequent.

    Args:
        frequent_itemsets: Iterable of itemsets (tuples of hashable items);
            iterating a dict keyed by itemset works as well.

    Returns:
        A list of ``(antecedent, consequent)`` tuple pairs. The consequent
        lists the leftover items in the order they appear in the itemset, so
        the result is the same on every run (building it from a bare set
        would make the order depend on set iteration order).
    """
    rules = []
    for itemset in frequent_itemsets:
        for i in range(1, len(itemset)):
            for antecedent in combinations(itemset, i):
                used = set(antecedent)
                # dict.fromkeys de-duplicates while preserving itemset order.
                consequent = tuple(
                    dict.fromkeys(item for item in itemset if item not in used)
                )
                rules.append((antecedent, consequent))
    print(f"Generated rules: {rules}")
    return rules

def calculate_confidence(rules, support_counts):
    """Compute the confidence of each rule from stored supports.

    Confidence is ``support(antecedent and consequent) / support(antecedent)``.
    The combined itemset is looked up under its sorted-tuple form; a missing
    entry counts as support 0.

    Args:
        rules: Iterable of ``(antecedent, consequent)`` pairs.
        support_counts: Dict mapping itemset tuples to their support.

    Returns:
        A dict keyed by ``(antecedent, consequent)``. Rules whose antecedent
        support is not greater than zero are omitted.
    """
    scores = {}
    for lhs, rhs in rules:
        lhs_support = support_counts.get(lhs, 0)
        joint_key = tuple(sorted(set(lhs).union(set(rhs))))
        joint_support = support_counts.get(joint_key, 0)
        if not lhs_support > 0:
            # Unknown or zero-support antecedent: confidence is undefined.
            continue
        scores[(lhs, rhs)] = joint_support / lhs_support
    return scores

def main():
    """Run the whole pipeline: load, preprocess, mine itemsets, print rules.

    Frequent itemsets are mined level by level (k = 1, 2, ...) until a level
    yields none; rules are then generated from every level and their
    confidence is printed.
    """
    file_path = r'D:\M2\DataMining\TP4\DatasetExos.csv'  # Update with your dataset path
    data = preprocess_data(load_dataset(file_path))

    # Report dataset dimensions, then build the transaction list.
    num_transactions, num_items = count_transactions_items(data)
    transactions = create_transactional_format(data)
    print(f"Total Transactions Created: {len(transactions)}")

    # Mining parameters
    min_support = 0.1

    # Level-wise extraction: stop at the first level with no frequent itemsets.
    frequent_itemsets = {}
    level = 1
    current_level = generate_frequent_itemsets(transactions, level, min_support)
    while current_level:
        frequent_itemsets[level] = current_level
        level += 1
        current_level = generate_frequent_itemsets(transactions, level, min_support)

    # Flatten the per-level supports into a single lookup table.
    support_counts = {}
    for k_itemsets in frequent_itemsets.values():
        for itemset, support in k_itemsets.items():
            support_counts[tuple(itemset)] = support

    # Collect the rules produced by every level.
    all_rules = []
    for k_itemsets in frequent_itemsets.values():
        all_rules.extend(generate_association_rules(k_itemsets))

    confidence_results = calculate_confidence(all_rules, support_counts)

    # Output results
    print("Frequent Itemsets:")
    for k, itemsets in frequent_itemsets.items():
        print(f"Frequent {k}-itemsets: {itemsets}")

    print("\nAssociation Rules:")
    for (antecedent, consequent), conf in confidence_results.items():
        print(f"Rule: {antecedent} => {consequent}, Confidence: {conf:.2f}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Dataset loaded successfully.
Dataset shape: (8977, 11)
Sample data:
                    ep (ms)                   Acc_x               Acc_y  \
0  2019-01-11 15:08:05.200                  0.0135               0.977   
1  2019-01-11 15:08:05.400  -0.0014999999999999996  0.9704999999999999   
2  2019-01-11 15:08:05.600   0.0013333333333333333  0.9716666666666667   
3  2019-01-11 15:08:05.800                  -0.024               0.957   
4  2019-01-11 15:08:06.000   -0.027999999999999997  0.9576666666666666   

                  Acc_z          Gyro_x               Gyro_y  \
0                -0.071  -2.094.366.723          257.720.316   
1  -0.07949999999999999         -16.826              -0.8904   
2  -0.06433333333333334     526.942.212  -0.2559999999999999   
3               -0.0735           8.061              -45.244   
4                -0.115           2.439              -15.486   

               Gyro_z ID  Label Category   Set  
0  0.9388000000000002  B  bench    heavy  30.0  
1  