In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

In [5]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None)

In [6]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
              'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [7]:
print(df.head())
print(df.info())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [9]:
df = df.apply(pd.to_numeric, errors='ignore')

In [11]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
item_freq = df.sum().sort_values(ascending=False)
plt.figure(figsize=(10,6))
item_freq.plot(kind='bar')
plt.xlabel('Items')
plt.ylabel('Fréquence')
plt.title('Fréquence des articles')
plt.show()

TypeError: '<' not supported between instances of 'numpy.ndarray' and 'str'

In [None]:
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

In [None]:
print(rules.head())

In [None]:
rules_filtered = rules[(rules['antecedent_len'] >= 2) & (rules['antecedent_len'] <= 5)]

In [None]:
print(rules_filtered.head(10))

In [None]:
rules_sorted = rules.sort_values(by='lift', ascending=False)
print(rules_sorted.head(5))

In [None]:
rules_income_small = rules[rules['consequents'].apply(lambda x: 'income=small' in str(x))]
rules_income_small_sorted = rules_income_small.sort_values(by='confidence', ascending=False)
print(rules_income_small_sorted.head())

In [None]:
rules_sub = rules[(rules['consequents'].apply(lambda x: 'marital-status=Never-married' in str(x))) & (rules['lift'] > 2)]
print(rules_sub)

In [None]:
rules_sub2 = rules[(rules['antecedents'].apply(lambda x: 'age=Young' in str(x))) & (rules['antecedents'].apply(lambda x: 'workclass=Private' in str(x)))]
print(rules_sub2)