<a href="https://colab.research.google.com/github/That1GuyWSSC/IA2024_16/blob/master/Apriori_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The Apriori algorithm is a machine learning algorithm for association-rule mining: it is used to gain insight into the structured relationships between the items that appear together in a dataset. Its most prominent practical application is recommending products based on the products already present in the user’s cart.

Download the dataset directly from Kaggle.

In [3]:
# Download dataset from Kaggle
# NOTE(review): kagglehub is imported but not used in this cell; the download
# below goes through the `kaggle` command-line tool instead.
import kagglehub

# Kaggle dataset identifier in "<owner>/<dataset-slug>" form
dataset_path = "mssmartypants/paris-housing-classification"
# Shell escape: {dataset_path} is interpolated from the Python variable above;
# the archive is downloaded into /content/ and unzipped there.
!kaggle datasets download {dataset_path} -p /content/ --unzip

Dataset URL: https://www.kaggle.com/datasets/mssmartypants/paris-housing-classification
License(s): copyright-authors
Downloading paris-housing-classification.zip to /content
100% 554k/554k [00:00<00:00, 853kB/s]
100% 554k/554k [00:00<00:00, 852kB/s]


Using the mlxtend (machine learning extensions) library

In [4]:
# Imports
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Read the file
# presumably the CSV extracted into /content/ by the download cell — verify the file name
path = "/content/ParisHousingClass.csv"
house_data = pd.read_csv(path)

# Prepare data
# No cleaning is applied here. If missing values ever need to be dropped:
#   house_data = house_data.dropna()

# Show the last five rows (tail, not head) as a quick check of the load
house_data.tail()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4,176425.9,Basic
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0,4448474.0,Basic
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9,8390030.5,Basic
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4,5905107.0,Basic
9999,1440,84,0,0,49,18412,6,10,1994,1,0,8485,2024,278,1,6,146708.4,Basic


In [5]:
# Report how many missing values each column contains
print(house_data.isna().sum())

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
category             0
dtype: int64


  and should_run_async(code)


In [6]:
# Keep only the rows whose 'made' year lies in 2010..2016 (both bounds inclusive)
filtered_houses = house_data[house_data['made'].between(2010, 2016)]

# Show the filtered frame
filtered_houses

  and should_run_async(code)


Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,category
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5,Luxury
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2,Basic
5,39223,36,0,1,17,39489,8,6,2012,0,1,2009,4552,757,0,1,3926647.2,Basic
8,51522,3,0,0,61,9047,8,3,2012,1,1,632,5792,807,1,5,5154055.2,Basic
11,96470,74,1,0,21,92029,4,2,2011,1,1,5414,1172,716,1,9,9652258.1,Basic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,76480,98,0,1,37,40978,5,6,2014,0,0,9737,7886,255,1,5,7651622.4,Basic
9988,2475,47,1,1,37,11367,9,2,2012,1,1,3061,7432,328,0,5,254501.2,Luxury
9990,5537,65,0,1,24,5808,8,6,2012,0,1,3974,5726,762,0,2,560359.2,Basic
9991,96173,12,1,0,36,36812,7,7,2015,1,1,837,3716,564,0,9,9623811.5,Basic


In [10]:
# Number of rows
print("Number of rows:", len(filtered_houses))

# Number of distinct values in the columns of interest
print("\nNumber of distinct values:")
print("Made:  ", filtered_houses['made'].nunique())
print("Pool: ", filtered_houses['hasPool'].nunique())
print("cityPartRange:    ", filtered_houses['cityPartRange'].nunique())
print("cityCode:    ", filtered_houses['cityCode'].nunique())

# Smallest and largest 'made' year in the filtered data.
# The labels previously said "InvoiceDate", which is not a column of this dataset.
print("\nFirst made year:", filtered_houses['made'].min())
print("Last made year: ", filtered_houses['made'].max())

# Access the list of column names
colnames = filtered_houses.columns
colnames


Number of rows: 2193

Number of distinct values:
Made:   7
Pool:  2
cityPartRange:     10
cityCode:     2171

First InvoiceDate: 2010
Last InvoiceDate:  2016


  and should_run_async(code)


Index(['squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors',
       'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt',
       'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom',
       'hasGuestRoom', 'price', 'category'],
      dtype='object')

In [45]:

# Sum the hasPool column for every (made, category) pair, keeping the keys as columns
grouped = filtered_houses.groupby(['made', 'category'], as_index=False)['hasPool'].sum()
grouped.head()


  and should_run_async(code)


Unnamed: 0,made,category,hasPool
0,2010,Basic,105
1,2010,Luxury,37
2,2011,Basic,118
3,2011,Luxury,36
4,2012,Basic,117


In [46]:
# Build the pivot table fed to Apriori: one row per 'made' value, one column per
# 'category', each cell True when the summed hasPool for that pair is above zero
# (missing pairs are filled with 0 and therefore become False).
# NOTE(review): in the recorded run every displayed cell is True, so the rules
# mined from this table are trivial (support, confidence and lift all 1.0) —
# confirm that years-as-transactions / categories-as-items is the intended encoding.
pivot = grouped.pivot_table(
    index='made',
    columns='category',
    values='hasPool',
    aggfunc='sum',
    fill_value=0,
).gt(0)

# Show the last 5 rows of the pivot table
print("\nTabela pivot para Apriori:")
pivot.tail()



Tabela pivot para Apriori:


  and should_run_async(code)


category,Basic,Luxury
made,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,True,True
2013,True,True
2014,True,True
2015,True,True
2016,True,True


In [48]:
# Mine the frequent itemsets from the boolean pivot table
min_support = 0.001
freq_itemsets = apriori(pivot, min_support=min_support, use_colnames=True)

# Report how many frequent itemsets were found
num_itemsets = len(freq_itemsets)
print(f'Number of itemsets: {num_itemsets}')

# Derive the association rules.
# mlxtend documents the `num_itemsets` argument of association_rules as the number
# of transactions (rows) in the original input data, not the number of frequent
# itemsets, so the row count of the pivot table is passed here.
num_transactions = len(pivot)
rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=0.001,
    num_itemsets=num_transactions,
)
rules.head()

Number of itemsets: 3


  and should_run_async(code)
  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Luxury),(Basic),1.0,1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0,1.0,0.0,1.0
1,(Basic),(Luxury),1.0,1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0,1.0,0.0,1.0


In [49]:
# Show the ten rules with the highest confidence, highest first
rules.sort_values(by='confidence', ascending=False).iloc[:10]

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Luxury),(Basic),1.0,1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0,1.0,0.0,1.0
1,(Basic),(Luxury),1.0,1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0,1.0,0.0,1.0


In [50]:
# Collect the distinct (made, hasPool) combinations present in the filtered houses
distinct_items = filtered_houses.loc[:, ['made', 'hasPool']].drop_duplicates()

# Preview the first 10 combinations
print("\nLista de itens distintos:")
distinct_items.head(10)


Lista de itens distintos:


  and should_run_async(code)


Unnamed: 0,made,hasPool
1,2015,1
3,2012,0
5,2012,1
11,2011,0
16,2011,1
17,2013,1
18,2016,1
33,2014,1
44,2016,0
61,2013,0


In [52]:
# Look up the 'hasPool' value recorded for a given 'made' value
def get_made_hasPool(made, items=None):
    """Return the first 'hasPool' value found for ``made``.

    Parameters
    ----------
    made : value compared against the 'made' column.
    items : DataFrame with 'made' and 'hasPool' columns, optional.
        Defaults to the notebook-level ``distinct_items`` frame, which keeps
        the original zero-extra-argument call working.

    Returns
    -------
    The 'hasPool' value of the first row whose 'made' equals ``made``, or the
    string "Not found" when no row matches.

    Notes
    -----
    When several rows share the same 'made' value, only the first one in row
    order is used, so the result depends on the ordering of ``items``.
    """
    if items is None:
        # Fall back to the frame built earlier in the notebook
        items = distinct_items

    matches = items.loc[items['made'] == made, 'hasPool']
    # Explicit emptiness check instead of catching IndexError from .iloc[0]
    if matches.empty:
        return "Not found"
    return matches.iloc[0]

# Build a table holding one (made, hasPool) row per distinct 'made' value
def generate_made_hasPool_data(items=None):
    """Return a DataFrame with one row per distinct 'made' value.

    Parameters
    ----------
    items : DataFrame with 'made' and 'hasPool' columns, optional.
        Defaults to the notebook-level ``distinct_items`` frame, which keeps
        the original zero-argument call working.

    Returns
    -------
    DataFrame with columns 'made' and 'hasPool' and a fresh 0..n-1 index.
    'made' values appear in order of first occurrence, each paired with the
    'hasPool' value of its first row. An empty input yields an empty frame
    that still carries both columns.

    Notes
    -----
    When a 'made' value occurs with more than one 'hasPool' value, only the
    first one in row order is kept, so the result depends on the ordering of
    ``items``.
    """
    if items is None:
        # Fall back to the frame built earlier in the notebook
        items = distinct_items

    # One pass: keep the first row seen for each 'made' instead of
    # re-filtering the whole frame once per distinct value.
    return (
        items.loc[:, ['made', 'hasPool']]
        .drop_duplicates(subset='made', keep='first')
        .reset_index(drop=True)
    )

# Apply the function and print the generated DataFrame
made_hasPool_df = generate_made_hasPool_data()
print(made_hasPool_df)


   made  hasPool
0  2015        1
1  2012        0
2  2011        0
3  2013        1
4  2016        1
5  2014        1
6  2010        0


  and should_run_async(code)
