<a href="https://colab.research.google.com/github/TrzeciakPiotr2300/Learning-Recommender_Systems/blob/main/Lab12_Apriori_algorithm_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install efficient_apriori


Collecting efficient_apriori
  Downloading efficient_apriori-2.0.6-py3-none-any.whl.metadata (6.7 kB)
Downloading efficient_apriori-2.0.6-py3-none-any.whl (14 kB)
Installing collected packages: efficient_apriori
Successfully installed efficient_apriori-2.0.6


In [3]:
import numpy as np
import pandas as pd
from efficient_apriori import apriori

In [4]:
# Load datasets
movies = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/movies.dat',
                     sep='::',
                     engine='python',
                     names=['movie_id', 'title', 'genres'],
                     encoding='latin-1')

ratings = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/ratings.dat',
                      sep='::',
                      engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
                      encoding='latin-1')

users = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/users.dat',
                    sep='::',
                    engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
                    encoding='latin-1')
print("Data loaded!")


Data loaded!


In [5]:
# Function to print basic statistics and preview of dataframes
def statistics(df, name="DataFrame"):
    print(f"Data '{name.upper()}':")
    print("\nShape:", df.shape)
    print("Number of numeric columns:", len(df.select_dtypes(include=['number']).columns),
          "      -->", df.select_dtypes(include=['number']).columns.values)
    print("Number of categorical columns:", len(df.select_dtypes(include=['object', 'bool', 'category']).columns),
          "  -->", df.select_dtypes(include=['object', 'bool', 'category']).columns.values, "\n\n")
    display(df.head(3))

# Show statistics and sample data for each dataframe
statistics(users, "users")
statistics(movies, "movies")
statistics(ratings, "ratings")


Data 'USERS':

Shape: (6040, 5)
Number of numeric columns: 3       --> ['UserID' 'Age' 'Occupation']
Number of categorical columns: 2   --> ['Gender' 'Zip-code'] 




Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


Data 'MOVIES':

Shape: (3883, 3)
Number of numeric columns: 1       --> ['movie_id']
Number of categorical columns: 2   --> ['title' 'genres'] 




Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


Data 'RATINGS':

Shape: (1000209, 4)
Number of numeric columns: 4       --> ['UserID' 'MovieID' 'Rating' 'Timestamp']
Number of categorical columns: 0   --> [] 




Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [6]:
# Remove duplicates and rename columns for consistency
users = users.drop_duplicates()
movies = movies.drop_duplicates()
ratings = ratings.drop_duplicates()

users.columns = ['UserID', 'Gender', 'Age', 'Ocupation', 'Zip-code']
ratings.columns = ['UserID', 'movie_id', 'Rating', 'Timestamp']

# Drop rows with missing essential IDs
users.dropna(axis=0, subset=['UserID'], inplace=True)
movies.dropna(axis=0, subset=['movie_id'], inplace=True)
ratings.dropna(axis=0, subset=['UserID', 'movie_id'], inplace=True)

# Reset index after cleaning
users.reset_index(drop=True, inplace=True)
movies.reset_index(drop=True, inplace=True)
ratings.reset_index(drop=True, inplace=True)

print("Users:", users.shape)
print("Movies:", movies.shape)
print("Ratings:", ratings.shape)


Users: (6040, 5)
Movies: (3883, 3)
Ratings: (1000209, 4)


In [7]:
# Function to convert ratings into boolean (hot encoding)
def hot_encode(x):
    if x <= 0:
        return False
    elif x >= 1:
        return True

# Merge ratings with movie titles, clean columns and sort by user
data = ratings.merge(movies, how='inner', on=['movie_id'])
data.drop(columns=['movie_id', 'Timestamp', 'genres'], axis=1, inplace=True)
data.sort_values(by='UserID', ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)

# Strip whitespace from movie titles
data['title'] = data['title'].str.strip()
data.head()


Unnamed: 0,UserID,Rating,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,1,4,Bambi (1942)
2,1,5,Awakenings (1990)
3,1,4,E.T. the Extra-Terrestrial (1982)
4,1,5,Pocahontas (1995)


In [8]:
# Group movie titles by user to create transaction lists
transactions = data.groupby('UserID')['title'].apply(list).tolist()


In [9]:
# Run the apriori algorithm with specified minimum support, confidence, and max length of itemsets
itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=0.6, max_length=4)

In [19]:
# Print top 5 association rules
rules_sorted_by_confidence_and_lift = sorted(rules, key=lambda rule: (rule.confidence, rule.lift), reverse=True)
print("Top 5 association rules:\n")
for rule in rules_sorted_by_confidence_and_lift[:5]:
    print(rule)

Top 5 association rules:

{Indiana Jones and the Last Crusade (1989), Raiders of the Lost Ark (1981), Star Wars: Episode VI - Return of the Jedi (1983)} -> {Star Wars: Episode V - The Empire Strikes Back (1980)} (conf: 0.962, supp: 0.203, lift: 1.942, conv: 13.119)
{Aliens (1986), Star Wars: Episode IV - A New Hope (1977), Star Wars: Episode VI - Return of the Jedi (1983)} -> {Star Wars: Episode V - The Empire Strikes Back (1980)} (conf: 0.961, supp: 0.207, lift: 1.941, conv: 12.911)
{Alien (1979), Star Wars: Episode V - The Empire Strikes Back (1980), Terminator 2: Judgment Day (1991)} -> {Star Wars: Episode IV - A New Hope (1977)} (conf: 0.958, supp: 0.206, lift: 1.935, conv: 12.125)
{Alien (1979), Matrix, The (1999), Star Wars: Episode V - The Empire Strikes Back (1980)} -> {Star Wars: Episode IV - A New Hope (1977)} (conf: 0.958, supp: 0.205, lift: 1.935, conv: 12.087)
{Jurassic Park (1993), Raiders of the Lost Ark (1981), Star Wars: Episode VI - Return of the Jedi (1983)} -> {Star

In [20]:
def print_top_frequent_itemsets(itemsets, length=1, top_n=5):
    """
    Wyświetla top-N najczęstszych itemsetów o zadanej długości.

    :param itemsets: słownik {długość: {zbiór: wsparcie}}
    :param length: długość itemsetu (np. 1 dla pojedynczych elementów)
    :param top_n: ile najczęstszych itemsetów wypisać
    """
    if length not in itemsets:
        print(f"Brak itemsetów o długości {length}.")
        return

    print(f"\nTop {top_n} frequent itemsets of length {length}:\n")
    top_items = sorted(itemsets[length].items(), key=lambda x: x[1], reverse=True)[:top_n]
    for items, support in top_items:
        print(f"{items} → support: {support:.3f}")


In [22]:
print_top_frequent_itemsets(itemsets, length=1, top_n=7)
print_top_frequent_itemsets(itemsets, length=2, top_n=5)
print_top_frequent_itemsets(itemsets, length=3, top_n=4)
print_top_frequent_itemsets(itemsets, length=4, top_n=3)



Top 7 frequent itemsets of length 1:

('American Beauty (1999)',) → support: 3428.000
('Star Wars: Episode IV - A New Hope (1977)',) → support: 2991.000
('Star Wars: Episode V - The Empire Strikes Back (1980)',) → support: 2990.000
('Star Wars: Episode VI - Return of the Jedi (1983)',) → support: 2883.000
('Jurassic Park (1993)',) → support: 2672.000
('Saving Private Ryan (1998)',) → support: 2653.000
('Terminator 2: Judgment Day (1991)',) → support: 2649.000

Top 5 frequent itemsets of length 2:

('Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)') → support: 2355.000
('Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)') → support: 2228.000
('Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode VI - Return of the Jedi (1983)') → support: 2113.000
('Raiders of the Lost Ark (1981)', 'Star Wars: Episode V - The Empire Strikes Back (1980)') → support: 1999.000
('Matrix, T