<a href="https://colab.research.google.com/github/TrzeciakPiotr2300/Learning-Recommender_Systems/blob/main/Apriori_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install efficient_apriori




In [3]:
import numpy as np
import pandas as pd
from efficient_apriori import apriori

In [4]:
# Load datasets
movies = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/movies.dat',
                     sep='::',
                     engine='python',
                     names=['movie_id', 'title', 'genres'],
                     encoding='latin-1')

ratings = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/ratings.dat',
                      sep='::',
                      engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
                      encoding='latin-1')

users = pd.read_csv('https://students.mimuw.edu.pl/~pt430187/users.dat',
                    sep='::',
                    engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
                    encoding='latin-1')
print("Data loaded!")


Data loaded!


In [5]:
# Function to print basic statistics and preview of dataframes
def statistics(df, name="DataFrame"):
    print(f"Data '{name.upper()}':")
    print("\nShape:", df.shape)
    print("Number of numeric columns:", len(df.select_dtypes(include=['number']).columns),
          "      -->", df.select_dtypes(include=['number']).columns.values)
    print("Number of categorical columns:", len(df.select_dtypes(include=['object', 'bool', 'category']).columns),
          "  -->", df.select_dtypes(include=['object', 'bool', 'category']).columns.values, "\n\n")
    display(df.head(3))

# Show statistics and sample data for each dataframe
statistics(users, "users")
statistics(movies, "movies")
statistics(ratings, "ratings")


Data 'USERS':

Shape: (6040, 5)
Number of numeric columns: 3       --> ['UserID' 'Age' 'Occupation']
Number of categorical columns: 2   --> ['Gender' 'Zip-code'] 




Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


Data 'MOVIES':

Shape: (3883, 3)
Number of numeric columns: 1       --> ['movie_id']
Number of categorical columns: 2   --> ['title' 'genres'] 




Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


Data 'RATINGS':

Shape: (1000209, 4)
Number of numeric columns: 4       --> ['UserID' 'MovieID' 'Rating' 'Timestamp']
Number of categorical columns: 0   --> [] 




Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [6]:
# Remove duplicates and rename columns for consistency
users = users.drop_duplicates()
movies = movies.drop_duplicates()
ratings = ratings.drop_duplicates()

users.columns = ['UserID', 'Gender', 'Age', 'Ocupation', 'Zip-code']
ratings.columns = ['UserID', 'movie_id', 'Rating', 'Timestamp']

# Drop rows with missing essential IDs
users.dropna(axis=0, subset=['UserID'], inplace=True)
movies.dropna(axis=0, subset=['movie_id'], inplace=True)
ratings.dropna(axis=0, subset=['UserID', 'movie_id'], inplace=True)

# Reset index after cleaning
users.reset_index(drop=True, inplace=True)
movies.reset_index(drop=True, inplace=True)
ratings.reset_index(drop=True, inplace=True)

print("Users:", users.shape)
print("Movies:", movies.shape)
print("Ratings:", ratings.shape)


Users: (6040, 5)
Movies: (3883, 3)
Ratings: (1000209, 4)


In [7]:
# Function to convert ratings into boolean (hot encoding)
def hot_encode(x):
    if x <= 0:
        return False
    elif x >= 1:
        return True

# Merge ratings with movie titles, clean columns and sort by user
data = ratings.merge(movies, how='inner', on=['movie_id'])
data.drop(columns=['movie_id', 'Timestamp', 'genres'], axis=1, inplace=True)
data.sort_values(by='UserID', ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)

# Strip whitespace from movie titles
data['title'] = data['title'].str.strip()
data.head()


Unnamed: 0,UserID,Rating,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,1,4,Bambi (1942)
2,1,5,Awakenings (1990)
3,1,4,E.T. the Extra-Terrestrial (1982)
4,1,5,Pocahontas (1995)


In [8]:
# Group movie titles by user to create transaction lists
transactions = data.groupby('UserID')['title'].apply(list).tolist()


In [17]:
# Run the apriori algorithm with specified minimum support, confidence, and max length of itemsets
itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=0.6, max_length=4)

In [23]:
# Print top 5 association rules
print("Top 5 association rules:\n")
for rule in rules[:5]:
    print(rule)

Top 5 association rules:

{2001: A Space Odyssey (1968)} -> {Star Wars: Episode IV - A New Hope (1977)} (conf: 0.805, supp: 0.229, lift: 1.625, conv: 2.586)
{2001: A Space Odyssey (1968)} -> {Star Wars: Episode V - The Empire Strikes Back (1980)} (conf: 0.795, supp: 0.226, lift: 1.606, conv: 2.462)
{2001: A Space Odyssey (1968)} -> {Star Wars: Episode VI - Return of the Jedi (1983)} (conf: 0.714, supp: 0.203, lift: 1.496, conv: 1.827)
{Abyss, The (1989)} -> {Star Wars: Episode IV - A New Hope (1977)} (conf: 0.740, supp: 0.210, lift: 1.494, conv: 1.941)
{Abyss, The (1989)} -> {Star Wars: Episode V - The Empire Strikes Back (1980)} (conf: 0.767, supp: 0.218, lift: 1.550, conv: 2.170)


In [21]:
print("Top 5 frequent itemsets of length 1:\n")
for items, support in list(itemsets[1].items())[:5]:
    print(f"{items} → support: {support:.3f}")

print("\n" + "="*80 + "\n" + "="*80 + "\n")

print("Top 5 frequent itemsets of length 2:\n")
for items, support in list(itemsets[2].items())[:5]:
    print(f"{items} → support: {support:.3f}")

Top 5 frequent itemsets of length 1:

("One Flew Over the Cuckoo's Nest (1975)",) → support: 1725.000
('E.T. the Extra-Terrestrial (1982)',) → support: 2269.000
('Saving Private Ryan (1998)',) → support: 2653.000
('Star Wars: Episode IV - A New Hope (1977)',) → support: 2991.000
('Rain Man (1988)',) → support: 1330.000


Top 5 frequent itemsets of length 2:

('Alien (1979)', 'Matrix, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)') → support: 1239.000
('Alien (1979)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)') → support: 1298.000
('Alien (1979)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Terminator 2: Judgment Day (1991)') → support: 1243.000
('Alien (1979)', 'Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back 