In this notebook I implement a content based recommender system, based on the available description of groups (their degree, year, semester).

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math


In [2]:
groups = pd.read_pickle("groups.pickle").rename({"title": "group_title"}, axis=1)
users = pd.read_pickle("users.pkl")
groups_membership = pd.read_pickle("df_membership.pkl")


In [3]:
degrees_courses = pd.read_csv("database/university_coursedegree.csv").drop(
    columns=["id"]
)
degrees = (
    pd.read_csv("database/university_degree.csv")
    .rename(
        {
            "id": "degree_id",
            "name": "degree_name",
            "group_id": "degree_group_id",
            "type": "degree_type",
        },
        axis=1,
    )
    .drop(columns=["department_id", "slug", "icon"])
)
courses = (
    pd.read_csv("database/university_course.csv")
    .drop(columns=["cfu", "wiki_link", "slug_unimi"])
    .rename(
        {"id": "course_id", "group_id": "course_group_id", "name": "course_name"},
        axis=1,
    )
)


In [4]:
courses.head()


Unnamed: 0,course_id,course_name,course_group_id
0,23,Linguaggi di programmazione,-1001436000000.0
1,30,Linguaggi e traduttori,-1001190000000.0
2,3,Architettura degli elaboratori 1,-1001188000000.0
3,1,Matematica del continuo,-1001389000000.0
4,4,Linguaggi formali e automi,-1001217000000.0


In [5]:
pd.to_pickle(degrees, "degrees.pkl")
degrees.head()


Unnamed: 0,degree_id,degree_name,degree_type,degree_group_id
0,71,Lingue e culture per la comunicazione e la coo...,M,-1001520507045
1,97,"Scienza, tecnica e didattica dello sport",M,-1001508132234
2,98,Scienze agrarie,M,-1001549161533
3,7,Sicurezza informatica,M,-1001215714502
4,155,Economics and political science (EPS),M,-1001452282561


In [6]:
joined_course_degrees = (
    groups.merge(courses.rename({"course_group_id": "group_id"}, axis=1), how="left")
    .merge(degrees_courses, how="left")
    .merge(degrees, how="left")
    .drop(columns=["degree_group_id"])
    .rename(
        {
            "course_group_id": "group_id",
            "degree_id": "course_degree_id",
            "degree_name": "course_degree_name",
            "degree_type": "course_degree_type",
        },
        axis=1,
    )
)
joined_course_degrees.head()


Unnamed: 0,group_id,group_title,course_id,course_name,year,semester,course_degree_id,course_degree_name,course_degree_type
0,-1001563734995,Quantitative biology | StudentiUniMi,,,,,,,
1,-1001557200491,Educazione professionale | StudentiUniMi,,,,,,,
2,-1001774201871,Algebra 1 | Matematica,232.0,Algebra 1,1.0,1.0,9.0,Matematica,B
3,-1001724030284,Lettatura italiana Baragetti | StudentiUniMi,239.0,Letteratura italiana Baragetti,1.0,1.0,23.0,Storia,B
4,-1001531478970,Archeologia | StudentiUniMi,,,,,,,


In [7]:
joined_groups_degrees = (
    groups.rename({"group_id": "degree_group_id"}, axis=1)
    .merge(degrees, how="left")
    .rename(
        {
            "degree_group_id": "group_id",
            "degree_id": "group_degree_id",
            "degree_name": "group_degree_name",
            "degree_type": "group_degree_type",
        },
        axis=1,
    )
)
joined_groups_degrees.head()  # information on main groups for each degree


Unnamed: 0,group_id,group_title,group_degree_id,group_degree_name,group_degree_type
0,-1001563734995,Quantitative biology | StudentiUniMi,88.0,Quantitative biology,M
1,-1001557200491,Educazione professionale | StudentiUniMi,61.0,Educazione professionale,B
2,-1001774201871,Algebra 1 | Matematica,,,
3,-1001724030284,Lettatura italiana Baragetti | StudentiUniMi,,,
4,-1001531478970,Archeologia | StudentiUniMi,40.0,Archeologia,M


After performing some joins, I manage to have the information I need for each group.

In [8]:
joined_groups_courses_degrees = joined_groups_degrees.merge(
    joined_course_degrees, on=["group_id", "group_title"]
)

copied = joined_groups_courses_degrees.copy()
for column in ["degree_id", "degree_name", "degree_type"]:
    copied[column] = joined_groups_courses_degrees[f"group_{column}"].combine_first(
        joined_groups_courses_degrees[f"course_{column}"]
    )
    copied = copied.drop(columns=[f"group_{column}", f"course_{column}"])

combined_groups_courses_degrees = copied
combined_groups_courses_degrees.replace(-2, 6, inplace=True)
combined_groups_courses_degrees.replace("B", 1, inplace=True)
combined_groups_courses_degrees.replace("M", 2, inplace=True)
combined_groups_courses_degrees.replace("C", 3, inplace=True)

extra_groups = combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees["degree_id"].isna()
]

combined_groups_courses_degrees.head()


Unnamed: 0,group_id,group_title,course_id,course_name,year,semester,degree_id,degree_name,degree_type
0,-1001563734995,Quantitative biology | StudentiUniMi,,,,,88.0,Quantitative biology,2.0
1,-1001557200491,Educazione professionale | StudentiUniMi,,,,,61.0,Educazione professionale,1.0
2,-1001774201871,Algebra 1 | Matematica,232.0,Algebra 1,1.0,1.0,9.0,Matematica,1.0
3,-1001724030284,Lettatura italiana Baragetti | StudentiUniMi,239.0,Letteratura italiana Baragetti,1.0,1.0,23.0,Storia,1.0
4,-1001531478970,Archeologia | StudentiUniMi,,,,,40.0,Archeologia,2.0


I add a new course degree for a specific category of groups (For24 groups, not officially in a degree), but of common interest for users.

In [9]:
# adding FOR24 groups
for_24 = extra_groups[
    (extra_groups.group_title.str.contains("FOR"))
    | (extra_groups.group_id == -1001672191242)
]
combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees.group_id.isin(for_24.group_id), "degree_id"
] = (degrees.degree_id.max() + 1)
combined_groups_courses_degrees[
    combined_groups_courses_degrees.group_id.isin(for_24.group_id)
]
extra_groups = combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees["degree_id"].isna()
]


In [10]:
# adding degrees to some non-classified groups
combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees.group_id == -1001448722502, "degree_id"
] = 3.0
combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees.group_id == -1001525285619, "degree_id"
] = 5.0
extra_groups = combined_groups_courses_degrees.loc[
    combined_groups_courses_degrees["degree_id"].isna()
]


Then I devide the groups that don't have a course degree into groups for international matters and general extra groups (like notes exchange, job adverts..)

In [11]:
international_groups = extra_groups[
    (extra_groups.group_id == -1001353996639)
    | (extra_groups.group_id == -1001699979466)
]
utility_groups = extra_groups[
    ~extra_groups.group_id.isin(international_groups.group_id)
]


In [12]:
combined_groups_courses_degrees.to_pickle("groups_degrees.pkl")


Here I generate a vector representation of each group. Each group is described by a vector of features, in which the first 158 correspond to the course degrees. <br>
If the vector_rep[i]+1 = 1 (with i <= 158), then i is the associated course degree for that group (from the degree id in the degree table). <br>
After these features I add  the year representation to the vector (from first to fifth year, with elective courses encoded as in the sixth position). <br>
I then add two other two features that, if set to 1, indicate if the group is about international matters or a general utility group respectivly.

In [13]:
def generate_group_representation(group):
    # [1° 2° 3° 4° 5° year and elective(6)]
    # we are adding a new degree for FOR24 which wasn't in the degree table
    YEARS_OFFSET = degrees.degree_id.max()
    YEARS_VALUES = 6

    # [1 2]
    SEMESTER_OFFSET = YEARS_OFFSET + YEARS_VALUES
    SEMESTER_VALUES = 2

    """
    # [1 bachelor 2 master 3 single cycle]
    TYPE_OFFSET = SEMESTER_OFFSET + SEMESTER_VALUES
    TYPE_VALUES = 3
    """

    EXTRA_OFFSET = SEMESTER_OFFSET + SEMESTER_VALUES
    EXTRA_VALUES = 2

    qb_representation = np.zeros(
        degrees.degree_id.max()
        + 1
        + YEARS_VALUES
        + SEMESTER_VALUES
        #+ TYPE_VALUES
        + EXTRA_VALUES
    )

    degree_ids = combined_groups_courses_degrees[
        combined_groups_courses_degrees.group_id == group
    ].degree_id.values
    years = combined_groups_courses_degrees[
        combined_groups_courses_degrees.group_id == group
    ].year

    for degree_id in degree_ids:
        for year in years:
            if not math.isnan(degree_id) and not math.isnan(year):
                qb_representation[int(degree_id) - 1] = 0.6
                qb_representation[int(year) + YEARS_OFFSET] = 0.2
            elif math.isnan(year) and not math.isnan(degree_id):
                qb_representation[int(degree_id) - 1] = 1

    semesters = combined_groups_courses_degrees[
        combined_groups_courses_degrees.group_id == group
    ].semester
    for semester in semesters:
        if not math.isnan(semester):
            qb_representation[int(semester) + SEMESTER_OFFSET] = 0.1

    if group in international_groups.group_id.values:
        qb_representation[EXTRA_OFFSET + 1] = 1

    if group in utility_groups.group_id.values:
        qb_representation[EXTRA_OFFSET + 2] = 1

    return qb_representation


group_description = pd.Series(dtype=object)
for group_id in combined_groups_courses_degrees.group_id.unique():
    group_description.loc[group_id] = generate_group_representation(group_id)

group_description.head()


-1001563734995    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
-1001557200491    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
-1001774201871    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, ...
-1001724030284    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
-1001531478970    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
dtype: object

In [15]:
pd.to_pickle(group_description, "group_description.pkl")


Then I represent each user as a vector of the same dimension, with the values beeing an average between the vectors of all groups he is.

In [16]:
def generate_user_profile(user_id):
    in_groups = groups_membership[groups_membership.user_id == user_id].group_id
    user_profile = np.mean(group_description[in_groups])
    return user_profile


user_profile = pd.Series(dtype=object)
for user in tqdm(
    groups_membership.user_id.unique(), total=len(groups_membership.user_id.unique())
):
    user_profile.loc[user] = generate_user_profile(user)

user_profile.head()


  0%|          | 0/6174 [00:00<?, ?it/s]

1292286374    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1775427491    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
167020566     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1131687203    [0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.0, 0.0, 0.0, ...
1922202100    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
dtype: object

I compute the probability of a particular user being interested in a particular group (in which he isn't in) as the cosine similarity between the user vector representation and the representation of that group. The most probable k are the ones I'm gonna recommend to that user.

In [17]:
trainset = pd.read_pickle("trainset.pkl")


def get_not_in_groups(user_id):
    in_groups = trainset[trainset.user_id == user_id].group_id
    not_in_groups = groups[~(groups.group_id.isin(in_groups))].group_id
    return not_in_groups


def get_prediction(user, group):
    up = user_profile[user]
    return np.dot(up, group_description[group])


predictions = list()
for user in tqdm(trainset.user_id.unique(), total=len(trainset.user_id.unique())):
    not_in = get_not_in_groups(user)
    user_predictions = list()
    for group in not_in:
        pred = get_prediction(user, group)
        predictions.append((int(user), int(group), pred))

predictions = pd.DataFrame(predictions, columns=("user_id", "group_id", "prediction"))
predictions[predictions.user_id==350313104][:5]


  0%|          | 0/4183 [00:00<?, ?it/s]

Unnamed: 0,user_id,group_id,prediction
77632,350313104,-1001563734995,0.0
77633,350313104,-1001557200491,0.0
77634,350313104,-1001774201871,0.010962
77635,350313104,-1001724030284,0.010962
77636,350313104,-1001531478970,0.0


In [18]:
merged_predictions_groups_users = predictions.merge(groups).merge(users)
merged_predictions_groups_users.loc[
    merged_predictions_groups_users.username == "aleceres"
].sort_values(by=["prediction"], ascending=False)[:5]


Unnamed: 0,user_id,group_id,prediction,group_title,first_name,last_name,username
77765,350313104,-1001467027267,0.324038,Sicurezza delle architetture orientate ai serv...,Alessia,,aleceres
77876,350313104,-1001348261542,0.324038,Modellazione e analisi di sistemi - Informatic...,Alessia,,aleceres
77898,350313104,-1001434472267,0.319423,Information management - Sicurezza Informatica,Alessia,,aleceres
77849,350313104,-1001452073794,0.317115,Statistical methods for machine learning - Inf...,Alessia,,aleceres
77666,350313104,-1001556502415,0.314038,Privacy and data protection - Informatica magi...,Alessia,,aleceres


In [19]:
testset = pd.read_pickle("testset.pkl")


def get_top_k(user, k):
    predictions_user = predictions[(predictions.user_id == user)].loc[
        :, ["group_id", "prediction"]
    ]
    return predictions_user.sort_values(by="prediction", ascending=False)[:k]


def is_relevant(user, item):
    return len(testset[(testset.user_id == user) & (testset.group_id == item)]) != 0


def HR(user, k):
    recommended_items = get_top_k(user, k)
    return sum(is_relevant(user, item) for item in recommended_items.group_id)


def average_precision(user, k):
    recommended_items = get_top_k(user, k)
    return sum(
        is_relevant(user, row[1].group_id) * (1 / (rank + 1) * 1)
        for rank, row in enumerate(recommended_items.iterrows())
    )


def RR(user, k):
    recommended_items = get_top_k(user, k)
    return sum(
        is_relevant(user, row[1].group_id) * (1 / (rank + 1))
        for rank, row in enumerate(recommended_items.iterrows())
    )


def precision(user, k):
    recommended_items = get_top_k(user, k)
    return sum(is_relevant(user, item) for item in recommended_items.group_id) * 1 / k


I compute precision and recall metrics in order to evaluate the recommender system we just implemented.

In [20]:
k = 8
print(f"Average P@{k}: {np.mean([precision(user,k) for user in testset.user_id])}")

Average P@8: 0.07171697114111729


In [21]:
print(f"HR@{k}: {np.mean([HR(user,k) for user in testset.user_id])}")

HR@8: 0.5737357691289383


In [22]:
print(f"MAP@{k}: {np.mean([average_precision(user,k) for user in testset.user_id])}")

MAP@8: 0.2809873671470176


In [23]:
print(f"MRR@{k}: {np.mean([RR(user,k) for user in testset.user_id])}")

MRR@8: 0.2809873671470176


In [24]:
def get_not_in_groups(user_id):
    in_groups = groups_membership[groups_membership.user_id == user_id].group_id
    not_in_groups = groups[~(groups.group_id.isin(in_groups))].group_id
    return [group for group in not_in_groups]


all_predictions = list()
for user in tqdm(
    groups_membership.user_id.unique(), total=len(groups_membership.user_id.unique())
):
    not_in = get_not_in_groups(user)
    for group in not_in:
        pred = get_prediction(user, group)
        all_predictions.append((int(user), int(group), pred))

all_predictions = pd.DataFrame(
    all_predictions, columns=("user_id", "group_id", "prediction")
)
all_merged_predictions = all_predictions.merge(users).merge(groups)


  0%|          | 0/6174 [00:00<?, ?it/s]

In [25]:
all_merged_predictions[all_merged_predictions.username == "acetimarco"].sort_values(
    by=["prediction"], ascending=False
)[:8]


Unnamed: 0,user_id,group_id,prediction,first_name,last_name,username,group_title
908300,26170256,-1001466214340,0.2725,Marco,Aceti,acetimarco,Elaborazione dei segnali - Informatica Musical...
619244,26170256,-1001189502801,0.266071,Marco,Aceti,acetimarco,Linguaggi e traduttori - Informatica | Informa...
2155021,26170256,-1001283074624,0.266071,Marco,Aceti,acetimarco,Programmazione Dichiarativa - Informatica | In...
1605265,26170256,-1001300632521,0.265357,Marco,Aceti,acetimarco,Crittografia 1 - Informatica | Informatica mus...
1525878,26170256,-1001338485997,0.265357,Marco,Aceti,acetimarco,Sistemi informativi - Informatica | Informatic...
1707795,26170256,-1001435818491,0.265357,Marco,Aceti,acetimarco,Linguaggi di Programmazione - Informatica | In...
1804225,26170256,-1001404399387,0.265357,Marco,Aceti,acetimarco,Elaborazione Delle Immagini - Informatica
1851668,26170256,-1001469541498,0.265357,Marco,Aceti,acetimarco,Sicurezza e Privatezza - Informatica | Informa...


In [26]:
all_merged_predictions.to_pickle("all_merged_predictions.pkl")
