In [None]:
!pip install -q pandas scikit-learn

## ALS

In [None]:
!pip install implicit

In [None]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.preprocessing import LabelEncoder

In [None]:
import pandas as pd
import numpy as np

In [None]:
students = pd.read_json("hse_students_combined_50.json").iloc[:20]
projects = pd.read_json("hse_all_projects.json").iloc[:20]
matches = pd.read_json("student_project_manual_matching_top20.json")

In [None]:
# ПОЛОЖИТЕЛЬНЫЕ ВЗАИМОДЕЙСТВИЯ
positive = []
for _, row in matches.iterrows():
    for proj in [row["1-й проект"], row["2-й проект"], row["3-й проект"]]:
        positive.append({"user": row["ФИО"], "item": proj, "rating": 1})
positive_df = pd.DataFrame(positive)

# ОТРИЦАТЕЛЬНЫЙ СЭМПЛИНГ (НЕ ВХОДЯЩИЕ В TOP-3)
all_projects = set(projects["Название проекта"])
negative = []

for fio in matches["ФИО"]:
    liked = set(positive_df[positive_df["user"] == fio]["item"])
    disliked = list(all_projects - liked)
    np.random.shuffle(disliked)
    for proj in disliked[:5]:
        negative.append({"user": fio, "item": proj, "rating": 0.01})  # слабый сигнал

negative_df = pd.DataFrame(negative)

# ОБЪЕДИНЕНИЕ И КОДИРОВАНИЕ
ratings_df = pd.concat([positive_df, negative_df])

In [None]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
ratings_df["user_id"] = user_encoder.fit_transform(ratings_df["user"])
ratings_df["item_id"] = item_encoder.fit_transform(ratings_df["item"])

In [None]:
ratings_df = ratings_df[
    ratings_df["user"].isin(user_encoder.classes_) &
    ratings_df["item"].isin(item_encoder.classes_)
]

In [None]:
# СОЗДАНИЕ РАЗРЕЖЕННОЙ МАТРИЦЫ (item × user)
item_user_matrix = csr_matrix((
    ratings_df["rating"].values,
    (ratings_df["item_id"], ratings_df["user_id"])
))

In [None]:
item_user_matrix

In [None]:
# ОБУЧЕНИЕ ALS
als = AlternatingLeastSquares(factors=20, iterations=30, regularization=0.1)
als.fit(item_user_matrix)

In [None]:
# ПОЛУЧАЕМ TOP-3 ДЛЯ КАЖДОГО СТУДЕНТА
user_ids = ratings_df["user_id"].unique()
recommendations = {}

In [None]:
user_ids

In [None]:
recs = als.recommend(8, item_user_matrix.T, N=3, filter_already_liked_items=False)

In [20]:
recs

(array([13, 14, 19], dtype=int32),
 array([0.9635471 , 0.9616766 , 0.95573145], dtype=float32))

In [None]:
for uid in user_ids:
    recs = als.recommend(uid, item_user_matrix.T, N=3, filter_already_liked_items=False)
    student = user_encoder.inverse_transform([uid])[0]
    projects_pred = item_encoder.inverse_transform([rec for rec in recs[0]])
    recommendations[student] = list(projects_pred)

In [None]:
def precision_at_3(top_n, matches_df):
    correct = 0
    total = 0
    for _, row in matches_df.iterrows():
        gt = [row["1-й проект"], row["2-й проект"], row["3-й проект"]]
        pred = top_n.get(row["ФИО"], [])
        correct += int(any(p in gt for p in pred))
        total += 1
    return correct / total

In [21]:
p3 = precision_at_3(recommendations, matches)
print(f"\n Precision@3 (ALS implicit): {p3:.2f}")


 Precision@3 (ALS implicit): 0.45
