In [6]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt




In [5]:
from src.db import Ratings

# Load the ratings training set through the project's Ratings helper.
ratings_df = Ratings().get_training_data()
# NOTE(review): training call is disabled, and no `model` is defined before this
# cell in the notebook — restore once the model object exists, or delete.
# model.train(ratings_df=ratings_df)

2024-12-02 13:20:00:[INFO]:Generated training data with 29 entries.


In [2]:
from src.db import Questions
# Fetch every question through the project's Questions helper and let the same
# helper turn the raw records into the DataFrame used for feature engineering.
questions = Questions()

raw_questions = questions.fetch_all()
# The displayed result carries question_id, chapter, difficulty, concept and
# content columns.
questions_df = questions.preprocess_questions(raw_questions)
questions_df

Unnamed: 0,question_id,chapter,difficulty,concept,content
0,fc084299-5fff-4543-b5f6-5a7971b93da6,chuong-3,0.83,office,Phát biểu nào sau đây là SAI khi nói về PowerP...
1,a6d25890-eecd-488a-8b89-2455786b3b80,chuong-3,0.40,office,"Khi thiết kế bản trình chiếu, nếu muốn tạo một..."
2,3d9980e3-3d1c-4932-920e-f94a57340888,chuong-3,0.75,ppt,"Khi làm việc với PowerPoint, để loại bỏ hiệu ứ..."
3,21b80886-9511-4cd4-bde3-1cf1878f21a2,chuong-3,0.56,office,"Trong Microsoft PowerPoint, để hiển thị các sl..."
4,239c4195-884a-49ea-80ea-45e18acbe1d4,chuong-3,0.49,office,"Trong Microsoft PowerPoint (2013, 2016, 2019),..."
...,...,...,...,...,...
502,1164b65d-1cba-8192-a242-e01d25e2bc9d,chuong-4,0.64,ham-nhap-xuat-du-lieu,Định dạng nào sau đây dùng để in ra một kí tự:
503,1164b65d-1cba-81dc-b305-d078f5e25994,chuong-4,0.55,ham-nhap-xuat-du-lieu,Định dạng nào sau đây dùng để in ra số thực vớ...
504,1164b65d-1cba-81dd-911a-c2546a76ad11,chuong-4,0.33,ham-nhap-xuat-du-lieu,Hàm nào trong Python để nhập một ký tự từ bàn ...
505,1164b65d-1cba-8165-93e4-c5ad83afc837,chuong-4,0.55,gioi-thieu-python,Python được Guido van Rossum phát triển dựa tr...


In [3]:
# Short alias used by the feature-engineering cells; this binds the same
# DataFrame object (no copy is made).
df = questions_df

In [7]:
# Step 1: embed the question text with a small pretrained sentence encoder.
# NOTE(review): the question content shown above is Vietnamese, while
# 'all-MiniLM-L6-v2' is presumably English-focused — confirm against the model
# card and consider a multilingual checkpoint.
model = SentenceTransformer('all-MiniLM-L6-v2')
question_texts = df['content'].tolist()
text_features = model.encode(question_texts)

2024-12-02 13:30:40:[INFO]:Use pytorch device_name: cpu
2024-12-02 13:30:40:[INFO]:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 16/16 [00:01<00:00, 14.15it/s]


In [None]:
# One-hot encode the two categorical columns and densify the result so it can
# be stacked with the other feature arrays.
categorical_frame = df[['chapter', 'concept']]
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(categorical_frame).toarray()

# Standardise the difficulty score; the double brackets keep the input 2-D
# (a single-column frame), which is what the scaler is given here.
scaler = StandardScaler()
scaled_difficulty = scaler.fit_transform(df[['difficulty']])

In [10]:
# Stack the text embeddings, one-hot categoricals and scaled difficulty
# column-wise into a single feature matrix (one row per question).
# The matrix was previously built twice (np.concatenate(axis=1) and np.hstack
# give the same result for 2-D inputs); build it once instead.
features = np.hstack([text_features, encoded_categorical, scaled_difficulty])
# Old name kept as an alias so any cell still referring to it keeps working.
combined_features = features

In [16]:
# Project the stacked feature matrix onto its first 9 principal components.
# With the default svd_solver='auto', scikit-learn may select the randomized
# solver for a matrix of this size, so pin random_state to make the reduced
# features reproducible across kernel restarts.
pca = PCA(n_components=9, random_state=42)
reduced_features = pca.fit_transform(features)
reduced_features

array([[ 1.70439082, -0.79390519, -0.2320044 , ..., -0.40258581,
        -0.0989157 ,  0.01096714],
       [-0.70907775, -0.7739877 , -0.45800861, ..., -0.38516585,
        -0.13559611, -0.01645   ],
       [ 1.2219221 , -0.67205403, -0.19096123, ...,  0.10856871,
        -0.04246766,  0.05999455],
       ...,
       [-1.26209415, -0.14453158,  0.88105854, ..., -0.0415105 ,
         0.12143627,  0.00422906],
       [-0.0392298 , -0.11418241,  0.87630148, ..., -0.07297741,
         0.03684896, -0.01394621],
       [ 0.46726761, -0.12151298,  1.0291574 , ..., -0.00790847,
        -0.13957277,  0.08222426]])

In [17]:
reduced_features.shape

(507, 9)

In [21]:
# Build a frame holding the PCA-reduced features next to each question's id.
feature_columns = [f'feature_{i}' for i in range(reduced_features.shape[1])]
clustered_df = pd.DataFrame(reduced_features, columns=feature_columns)

# NOTE(review): 'cluster' is simply df's row index (one "cluster" per
# question), not a label produced by a clustering model — confirm this is the
# intended placeholder.
clustered_df['cluster'] = df.index

# Assign ids by position: the rows of reduced_features follow df's row order,
# whereas assigning the Series directly aligns on index labels and would
# silently yield NaN ids if df's index were not 0..n-1.
clustered_df['question_id'] = df['question_id'].to_numpy()
clustered_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,cluster,question_id
0,1.704391,-0.793905,-0.232004,0.290996,-0.432697,-0.413886,-0.402586,-0.098916,0.010967,0,fc084299-5fff-4543-b5f6-5a7971b93da6
1,-0.709078,-0.773988,-0.458009,-0.307962,-0.4301,-0.7286,-0.385166,-0.135596,-0.01645,1,a6d25890-eecd-488a-8b89-2455786b3b80
2,1.221922,-0.672054,-0.190961,0.203715,-0.149207,-0.087071,0.108569,-0.042468,0.059995,2,3d9980e3-3d1c-4932-920e-f94a57340888
3,0.206422,-0.861544,-0.422531,-0.115542,-0.423638,-0.603388,-0.453547,-0.090575,-0.002099,3,21b80886-9511-4cd4-bde3-1cf1878f21a2
4,-0.186364,-0.863699,-0.460131,-0.208149,-0.434192,-0.650077,-0.470185,-0.090883,0.004823,4,239c4195-884a-49ea-80ea-45e18acbe1d4


In [25]:
# Order the rows by their 'cluster' value and renumber the index from zero.
feature_vector_df = (
    clustered_df
    .sort_values('cluster')
    .reset_index(drop=True)
)
feature_vector_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,cluster,question_id
0,1.704391,-0.793905,-0.232004,0.290996,-0.432697,-0.413886,-0.402586,-0.098916,0.010967,0,fc084299-5fff-4543-b5f6-5a7971b93da6
1,-0.709078,-0.773988,-0.458009,-0.307962,-0.430100,-0.728600,-0.385166,-0.135596,-0.016450,1,a6d25890-eecd-488a-8b89-2455786b3b80
2,1.221922,-0.672054,-0.190961,0.203715,-0.149207,-0.087071,0.108569,-0.042468,0.059995,2,3d9980e3-3d1c-4932-920e-f94a57340888
3,0.206422,-0.861544,-0.422531,-0.115542,-0.423638,-0.603388,-0.453547,-0.090575,-0.002099,3,21b80886-9511-4cd4-bde3-1cf1878f21a2
4,-0.186364,-0.863699,-0.460131,-0.208149,-0.434192,-0.650077,-0.470185,-0.090883,0.004823,4,239c4195-884a-49ea-80ea-45e18acbe1d4
...,...,...,...,...,...,...,...,...,...,...,...
502,0.465987,-0.106178,1.052660,-0.032835,-0.030374,0.047303,0.061625,0.032158,0.024083,502,1164b65d-1cba-8192-a242-e01d25e2bc9d
503,-0.035885,-0.100100,0.997865,-0.168078,-0.010531,0.004956,0.070764,0.049116,0.062588,503,1164b65d-1cba-81dc-b305-d078f5e25994
504,-1.262094,-0.144532,0.881059,-0.501853,0.016832,-0.187400,-0.041511,0.121436,0.004229,504,1164b65d-1cba-81dd-911a-c2546a76ad11
505,-0.039230,-0.114182,0.876301,-0.154400,0.027433,-0.019182,-0.072977,0.036849,-0.013946,505,1164b65d-1cba-8165-93e4-c5ad83afc837


In [23]:
# Peek at the rows whose 'cluster' value is 500.
clustered_df.loc[clustered_df['cluster'].eq(500)].head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,cluster,question_id
500,-0.048484,-0.096387,0.868975,-0.130978,0.026642,-0.010248,-0.003726,-0.064814,-0.028552,500,1164b65d-1cba-81f6-827c-c7a45fe5198f


In [24]:
# Collect the feature values for the rows whose 'cluster' is 500 as a list of
# lists. The last two columns ('cluster' and 'question_id') are dropped by
# position, so this relies on them being the trailing columns of clustered_df.
features_vector = (
    clustered_df
    .loc[clustered_df['cluster'].eq(500)]
    .iloc[:, :-2]
    .to_numpy()
    .tolist()
)
features_vector


[[-0.04848362927760954,
  -0.09638732351139713,
  0.868975137871121,
  -0.1309778843471694,
  0.02664234763324944,
  -0.010247705213857296,
  -0.0037260707655316205,
  -0.06481407313031017,
  -0.028551639442580396]]

In [None]:
# Load the saved array from src/tmp/weights/content_based_model_weights.npy.
# NOTE(review): the variable is called feature_vectors but the file name says
# model weights — confirm which one this array actually holds.
import numpy as np
feature_vectors = np.load('src/tmp/weights/content_based_model_weights.npy')
feature_vectors

array([[0.        , 1.94600345, 5.        , 4.3007177 ],
       [0.        , 2.40946412, 5.        , 4.665917  ],
       [0.        , 2.20437125, 5.        , 4.435107  ],
       ...,
       [0.        , 3.1716701 , 5.        , 4.87126231],
       [0.        , 2.93411453, 5.        , 4.65515965],
       [0.        , 2.8253692 , 5.        , 4.57316757]])

In [3]:
# Dimensions of the loaded array.
feature_vectors.shape

(507, 4)

In [4]:
# Number of rows in the loaded array (length of its first axis).
print(len(feature_vectors))

507
