In [23]:
import numpy as np
import pandas as pd
import ast
import re
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the clustered question table and discard the id/text columns in one pass;
# the columns that remain are the ones used by the cells below.
questions_df = pd.read_csv('src/data/questions_cluster.csv').drop(
    columns=['question_id', 'concept', 'content']
)
def clean_and_convert_embedding(embedding_str):
    """Parse a stringified embedding vector into a 1-D float64 NumPy array.

    Both NumPy-style (``"[0.1 0.2]"``) and list-style (``"[0.1, 0.2]"``)
    serialisations are accepted; values may be separated by any mix of
    whitespace (including newlines) and commas.

    Parameters
    ----------
    embedding_str : str
        Text form of the vector, optionally wrapped in square brackets.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype float64 (empty when the string holds no numbers).

    Raises
    ------
    ValueError
        If any token cannot be parsed as a float.
    """
    # Treat brackets and commas as plain separators so both serialisations
    # tokenise the same way. A text-mode ``np.fromstring(..., sep=' ')`` stops
    # at the first comma and returns a truncated array instead.
    tokens = re.sub(r'[\[\],]', ' ', embedding_str).split()
    return np.array(tokens, dtype=float)

# Swap every stringified embedding for its parsed NumPy array, then show the frame.
parsed_embeddings = questions_df['concept_embedding'].map(clean_and_convert_embedding)
questions_df['concept_embedding'] = parsed_embeddings

questions_df

Unnamed: 0,chapter,difficulty,concept_embedding,cluster
0,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
1,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
2,chuong-3,0.0,"[0.22587502, -0.33934802]",21
3,chuong-3,0.5,"[-0.4651475, -0.35584044]",35
4,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
...,...,...,...,...
502,chuong-4,0.5,"[0.3655883, 0.2535131]",65
503,chuong-4,0.0,"[0.3655883, 0.2535131]",4
504,chuong-4,0.0,"[0.3655883, 0.2535131]",4
505,chuong-4,0.0,"[-0.07888263, 0.01606858]",75


In [30]:
# Collapse every cluster into one row holding the mean difficulty and the
# element-wise mean of its concept embeddings.
# The value columns are selected explicitly so the callback never receives the
# grouping column: applying over the grouping column is deprecated in pandas 2.2
# and emits a DeprecationWarning.
cluster_features = (
    questions_df
    .groupby('cluster')[['difficulty', 'concept_embedding']]
    .apply(lambda group: pd.Series({
        'mean_difficulty': group['difficulty'].mean(),
        'mean_concept_embedding': np.mean(
            np.stack(group['concept_embedding'].tolist()), axis=0
        ),
    }))
    .reset_index()
)

# Append the mean difficulty as the last component of the mean embedding so each
# cluster is described by a single flat vector.
cluster_features['features_vector'] = cluster_features.apply(
    lambda row: np.append(row['mean_concept_embedding'], row['mean_difficulty']),
    axis=1,
)
cluster_features = cluster_features.drop(columns=['mean_difficulty', 'mean_concept_embedding'])
cluster_features

  cluster_features = questions_df.groupby('cluster').apply(lambda x: pd.Series({


Unnamed: 0,cluster,features_vector
0,0,"[-0.026811360000000003, 0.011821570000000002, ..."
1,1,"[0.25516747999999995, 0.45046364999999977, 0.0]"
2,2,"[-0.41426075, -0.4724409, 0.445]"
3,3,"[-0.04732007, 0.28842866, 0.5]"
4,4,"[0.36558829999999987, 0.25351310000000005, 0.0]"
...,...,...
121,121,"[0.40387183, -0.2965448, 0.0]"
122,122,"[0.31754452, -0.1702683, 0.5]"
123,123,"[-0.41426075, -0.4724409, 0.3]"
124,124,"[-0.22683066, 0.32770258, 0.29]"


In [25]:
# Preview the first 30 questions.
questions_df.iloc[:30]

Unnamed: 0,chapter,difficulty,concept_embedding,cluster
0,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
1,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
2,chuong-3,0.0,"[0.22587502, -0.33934802]",21
3,chuong-3,0.5,"[-0.4651475, -0.35584044]",35
4,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
5,chuong-3,0.0,"[-0.48017752, 0.25036466]",44
6,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
7,chuong-3,0.78,"[-0.11668843, -0.09688705]",40
8,chuong-3,0.0,"[-0.4651475, -0.35584044]",5
9,chuong-3,0.0,"[-0.11668843, -0.09688705]",76


In [28]:
# Sanity check: the first embedding should now be an array rather than a string.
type(questions_df['concept_embedding'].iat[0])

numpy.ndarray

In [24]:
# One-hot encode the chapter label; the encoder's sparse result is densified.
encoder = OneHotEncoder()
chapter_encoded = encoder.fit_transform(questions_df[['chapter']]).toarray()

# Stack the per-question embedding arrays into one 2-D matrix (one row per question).
concept_embeddings = np.vstack(questions_df['concept_embedding'].tolist())

# Selecting with a list keeps the column two-dimensional, i.e. shape (n_questions, 1).
difficulties = questions_df[['difficulty']].to_numpy()

# Report the shape of each feature block.
for feature_block in (chapter_encoded, concept_embeddings, difficulties):
    print(feature_block.shape)

(507, 4)
(507, 2)
(507, 1)


In [4]:
# First ten rows of the difficulty column vector.
difficulties[:10]

array([[0.  ],
       [0.  ],
       [0.  ],
       [0.5 ],
       [0.  ],
       [0.  ],
       [0.  ],
       [0.78],
       [0.  ],
       [0.  ]])

In [5]:
# Display the stacked concept-embedding matrix (NumPy abbreviates the middle rows).
concept_embeddings

array([[-0.4651475 , -0.35584044],
       [-0.4651475 , -0.35584044],
       [ 0.22587502, -0.33934802],
       ...,
       [ 0.3655883 ,  0.2535131 ],
       [-0.07888263,  0.01606858],
       [ 0.14382899,  0.04959369]])

In [6]:
# Build the per-question feature matrix by joining the blocks column-wise:
# concept embedding | one-hot chapter | difficulty.
difficulty_column = questions_df[['difficulty']].to_numpy()
feature_vectors = np.concatenate(
    (concept_embeddings, chapter_encoded, difficulty_column),
    axis=1,
)

In [7]:
# Print the first ten assembled feature vectors.
print(feature_vectors[:10])

[[-0.4651475  -0.35584044  0.          0.          1.          0.
   0.        ]
 [-0.4651475  -0.35584044  0.          0.          1.          0.
   0.        ]
 [ 0.22587502 -0.33934802  0.          0.          1.          0.
   0.        ]
 [-0.4651475  -0.35584044  0.          0.          1.          0.
   0.5       ]
 [-0.4651475  -0.35584044  0.          0.          1.          0.
   0.        ]
 [-0.48017752  0.25036466  0.          0.          1.          0.
   0.        ]
 [-0.4651475  -0.35584044  0.          0.          1.          0.
   0.        ]
 [-0.11668843 -0.09688705  0.          0.          1.          0.
   0.78      ]
 [-0.4651475  -0.35584044  0.          0.          1.          0.
   0.        ]
 [-0.11668843 -0.09688705  0.          0.          1.          0.
   0.        ]]


In [8]:
# Load the raw answer logs and peek at the first rows (head() defaults to five).
logs_df = pd.read_csv('src/data/logs.csv')
logs_df.head()

Unnamed: 0,user_id,question_id,chapter,concept,difficulty,score,timecost,created_at
0,669d16e11db84069209550bd,3b6fd0b3-414b-464e-83f1-404d78be26c9,chuong-1,mang-may-tinh,0.25,1,4648,2024-09-12 05:59:50.980
1,669d16e11db84069209550bd,0304f6be-0591-419e-a6cb-c10db2c1cbeb,chuong-1,cong-nghe-thong-tin,0.663922,1,8356,2024-09-12 06:00:55.467
2,669d16e11db84069209550bd,4d6363a0-63fb-4e4f-b6f8-0e3d95e99065,chuong-1,thong-tin-xu-ly-thong-tin,0.25,1,2785,2024-09-12 06:00:58.638
3,669d16e11db84069209550bd,43d9a5e4-ffb4-4224-8110-648902976486,chuong-1,giai-quyet-bai-toan-tren-mtdt,0.455556,1,8624,2024-09-12 06:01:10.907
4,669d16e11db84069209550bd,76cee16c-cffb-4dce-a2b2-94a0ae2f87c3,chuong-1,mang-may-tinh,0.416667,0,23039,2024-09-12 06:01:34.642


In [15]:
import json

def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# Lookup tables used by the next cell to translate question ids and user ids.
question_map = _read_json('src/data/question_map.json')
user_map = _read_json('src/data/user_map.json')


In [16]:
# Invert user_map so it can be looked up by the raw user id stored in the logs.
# NOTE(review): JSON object keys are always strings, so the mapped user_id values
# are presumably digit strings rather than ints (unlike `cluster`, which is cast
# below) — confirm whether an integer cast is wanted here too.
rev_user_map = {raw_id: mapped_id for mapped_id, raw_id in user_map.items()}

# Attach each log row's cluster and discard rows whose question has no cluster.
clustered_logs = logs_df.assign(cluster=logs_df['question_id'].map(question_map))
clustered_logs = clustered_logs.dropna(subset=['cluster'])

# With the gaps removed the cluster can be an integer; user ids are translated
# through the inverted map.
logs_df = clustered_logs.assign(
    cluster=clustered_logs['cluster'].astype(int),
    user_id=clustered_logs['user_id'].map(rev_user_map),
)

In [21]:
# Last ten log rows after the cluster / user-id mapping.
logs_df.iloc[-10:]

Unnamed: 0,user_id,question_id,chapter,concept,difficulty,score,timecost,created_at,cluster
5890,177,0e828da6-faf2-4b31-8776-96b937b6eb28,chuong-1,cong-nghe-thong-tin,0.440217,0,2477,2024-11-10 04:06:46.840,13
5891,177,a8137789-6f8b-4fb6-86fe-08139c362b6e,chuong-1,mang-may-tinh,0.25,1,1042,2024-11-10 04:06:53.796,37
5892,177,39c9470f-2f16-4a1c-829f-880a14379b6a,chuong-1,giai-quyet-bai-toan-tren-mtdt,0.5,0,1602,2024-11-10 04:08:23.608,23
5893,177,c390672d-fe37-419a-b76c-5512f48fc970,chuong-1,may-tinh-dien-tu,0.418367,0,1894,2024-11-10 04:08:40.266,80
5894,177,1f8f452c-d363-42f5-aac9-911c66770d66,chuong-1,thong-tin-xu-ly-thong-tin,0.25,0,608,2024-11-10 04:09:38.169,15
5895,177,3c22d010-60c4-4c84-ba34-6eb342aa451d,chuong-1,giai-quyet-bai-toan-tren-mtdt,0.466216,1,1570,2024-11-10 08:31:14.594,23
5896,177,383f0ab4-a63e-4610-a362-0d9a88659ae4,chuong-2,bieu-dien-ki-tu,0.5,0,722,2024-11-10 08:31:20.411,122
5897,177,0a09b85b-dcc8-48e6-be9d-d5d4099621f4,chuong-2,bo-nho,0.5,1,2560,2024-11-10 08:52:54.787,3
5898,177,72805390-821a-4873-a17a-a0b06307c779,chuong-2,he-dem-co-so-r,1.0,1,3500,2024-11-10 08:54:51.955,25
5899,177,539da41d-f192-48af-8947-f7623bb8a743,chuong-2,so-thuc,0.4375,0,644,2024-11-10 11:07:05.073,90


In [22]:
# Per-user averages of score, difficulty and time cost.
metric_names = {
    'score': 'avg_score',
    'difficulty': 'avg_difficulty',
    'timecost': 'avg_timecost',
}
user_stats = (
    logs_df
    .groupby('user_id')[list(metric_names)]
    .mean()
    .rename(columns=metric_names)
    .reset_index()
)

user_stats

Unnamed: 0,user_id,avg_score,avg_difficulty,avg_timecost
0,0,0.531915,0.375687,5844.617021
1,1,0.322034,0.362592,699.949153
2,10,0.444444,0.365884,1109.074074
3,100,0.500000,0.368243,14533.000000
4,101,0.418605,0.381966,5212.162791
...,...,...,...,...
226,95,0.333333,0.631944,6083.666667
227,96,0.000000,0.455556,537.000000
228,97,0.600000,0.363003,96.366667
229,98,0.000000,0.250000,41.000000


In [35]:
import pandas as pd
import random

# Generate a synthetic user x cluster rating table and save it as CSV.
SEED = 42                          # fixed seed so the generated file is reproducible
NUM_USERS = 300
NUM_CLUSTERS = 124
RATING_POOL_SIZE = 1000
MIN_RECORDS, MAX_RECORDS = 30, 50  # inclusive bounds on ratings per user
OUTPUT_PATH = "user_cluster_ratings.csv"

# Private generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

user_ids = list(range(1, NUM_USERS + 1))       # 300 users, ids 1..300
# NOTE(review): the cluster_features output earlier in this notebook lists
# cluster ids 0..124, while the ids generated here are 1..124 — confirm whether
# cluster 0 should be included.
clusters = list(range(1, NUM_CLUSTERS + 1))    # 124 clusters, ids 1..124
# Pool of random ratings in [1, 5]; each record draws one value from this pool.
ratings = [rng.uniform(1, 5) for _ in range(RATING_POOL_SIZE)]

# For every user, rate between MIN_RECORDS and MAX_RECORDS distinct clusters
# (sampling without replacement, so a user never rates a cluster twice).
data = [
    {"user_id": user_id, "cluster": cluster, "rating": rng.choice(ratings)}
    for user_id in user_ids
    for cluster in rng.sample(clusters, rng.randint(MIN_RECORDS, MAX_RECORDS))
]

# Build the frame once from the full list of records.
df = pd.DataFrame(data)

# Persist without the positional index so the file holds only the three columns.
df.to_csv(OUTPUT_PATH, index=False)

print(f"CSV file '{OUTPUT_PATH}' generated successfully.")

CSV file 'user_cluster_ratings.csv' generated successfully.
