# **1. Load Necessary Libraries**

In [70]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import re

# **2. Load and Preprocess Data**

In [72]:
# Define file paths
question_path = "/content/drive/MyDrive/dataset/question.csv"
users_path = "/content/drive/MyDrive/dataset/user.csv"
question_user_path = "/content/drive/MyDrive/dataset/interaction.csv"
job_titles_path = "/content/drive/MyDrive/dataset/jobTitles.csv"

In [73]:
# Load datasets
dfQuestion = pd.read_csv(question_path)
dfUsers = pd.read_csv(users_path)
dfQuestionUser = pd.read_csv(question_user_path)
dfJobTitles = pd.read_csv(job_titles_path)

In [74]:
# Drop unnecessary columns
#dfUsers.drop(columns=["Timestamp"], inplace=True)

In [75]:
# Convert 'timeTaken' to numeric and fill NaN with median
dfQuestionUser['time_taken'] = pd.to_numeric(dfQuestionUser['time_taken'], errors='coerce')
dfQuestionUser['time_taken'].fillna(dfQuestionUser['time_taken'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfQuestionUser['time_taken'].fillna(dfQuestionUser['time_taken'].median(), inplace=True)


In [76]:
# Min-Max Normalization per user
minmax_scaler = MinMaxScaler()
dfQuestionUser['timeTaken_minmax'] = dfQuestionUser.groupby('user_id')['time_taken'].transform(
    lambda x: minmax_scaler.fit_transform(x.values.reshape(-1, 1)).flatten() if len(x) > 1 else 0
)


In [77]:
# Convert 'correct' column to numeric values
dfQuestionUser['answerd_correctly'] = dfQuestionUser['answerd_correctly'].map({'Yes': 1, 'No': 0})

In [78]:
# Encode difficulty levels numerically
difficulty_mapping = {'Easy': 1, 'Medium': 2, 'Hard': 3}
dfQuestion['difficulty_encoded'] = dfQuestion['difficulty_level'].map(difficulty_mapping)

# Merge difficulty encoding into interactions dataset
dfQuestionUser = dfQuestionUser.merge(dfQuestion[['question_id', 'difficulty_encoded']], on='question_id', how='left')


# **3. Collaborative Filtering (CF)**

In [79]:
# Compute weighted interaction score
dfQuestionUser['weighted_score'] = (
    dfQuestionUser['answerd_correctly'] + dfQuestionUser['timeTaken_minmax'] + dfQuestionUser['difficulty_encoded']
) / 3  # Averaging approach

In [80]:
# Create interaction matrix
interaction_matrix = dfQuestionUser.pivot(index='user_id', columns='question_id', values='weighted_score').fillna(0)
interaction_matrix_np = interaction_matrix.values.astype(float)

In [81]:
# Apply KNN-based Collaborative Filtering
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(interaction_matrix_np)

In [82]:
# Apply SVD (Singular Value Decomposition)
U, sigma, Vt = svds(interaction_matrix_np, k=5)
sigma = np.diag(sigma)
predicted_scores = np.dot(np.dot(U, sigma), Vt)
predicted_df = pd.DataFrame(predicted_scores, index=interaction_matrix.index, columns=interaction_matrix.columns)


In [83]:
# Function to get answered questions
def get_answered_questions(user_id):
    return set(dfQuestionUser[dfQuestionUser['user_id'] == user_id]['question_id'].tolist())


In [84]:
# CF Recommendation Function
def recommend_questions_collab(user_id, num_questions=5):
    if user_id not in predicted_df.index:
        return []

    answered_questions = get_answered_questions(user_id)
    sorted_questions = predicted_df.loc[user_id].sort_values(ascending=False)
    recommended_questions = [q for q in sorted_questions.index if q not in answered_questions]

    return recommended_questions[:num_questions]


# **4. Content-Based Filtering (CBF)**

In [85]:
# Combine textual features
dfQuestion.fillna('', inplace=True)
dfQuestion['combined'] = dfQuestion[['topic', 'tags', 'question']].apply(lambda x: ' '.join(x), axis=1)


In [86]:
# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer()
question_tfidf_matrix = vectorizer.fit_transform(dfQuestion['combined'])

In [87]:
# Apply Latent Semantic Analysis (LSA) for dimensionality reduction
lsa = TruncatedSVD(n_components=50)
question_tfidf_lsa = lsa.fit_transform(question_tfidf_matrix)

In [88]:
# Compute Cosine Similarity
question_sim_df = pd.DataFrame(cosine_similarity(question_tfidf_lsa),
                               index=dfQuestion['question_id'],
                               columns=dfQuestion['question_id'])

In [89]:
# Strip whitespace from column names
dfUsers.columns = dfUsers.columns.str.strip()

In [90]:
# CBF Recommendation Function
def recommend_questions_content(user_id, num_questions=5):
    user_data = dfUsers[dfUsers['user_id'] == user_id]
    if user_data.empty:
        return []

    answered_questions = get_answered_questions(user_id)

    user_prefs = ' '.join(user_data[['familiar_technologies']].values[0])
    user_vector = vectorizer.transform([user_prefs])

    similarity_scores = cosine_similarity(user_vector, question_tfidf_matrix).flatten()
    sorted_indices = similarity_scores.argsort()[::-1]

    recommended_questions = [
        dfQuestion.iloc[i]['question_id']
        for i in sorted_indices
        if dfQuestion.iloc[i]['question_id'] not in answered_questions
    ]

    # Convert np.int64 to regular int
    return [int(qid) for qid in recommended_questions[:num_questions]]


In [91]:
# Bandit-Based Recommender
class QuestionBanditRecommender:
    def __init__(self, questions_df, users_df, interactions_df):
        self.questions_df = questions_df
        self.users_df = users_df
        self.interactions_df = interactions_df
        self.successes = {}
        self.attempts = {}
        for qid in questions_df['question_id']:
            self.successes[qid] = 0
            self.attempts[qid] = 1
        for _, row in interactions_df.iterrows():
            qid = row['question_id']
            self.attempts[qid] += 1
            if row['answerd_correctly'] == 1:
                time = row['time_taken']
                reward = 1 if time <= 30 else 0.8 if time <= 60 else 0.5
                self.successes[qid] += reward

    def ucb_score(self, qid, total_attempts):
        mean_reward = self.successes[qid] / self.attempts[qid]
        return mean_reward + np.sqrt((2 * np.log(total_attempts)) / self.attempts[qid])

    def recommend(self, user_id, top_n=5):
        user_row = self.users_df[self.users_df['user_id'] == user_id]
        if user_row.empty:
            return pd.DataFrame()
        user = user_row.iloc[0]
        user_techs = [t.strip().lower() for t in str(user['familiar_technologies']).split(',')]
        user_level = str(user['expertise_level']).strip().lower()
        user_answers = self.interactions_df[self.interactions_df['user_id'] == user_id]
        answered_qids = set(user_answers['question_id'])
        available_questions = self.questions_df[~self.questions_df['question_id'].isin(answered_qids)]
        available_qids = available_questions['question_id'].tolist()
        total_attempts = sum(self.attempts.values())

        scored_qids = []
        for qid in available_qids:
            score = self.ucb_score(qid, total_attempts)
            question_row = self.questions_df[self.questions_df['question_id'] == qid].iloc[0]
            question_tags = [t.strip().lower() for t in str(question_row['tags']).split(',')]
            topic_match = any(tech in question_tags for tech in user_techs)
            difficulty_match = str(question_row['difficulty_level']).strip().lower() == user_level
            if topic_match:
                score *= 1.2
            if difficulty_match:
                score *= 1.1
            scored_qids.append((qid, score))

        scored_qids.sort(key=lambda x: x[1], reverse=True)
        top_questions = [qid for qid, _ in scored_qids[:top_n]]
        return self.questions_df[self.questions_df['question_id'].isin(top_questions)]

# Instantiate recommender
recommender = QuestionBanditRecommender(dfQuestion, dfUsers, dfQuestionUser)

In [92]:
# Clean Job Titles
unwanted_keywords = ["se", "software engineer", "associate", "full stack", "fullstack", "developer", "designer", "engineer"]
pattern = r'\b(?:' + '|'.join(map(re.escape, unwanted_keywords)) + r')\b'

dfJobTitles['Job Title'] = dfJobTitles['Job Title'].str.lower().str.replace(pattern, '', regex=True)
dfJobTitles['Job Title'] = dfJobTitles['Job Title'].str.replace(r'\s+', ' ', regex=True).str.strip()
job_keywords = dfJobTitles['Job Title'][dfJobTitles['Job Title'].str.strip() != ''].unique().tolist()

# Add Job Title Matching Score
dfQuestion = dfQuestion.applymap(lambda x: x.lower() if isinstance(x, str) else x)

def matches_job_title(row):
    return any(any(keyword in str(row[col]) for keyword in job_keywords) for col in ['question', 'topic', 'tags'])

dfQuestion['job_title_match'] = dfQuestion.apply(matches_job_title, axis=1).astype(int)

# Display Job Title-Based Recommendations (only)
def recommend_questions_job_title_only(user_id, num_questions=10):
    answered = get_answered_questions(user_id)
    job_related_questions = dfQuestion[(dfQuestion['job_title_match'] == 1) & (~dfQuestion['question_id'].isin(answered))]
    return job_related_questions.head(num_questions)['question_id'].tolist()


# print(dfQuestion.head(100))

  dfQuestion = dfQuestion.applymap(lambda x: x.lower() if isinstance(x, str) else x)


# **5. Hybrid Recommendation System**

In [93]:
# Hybrid Recommendation
def hybrid_recommendations(user_id, num_questions=10, alpha=0.35, beta=0.25, gamma=0.25, delta=0.15):
    collab_recs = recommend_questions_collab(user_id, num_questions * 2)
    content_recs = recommend_questions_content(user_id, num_questions * 2)
    bandit_df = recommender.recommend(user_id, top_n=num_questions * 2)
    bandit_recs = bandit_df['question_id'].tolist()

    score_dict = {}
    for i, qid in enumerate(collab_recs):
        score_dict[qid] = score_dict.get(qid, 0) + alpha * (1 / (i + 1))
    for i, qid in enumerate(content_recs):
        score_dict[qid] = score_dict.get(qid, 0) + beta * (1 / (i + 1))
    for i, qid in enumerate(bandit_recs):
        score_dict[qid] = score_dict.get(qid, 0) + gamma * (1 / (i + 1))
    for qid in score_dict:
        if dfQuestion[dfQuestion['question_id'] == qid]['job_title_match'].values[0]:
            score_dict[qid] += delta

    sorted_questions = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    top_ids = [qid for qid, _ in sorted_questions[:num_questions]]
    return dfQuestion[dfQuestion['question_id'].isin(top_ids)]


# **6. Test the Recommendation System**

In [94]:
# Test
user_id = 2
print("Collaborative Filtering Recommendations:", recommend_questions_collab(user_id))
print("Content-Based Filtering Recommendations:", recommend_questions_content(user_id))
bandit_recs = recommender.recommend(user_id)
bandit_ids = bandit_recs['question_id'].tolist()
print("Reinforcement Learning Recommendations:", bandit_ids)
print("Job Title-Based Recommendations:", recommend_questions_job_title_only(user_id))

print("Hybrid Recommendations (with job-title boosting):", hybrid_recommendations(user_id)['question_id'].tolist())

Collaborative Filtering Recommendations: [237, 111, 48, 300, 174]
Content-Based Filtering Recommendations: [21, 153, 79, 177, 199]
Reinforcement Learning Recommendations: [5, 7, 12, 19, 21]
Job Title-Based Recommendations: [1, 3, 9, 10, 13, 16, 20, 28, 31, 37]
Hybrid Recommendations (with job-title boosting): [5, 21, 67, 70, 73, 111, 142, 145, 205, 237]
