In [30]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Source CSV; the cells below read its 'Reviews' and 'First Name' columns.
file_path_tfidf = "clean_data_with_tags_comments_and_one_hot.csv"
tfidf_data = pd.read_csv(filepath_or_buffer=file_path_tfidf)

In [31]:
# Missing reviews become empty strings so the vectorizer only ever receives text.
reviews = tfidf_data['Reviews'].fillna(value='')

# One TF-IDF row per review; English stop words are dropped from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

# Square matrix: entry [i, j] is the cosine similarity between review rows i and j.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_professors(professor_name, top_n=5):
    """Return the professors whose reviews are most similar to the given one.

    The professor is looked up in the 'First Name' column of the module-level
    ``tfidf_data`` frame, and every other row is ranked by its entry in the
    matching row of the module-level ``cosine_sim`` matrix.

    Args:
        professor_name: First name to look up. The title-cased form is tried
            first; if that is absent, a case-insensitive match is used so
            names stored with inner capitals (e.g. "McKay") are still found.
        top_n: Maximum number of recommendations to return (default 5).
            Values below 1 yield an empty list.

    Returns:
        A list of ``(first_name, similarity)`` tuples ordered by descending
        similarity, or the string ``"Professor <Name> not found in the
        dataset."`` when no row matches.

    Note:
        When several rows share the same first name, the first matching row
        is the one used as the query.
    """
    # Title-cased form: used for the exact lookup and for the "not found" message.
    titled_name = professor_name.title()
    first_names = tfidf_data['First Name']

    # Prefer the exact title-case match; fall back to a case-insensitive one.
    matches = (first_names == titled_name).to_numpy()
    if not matches.any():
        folded = first_names.fillna('').astype(str).str.casefold()
        matches = (folded == professor_name.casefold()).to_numpy()
    if not matches.any():
        return f"Professor {titled_name} not found in the dataset."

    # Row *position* (not index label) of the first match: cosine_sim is
    # addressed positionally, so this stays correct for a non-default index.
    prof_index = int(matches.argmax())

    # Drop the queried professor by position instead of assuming they sort
    # first: an empty review gives self-similarity 0, and ties can reorder rows.
    candidates = [
        (row, score)
        for row, score in enumerate(cosine_sim[prof_index])
        if row != prof_index
    ]
    # sorted() is stable, so equal scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_matches = candidates[:max(top_n, 0)]
    return [(first_names.iloc[row], score) for row, score in top_matches]

# Demo: the ten closest matches for the professor whose first name is "mohammad"
# (lowercase input is accepted by the lookup).
recommendations = recommend_professors(professor_name="mohammad", top_n=10)
recommendations

[('Max', 0.49937324453622917),
 ('Chris', 0.49637869329867274),
 ('Mario', 0.49060635586433476),
 ('Grigorios', 0.48933110233686283),
 ('Ellen', 0.48771978937346544),
 ('Frederick', 0.47973096909279805),
 ('Sali', 0.4796892210462402),
 ('Brendan', 0.474275909764849),
 ('Cecelia', 0.47368049514750327),
 ('Robert', 0.4696452200643954)]