In [None]:
#Import necessary libraries
import pandas as pd
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
#Read the organisation features and label the unnamed first CSV column 'name'
df = pd.read_csv("organization_features.csv").rename(columns={'Unnamed: 0': 'name'})

#Summarise the columns, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
#Create the stop word list and run TF-IDF on the 'name', 'bio' and 'type' columns
# Generic organisation words are added to scikit-learn's built-in English stop words.
# The capitalised variants are kept so the set's contents are unchanged, although the
# vectorizer lowercases text by default before stop words are filtered.
my_stop_words = text.ENGLISH_STOP_WORDS.union([
    "Association", "Guild", "American", "Institute",
    "association", "guild", "american", "institute",
    "Board", "Council", "Chamber", "board", "council", "chamber",
    "Alliance", "Society", "alliance", "society",
    "Club", "club", "Center", "Bureau", "center", "bureau",
])

# frozenset.union() returns a frozenset, but stop_words is documented as accepting only
# 'english', a list or None; scikit-learn releases that validate parameters reject a
# frozenset at fit time. Pass a (sorted, hence deterministic) list instead.
tfidf = TfidfVectorizer(stop_words=sorted(my_stop_words))

# TfidfVectorizer cannot process NaN, so missing values become empty strings
df = df.fillna('')

# Each fit_transform call re-fits the same vectorizer on one column and returns that
# column's document-term matrix; after the last call `tfidf` holds the 'type' vocabulary only.
tfidf_name_matrix = tfidf.fit_transform(df['name'])

tfidf_bio_matrix = tfidf.fit_transform(df['bio'])

tfidf_type_matrix = tfidf.fit_transform(df['type'])

# (number of organisations, number of distinct bio terms)
tfidf_bio_matrix.shape

In [None]:
#Apply the linear kernel to each TF-IDF matrix
# One square matrix per column, holding the dot product of every pair of rows. With the
# vectorizer's default L2 normalisation this dot product is the cosine similarity.
name_cosine_sim, bio_cosine_sim, type_cosine_sim = (
    linear_kernel(matrix, matrix)
    for matrix in (tfidf_name_matrix, tfidf_bio_matrix, tfidf_type_matrix)
)

In [None]:
#Construct a reverse map from org name to its row in df
indices = pd.Series(df.index, index=df['name'])
# Series.drop_duplicates() compares the *values* (row labels, already unique), so it never
# removed repeated names. De-duplicate on the index instead, keeping the first occurrence,
# so that indices[name] is always a single row rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

#Relative weight of the name, bio and type similarities, in that order
weights = [0.35, 0.6, 0.05]

def get_recommendations(name, name_cosine_sim=weights[0]*name_cosine_sim, bio_cosine_sim= weights[1]*bio_cosine_sim, type_cosine_sim=weights[2]*type_cosine_sim):
    """Return the names of the 10 organisations most similar to ``name``.

    The score of each candidate is the sum of its name, bio and type similarity to the
    queried organisation. The default matrices are already scaled by ``weights`` (the
    scaling happens once, when this ``def`` is executed), so the sum is a weighted blend.

    Parameters
    ----------
    name : str
        Organisation name exactly as it appears in ``df['name']``.
    name_cosine_sim, bio_cosine_sim, type_cosine_sim : 2-D array
        Pairwise similarity matrices, indexed by row on both axes. Matrices passed
        explicitly are used as given (no weighting is applied to them here).

    Returns
    -------
    pandas.Series
        Up to 10 organisation names, most similar first; the queried organisation
        itself is never included.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``indices``.
    """
    # Row of the org that matches the name.
    # Assumes df has the default 0..n-1 index, so the label doubles as a matrix position.
    idx = indices[name]

    # One blended score per organisation: add the three similarity rows element-wise.
    # (Concatenating the rows instead would rank 3*N unrelated entries and could return
    # the same organisation, including the queried one, several times.)
    combined_scores = name_cosine_sim[idx] + bio_cosine_sim[idx] + type_cosine_sim[idx]

    # Pair every score with its row and drop the queried org explicitly; relying on it
    # sorting first breaks when another org ties with it.
    sim_scores = [(i, score) for i, score in enumerate(combined_scores) if i != idx]

    # Sort the orgs by blended similarity, best first (stable, so ties keep row order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Keep the 10 most similar orgs
    sim_scores = sim_scores[:10]

    # Get the org rows
    name_indices = [i[0] for i in sim_scores]

    # Return the names of the top 10 most similar orgs
    return df['name'].iloc[name_indices]

In [None]:
#Smoke test: display the recommendations for one known organisation
get_recommendations(name='American Institute of Commemorative Art')