In [1]:
# importing libraries
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# reading file
univ_description = pd.read_csv('description.csv', encoding = 'latin-1')

In [3]:
# checking if we have the right data
univ_description.head()

Unnamed: 0,university_id,university_name
0,1003,Alabama A & M U
1,3840,Alabama Library Association
2,1248,Alabama Public Library Service
3,1006,Alabama State U
4,9776,Amridge U


In [4]:
# removing the stop words
univ_tfidf = TfidfVectorizer(stop_words='english')
# replace NaN with empty strings
univ_description['university_name'] = univ_description['university_name'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
univ_description_matrix = univ_tfidf.fit_transform(univ_description['university_name'])

univ_description_matrix

<97x139 sparse matrix of type '<class 'numpy.float64'>'
	with 261 stored elements in Compressed Sparse Row format>

In [5]:
# Let's check the shape of computed matrix
univ_description_matrix.shape


(97, 139)

In [6]:
# compuing cosine similarity matrix using linear_kernal of sklearn
cosine_similarity = linear_kernel(univ_description_matrix, univ_description_matrix)
#cosine_similarity.describe()


In [7]:

# Get the pairwsie similarity scores of all universities compared to the univ passed by index, sorting them and getting top 5
# here 2 is the index of the univ in dataset
similarity_scores = list(enumerate(cosine_similarity[2]))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
similarity_scores = similarity_scores[1:5]                                     

# Get the similar university index
univ_index = [i[0] for i in similarity_scores]

    # Return the top 5 most similar universities using integer-location based indexing (iloc)
print (univ_description['university_name'].iloc[univ_index])


1     Alabama Library Association
0                 Alabama A & M U
3                 Alabama State U
34           U Alabama Birmingham
Name: university_name, dtype: object
