In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw projects table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../../data/projects.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,title,tagline,description,userID,tags,category
0,41edbc3e-2636-4806-bd87-4c19f273bb3c,Test Project,some tagline,some description,5f5a90e2-6601-480c-bc08-7592e06beacd,"['coding', 'new-tag']",Health and Wellness
1,467f58b6-c640-493a-aceb-36dea9881778,To the Cosmos,Where Dreams Touch the Stars,Cosmos is a visionary project that propels you...,0db6b893-14aa-4ae5-8e57-692ca7943ab5,"['exploration', 'beyondearth', 'software', 'st...",Travel and Exploration
2,6f6235eb-a751-43f2-8bad-506c884635db,Cosmos,Enter the starline,"Lorem ipsum dolor sit, amet consectetur adipis...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"['astronomy', 'physics', 'website']",Astronomy
3,b5a0f61f-6ae4-4036-97c1-beb0b5605a7a,Space Explorer,To the space and beyond,Sky Explorer is an ambitious and groundbreakin...,0db6b893-14aa-4ae5-8e57-692ca7943ab5,"['space', 'exploration', 'app', 'spacetravel',...",Travel and Exploration
4,45ba1903-cafe-4e57-b7ec-1de402d32cc1,Interact,"create, connect, collaborate","Lorem ipsum dolor sit, amet consectetur adipis...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"['js', 'ts', 'students']",tech


In [4]:
# Split the free-text columns on whitespace so each cell becomes a list of
# tokens that can later be concatenated with the tag lists.
#
# Guarding on the value type matters for two reasons:
#   * read_csv yields NaN (a float) for empty fields, and a float has no
#     .split(), so one project without a tagline would abort the whole cell;
#   * re-running this cell would otherwise fail because the values are
#     already lists. Lists are passed through untouched, which makes the
#     cell idempotent.
for text_col in ('tagline', 'description', 'category'):
    df[text_col] = df[text_col].apply(
        lambda x: x.split() if isinstance(x, str)
        else (x if isinstance(x, list) else [])
    )

In [5]:
import ast

def parse(obj):
    """Evaluate a stringified Python literal, falling back to the input itself.

    Parameters
    ----------
    obj : Any
        Value to interpret; typically a string such as ``"['a', 'b']"``.

    Returns
    -------
    Any
        The evaluated literal when ``obj`` is a valid literal expression,
        otherwise ``obj`` unchanged (e.g. for non-strings or malformed text).
    """
    try:
        return ast.literal_eval(obj)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the failures literal_eval documents for malformed input.
        # A bare ``except`` would additionally swallow KeyboardInterrupt and
        # SystemExit, making the cell impossible to interrupt cleanly.
        return obj

# Replace each stringified tag list with the value returned by `parse`.
parsed_tags = df['tags'].apply(parse)
df['tags'] = parsed_tags

In [6]:
# Row-wise concatenation of the four token lists into one keyword list.
df['keys'] = (
    df['tagline']
    + df['description']
    + df['tags']
    + df['category']
)

In [7]:
# Preview the first five rows, now including the combined `keys` column.
df.head(n=5)

Unnamed: 0,id,title,tagline,description,userID,tags,category,keys
0,41edbc3e-2636-4806-bd87-4c19f273bb3c,Test Project,"[some, tagline]","[some, description]",5f5a90e2-6601-480c-bc08-7592e06beacd,"[coding, new-tag]","[Health, and, Wellness]","[some, tagline, some, description, coding, new..."
1,467f58b6-c640-493a-aceb-36dea9881778,To the Cosmos,"[Where, Dreams, Touch, the, Stars]","[Cosmos, is, a, visionary, project, that, prop...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"[exploration, beyondearth, software, stargazin...","[Travel, and, Exploration]","[Where, Dreams, Touch, the, Stars, Cosmos, is,..."
2,6f6235eb-a751-43f2-8bad-506c884635db,Cosmos,"[Enter, the, starline]","[Lorem, ipsum, dolor, sit,, amet, consectetur,...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"[astronomy, physics, website]",[Astronomy],"[Enter, the, starline, Lorem, ipsum, dolor, si..."
3,b5a0f61f-6ae4-4036-97c1-beb0b5605a7a,Space Explorer,"[To, the, space, and, beyond]","[Sky, Explorer, is, an, ambitious, and, ground...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"[space, exploration, app, spacetravel, nextgen]","[Travel, and, Exploration]","[To, the, space, and, beyond, Sky, Explorer, i..."
4,45ba1903-cafe-4e57-b7ec-1de402d32cc1,Interact,"[create,, connect,, collaborate]","[Lorem, ipsum, dolor, sit,, amet, consectetur,...",0db6b893-14aa-4ae5-8e57-692ca7943ab5,"[js, ts, students]",[tech],"[create,, connect,, collaborate, Lorem, ipsum,..."


In [8]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

# Single shared stemmer instance, used by `stem` below.
ps = PorterStemmer()

def stem(x):
    """Reduce a token list to a space-separated string of unique stems.

    Each token is lower-cased; English stop words and bare punctuation are
    dropped; the remaining tokens are stemmed with the module-level ``ps``
    stemmer. Every stem is emitted once, in order of first appearance.

    Parameters
    ----------
    x : iterable of str
        Tokens to normalise.

    Returns
    -------
    str
        The unique stems joined by single spaces ("" for empty input).
    """
    # Build the stop-word lookup once per call; previously the full list was
    # re-evaluated and scanned linearly for every single token.
    stop_words = set(stopwords.words("english"))

    seen = set()
    stems = []
    for token in x:
        token = token.lower()
        # `in string.punctuation` is a substring test: it filters the empty
        # string and tokens that are bare punctuation marks.
        if token in stop_words or token in string.punctuation:
            continue
        stemmed = ps.stem(token)
        # De-duplicate on the stem itself. Comparing the raw token against
        # the list of stems let repeated words through whenever stemming
        # changed their spelling.
        if stemmed not in seen:
            seen.add(stemmed)
            stems.append(stemmed)
    return " ".join(stems)

In [9]:
# Collapse every keyword list into the string produced by `stem`.
stemmed_keys = df['keys'].apply(stem)
df.loc[:, 'keys'] = stemmed_keys

In [10]:
# Keep only the identifier, the title and the text features.
df = df.loc[:, ['id', 'title', 'keys']]

In [11]:
# Inspect the last five rows of the trimmed frame.
df.tail(n=5)

Unnamed: 0,id,title,keys
62,e8b38942-d6d0-41a4-a000-dfcf02207e24,Eco-Friendly Adventure Tours,explor natur respons offer eco-friendli advent...
63,84274436-16c5-45cc-98d4-0e8a7013aa2e,Green Transportation Solutions,eco-friendli commut made easi develop comprehe...
64,4468fecd-37cd-4f84-a998-ac868c0a3268,Sustainable Travel Accommodations,stay green explor curat network eco-friendli a...
65,8d366caf-ce12-4a6a-95ad-08b2cada8c18,Eco-Friendly Travel Booking Service,travel sustain us creat book platform promot e...
66,e00a9008-a1b4-4275-8d15-b5c977a5c7b6,Global Eco-Tourism Initiative,preserv natur respons travel launch global ini...


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser with the vocabulary capped at 5000 features.
cv = CountVectorizer(max_features=5_000)

In [13]:
# Fit the vocabulary on the stemmed keywords, then densify the resulting
# sparse count matrix.
term_counts = cv.fit_transform(df['keys'])
vectors = term_counts.toarray()

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in `vectors`.
similarities = cosine_similarity(X=vectors)

In [15]:
# Sanity check: the matrix should be square, one row and column per project.
np.shape(similarities)

(67, 67)

In [16]:
def recommend(project_id, top_n=5):
    """Return the projects most similar to ``project_id``.

    Reads the module-level ``df`` (columns ``id`` and ``title``) and the
    ``similarities`` matrix, whose row/column order is the row order of ``df``.

    Parameters
    ----------
    project_id : str
        Identifier of the query project; matched case-insensitively.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (id, title) tuples
        Ordered from most to least similar, never containing the query
        project itself. Empty when ``project_id`` is not a string or does not
        match any row.
    """
    if not isinstance(project_id, str):
        return []

    # Look the project up by row *position*: `similarities` is indexed
    # positionally, so using the DataFrame's index label would silently pick
    # the wrong row whenever the index is not the default 0..n-1 range.
    lowered_ids = df['id'].str.lower().tolist()
    try:
        project_pos = lowered_ids.index(project_id.lower())
    except ValueError:
        return []

    scores = similarities[project_pos]

    # Exclude the query row explicitly instead of slicing off the first
    # ranked entry: with tied scores (e.g. duplicate projects) the query is
    # not guaranteed to sort first, so slicing could return the project
    # itself and drop a genuine neighbour.
    candidates = [pos for pos in range(len(scores)) if pos != project_pos]
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)
    best = ranked[:max(top_n, 0)]

    return [(df.iloc[pos]['id'], df.iloc[pos]['title']) for pos in best]

In [17]:
# Example query for the project "Sustainable Travel Accommodations".
recommend(project_id="4468fecd-37cd-4f84-a998-ac868c0a3268")

[('2340c93c-4e74-458a-b7f4-4f7546cb0bb5',
  'Eco-Friendly Travel Accommodations'),
 ('69aff6b3-d087-4685-a1c1-01cc579cffbc',
  'Eco-Friendly Travel Booking Service'),
 ('8d366caf-ce12-4a6a-95ad-08b2cada8c18',
  'Eco-Friendly Travel Booking Service'),
 ('b12370f6-2203-495f-b7d9-bead7917cb55', 'Sustainable Tourism Booking'),
 ('e8b38942-d6d0-41a4-a000-dfcf02207e24', 'Eco-Friendly Adventure Tours')]

In [19]:
import pickle

# Serialise the similarity matrix to disk in binary pickle format.
with open('../../models/projects/similarities.pickle', mode='wb') as out_file:
    pickle.dump(similarities, out_file)