In [2]:
!pip install rake_nltk



In [6]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Let pandas render up to 100 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 100)
# Load the challenge data; the path is relative to the working directory.
df = pd.read_csv('challenge_data2.csv')
# Preview the first rows.
df.head()

Unnamed: 0,0,_id,title,description,challenge,category
0,1,60b54873f81c532bd894292b,Fast fashion,Rejecting fast fashion could prevent an estima...,This week you will reflect on your shopping ha...,Recycling
1,2,60b54873f81c532bd8942929,Home energy,With household energy accounting for around 30...,During this week you should pay extra attentio...,Energy
2,3,60b54873f81c532bd8942920,Eat less meat and dairy,Avoiding meat and dairy products is one of the...,For the next week you are going to eat meat an...,Food
3,4,60b54873f81c532bd8942923,Conscious nutrition,It is important to be mindful of the water tha...,"Respecting your budget, try to maximize the pe...",Food
4,5,60b54873f81c532bd8942924,Raw,"Not all foods should be eaten raw, but many ve...",This week you should eat 3 meals with a 75% or...,Food


In [7]:
# Narrow the frame to the three fields the later cells read:
# the title, the challenge text and the category.
df = df.loc[:, ['title', 'challenge', 'category']]
df.head()

Unnamed: 0,title,challenge,category
0,Fast fashion,This week you will reflect on your shopping ha...,Recycling
1,Home energy,During this week you should pay extra attentio...,Energy
2,Eat less meat and dairy,For the next week you are going to eat meat an...,Food
3,Conscious nutrition,"Respecting your budget, try to maximize the pe...",Food
4,Raw,This week you should eat 3 meals with a 75% or...,Food


In [8]:
def _extract_key_words(text):
  """Return the words RAKE keeps from ``text`` as a list.

  A new ``Rake`` instance is created for every call, so nothing is shared
  between rows. The result is the key set of ``get_word_degrees()``; the
  degree scores themselves are discarded.
  """
  rake = Rake()
  rake.extract_keywords_from_text(text)
  return list(rake.get_word_degrees().keys())


# Build the column in one assignment instead of writing into the rows yielded
# by ``iterrows()``: those rows may be copies, in which case the write never
# reaches the frame and ``Key_words`` silently stays empty.
# The raw challenge text is dropped once the key words have been extracted.
df = (
    df.assign(Key_words=df['challenge'].apply(_extract_key_words))
      .drop(columns=['challenge'])
)
df.head()

Unnamed: 0,title,category,Key_words
0,Fast fashion,Recycling,"[recycled, fabrics, local, week, new, apparel,..."
1,Home energy,Energy,"[leaving, also, includes, week, low, temperatu..."
2,Eat less meat and dairy,Food,"[dairy, product, going, eat, meat, next, week,..."
3,Conscious nutrition,Food,"[try, organic, groceries, percentage, week, re..."
4,Raw,Food,"[week, raw, vegetables, eat, 3, meals, 75]"


In [9]:
# Make the challenge title the row label; rebinding (rather than mutating in
# place) leaves `df` indexed by title for the cells below.
df = df.set_index('title')
df.head()

Unnamed: 0_level_0,category,Key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Fast fashion,Recycling,"[recycled, fabrics, local, week, new, apparel,..."
Home energy,Energy,"[leaving, also, includes, week, low, temperatu..."
Eat less meat and dairy,Food,"[dairy, product, going, eat, meat, next, week,..."
Conscious nutrition,Food,"[try, organic, groceries, percentage, week, re..."
Raw,Food,"[week, raw, vegetables, eat, 3, meals, 75]"


In [10]:
# Build one text field per challenge: the category followed by the key words,
# separated by single spaces.
# This is computed column-wise instead of assigning into the rows yielded by
# ``iterrows()``, because those rows may be copies and the write can be lost.
# The two source columns are named explicitly so that no other column can be
# pulled into the text by accident.
# The result is a frame with the same index and only ``bag_of_words``.
df = (
    df['category'] + ' ' + df['Key_words'].map(' '.join)
).to_frame('bag_of_words')

In [11]:
# Preview the frame after the previous cell reduced it to `bag_of_words`.
df.head()

Unnamed: 0_level_0,bag_of_words
title,Unnamed: 1_level_1
Fast fashion,Recycling recycled fabrics local week new appa...
Home energy,Energy leaving also includes week low temperat...
Eat less meat and dairy,Food dairy product going eat meat next week tw...
Conscious nutrition,Food try organic groceries percentage week res...
Raw,Food week raw vegetables eat 3 meals 75


In [12]:
# Fit a CountVectorizer on the bag-of-words text and keep the resulting
# document-term matrix (one row per challenge).
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Positional lookup table: integer position -> challenge title, in the same
# row order as `df` and therefore as `count_matrix`.
indices = pd.Series(df.index)
indices.head()

0               Fast fashion
1                Home energy
2    Eat less meat and dairy
3        Conscious nutrition
4                        Raw
Name: title, dtype: object

In [13]:
# Pairwise cosine similarity between all challenge rows; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.08998425, 0.06537205, 0.12403473, 0.07412493,
        0.06201737, 0.05913124, 0.13867505, 0.05387725, 0.07161149,
        0.12838815, 0.04499213, 0.03846154],
       [0.08998425, 1.        , 0.07647191, 0.07254763, 0.086711  ,
        0.07254763, 0.13834289, 0.10814761, 0.1260507 , 0.08377078,
        0.10012523, 0.05263158, 0.08998425],
       [0.06537205, 0.07647191, 1.        , 0.21081851, 0.37796447,
        0.31622777, 0.10050378, 0.31426968, 0.04578685, 0.12171612,
        0.0727393 , 0.07647191, 0.06537205],
       [0.12403473, 0.07254763, 0.21081851, 1.        , 0.23904572,
        0.3       , 0.28603878, 0.2236068 , 0.04343722, 0.05773503,
        0.13801311, 0.07254763, 0.06201737],
       [0.07412493, 0.086711  , 0.37796447, 0.23904572, 1.        ,
        0.35856858, 0.11396058, 0.26726124, 0.05191741, 0.06900656,
        0.08247861, 0.086711  , 0.07412493],
       [0.06201737, 0.07254763, 0.31622777, 0.3       , 0.35856858,
        1.        , 0.19069

In [14]:
# Sanity check: the similarity matrix should be square, one row/column per challenge.
cosine_sim.shape

(13, 13)

In [15]:
def recommendations(title, cosine_sim = cosine_sim, top_n = 3):
  """Return the titles of the challenges most similar to ``title``.

  Only an exact challenge title is accepted; the lookup is done against the
  module-level ``indices`` series and the result titles are read from
  ``df.index``.

  Parameters
  ----------
  title : str
      Title of the challenge to find neighbours for. If several rows share
      the title, the first one is used.
  cosine_sim : array-like
      Square similarity matrix whose rows follow the order of ``indices``.
      Note the default is bound when this ``def`` runs, so re-run this cell
      after recomputing ``cosine_sim``.
  top_n : int, default 3
      Maximum number of recommendations to return.

  Returns
  -------
  list
      Up to ``top_n`` titles, most similar first. The queried challenge is
      never part of the result.

  Raises
  ------
  IndexError
      If ``title`` is not present in ``indices``.
  ValueError
      If ``top_n`` is negative.
  """
  if top_n < 0:
    raise ValueError("top_n must be non-negative")

  matches = indices[indices == title].index
  if len(matches) == 0:
    raise IndexError("no challenge titled {!r}".format(title))
  idx = matches[0]

  # Remove the queried row by position instead of assuming it sorts first:
  # another challenge can tie with it at similarity 1.0, in which case
  # skipping "the first hit" could drop a real match and return the query.
  scores = pd.Series(cosine_sim[idx]).drop(idx)

  # A stable sort keeps tied scores in their original row order.
  best = scores.sort_values(ascending = False, kind = 'mergesort').head(top_n)

  return [df.index[i] for i in best.index]

In [17]:
# Example: the challenges most similar to "Fast fashion".
recommendations('Fast fashion')

['Local and seasonal food', 'Single use', 'Conscious nutrition']