In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [15]:
# Load the article list; a missing description becomes an empty string so
# every row can be passed to the text vectorizer.
df = (
    pd.read_csv('news_list.csv', low_memory=False)
    .assign(description=lambda frame: frame['description'].fillna(''))
)

In [16]:
# Build a TF-IDF vectorizer that ignores English stop words
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the article descriptions and encode each
# description as one row of a (documents x terms) TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['description'])
# Trailing expression: the notebook displays the matrix shape as the cell output
tfidf_matrix.shape

(8443, 21709)

In [17]:
# Compute the pairwise similarity between every pair of article descriptions.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
# NOTE(review): the result has one row and one column per article, so memory
# grows quadratically with the number of articles.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
# Reverse lookup: article title -> row number in df.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique), so it never removed repeated titles and a lookup on such a
# title returned a Series instead of a single integer. De-duplicate on the
# title index instead, keeping the first occurrence of each title.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend = 10):
    """Return the titles of the articles most similar to the one named ``title``.

    NOTE(review): a later cell defines another function with this same name;
    when the cells run in order that later definition replaces this one.

    Parameters
    ----------
    title : str
        Exact title of the query article, looked up in the module-level
        ``indices`` Series.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of article
        ``i`` to every article, in the row order of ``df``.
    num_recommend : int, optional
        Maximum number of titles to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of the most similar articles, most similar first. The query
        article itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # row numbers; fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* article with its similarity to the query. The query is
    # excluded by row number rather than by assuming it sorts first: an article
    # with an identical description ties with it, and an article with an empty
    # description has zero similarity even to itself.
    sim_scores = [
        (row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx
    ]
    # Most similar first (stable sort, so ties keep their original row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Keep only the requested number of neighbours
    article_rows = [row for row, _ in sim_scores[:num_recommend]]
    return df['title'].iloc[article_rows]

In [19]:
def get_recommendations(keyword, cosine_sim=cosine_sim, num_recommend=10):
    """Recommend articles related to those that mention ``keyword``.

    Every row of ``df`` whose title or description contains ``keyword`` is
    used as a seed. Each article is then scored by its mean similarity to the
    seeds and the highest-scoring titles are returned. Seed articles are not
    excluded, so they may appear in the result themselves.

    Parameters
    ----------
    keyword : str
        Text to search for. Matching is case-insensitive and literal (the
        keyword is not interpreted as a regular expression, so input such as
        ``"c++"`` or ``"U.S."`` is safe).
    cosine_sim : array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``df``.
    num_recommend : int, optional
        Maximum number of titles to return (default 10).

    Returns
    -------
    pandas.Series or str
        The recommended titles, best score first, or a plain message string
        when no row mentions ``keyword``.
    """
    # na=False: a missing title/description counts as "no match" instead of
    # leaking NaN into the boolean mask.
    search_options = dict(case=False, regex=False, na=False)
    in_title = df['title'].str.contains(keyword, **search_options)
    in_description = df['description'].str.contains(keyword, **search_options)
    match_mask = in_title | in_description

    # If there are no matches, return a message
    if not match_mask.any():
        return "No matches found for the keyword: {}".format(keyword)

    # cosine_sim is addressed by row position, so derive positions from the
    # mask rather than relying on df's index labels being 0..n-1.
    seed_positions = match_mask.to_numpy().nonzero()[0]

    # Average similarity of every article to the seed articles
    avg_sim_scores = cosine_sim[seed_positions].mean(axis=0)

    # Positions of the highest average scores, best first
    top_positions = avg_sim_scores.argsort()[::-1][:num_recommend]

    return df['title'].iloc[top_positions]

In [25]:
# Demo: show the 20 titles that score highest for articles mentioning "business"
get_recommendations('business', num_recommend = 20)

7775    6 insider secrets for turning your business id...
7331    Explore the 3 most popular spots in New Orlean...
6421    13 money management mistakes small business ow...
4116    Operating costs: Understanding & reducing them...
4512    New yoga spot Big Shoulders Yoga now open in W...
7414             Actors and actresses with famous mothers
4182     Survey: US business hiring falls to a 7-year low
6792    These chains offer the best value in fast food...
7752    If you're a business traveler who prefers Marr...
4154    The 8 fast-food chains with the cleanest resta...
278            The Elite, The Meh, and The Ugly - Rutgers
5895             Doner Box brings German fare to Belltown
7991    Petra Cafe brings Greek fare to Keystone At Th...
2926    Local businesses affected after Kettering DP&L...
1316    Eric Tse, 24, just became a billionaire overnight
4385     Here's Why the Best Is Yet to Come for Microsoft
3840    Till may move back to welterweight, wants to a...
8102    Micros