# TF-IDF for 3ml Stories

We build a story recommender using TF-IDF and cosine similarity.

The data is exported from the 3ml database by running the following command inside the `psql` REPL:

```
\copy (select id,title,content from story) to stories.csv with csv delimiter ',' HEADER
```

Alternatively, run `psql` directly from the shell:

```
psql --csv -d my3ml -o stories.csv -c 'select id,title,content from story'
```

In [104]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
#string.punctuation

[nltk_data] Downloading package stopwords to /home/tekul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tekul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [105]:
def clean_text(document):
    """Normalise a story's raw text into a string of stemmed tokens.

    The text has ``&quot;`` entities and underscores removed, is tokenised
    with ``word_tokenize``, lower-cased, stripped of punctuation tokens and
    English stop words, and finally stemmed with ``PorterStemmer``.

    Args:
        document: Raw story content as a string.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    document = document.replace('&quot;', '').replace('_', '')
    tokens = [token.lower() for token in word_tokenize(document)]

    # Drop tokens made up solely of punctuation characters. Testing
    # `token in string.punctuation` would be a substring check against one
    # long string, so multi-character tokens such as "..." or "--" would
    # not be recognised as punctuation.
    punctuation = set(string.punctuation)
    tokens = [
        token for token in tokens
        if not all(char in punctuation for char in token)
    ]

    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in tokens)

In [106]:
stories = pd.read_csv('./stories.csv')
# Empty content fields are read as NaN (a float), which clean_text cannot
# process because it calls string methods. Treat them as empty text so they
# are cleaned to '' and then removed by the length filter below.
stories['content'] = stories['content'].fillna('').apply(clean_text)
# Skip short content: keep only stories whose cleaned text is longer than
# 30 characters.
stories = stories[stories['content'].str.len() > 30]
# Renumber the rows 0..n-1 so that row labels match row positions.
stories = stories.reset_index(drop=True)
#stories.head()

In [107]:
# The cleaned story texts, plus a lookup from story id to row label.
texts = stories['content']
indices = pd.Series(stories.index, index=stories['id'])

# Build the TF-IDF matrix. Terms found in more than half of the documents
# (max_df) or in fewer than two documents (min_df) are left out.
vectorizer = TfidfVectorizer(stop_words="english", min_df=2, max_df=0.5)
tfidf_matrix = vectorizer.fit_transform(texts)
features = vectorizer.get_feature_names_out()

# Pairwise cosine similarity between every pair of stories.
similarity = cosine_similarity(tfidf_matrix)

In [110]:
# Write the vocabulary to features.txt, one feature per line. The encoding
# is given explicitly so the output does not depend on the platform default.
with open('features.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{feature}\n" for feature in features)
len(features)

4617

In [111]:
# Select the row(s) whose title is exactly 'Making muscles move'.
muscles = stories.loc[stories['title'] == 'Making muscles move']

In [112]:
def get_similar_stories(story, count=10):
    """Return the ids of the stories most similar to ``story``.

    Relies on the module-level ``indices`` (story id -> row position),
    ``similarity`` (pairwise score matrix) and ``stories`` dataframe.

    Args:
        story: A row of ``stories``, or anything indexable by ``'id'``.
        count: Maximum number of neighbours to return (default 10).

    Returns:
        A list of story ids ordered from most to least similar. The id of
        ``story`` itself is never included.
    """
    story_id = story['id']
    # Get the actual index in the stories dataframe from the id
    index = indices.loc[story_id]
    # Pair each position with its score so the position survives sorting.
    # The story's own row is excluded explicitly: dropping "the first result"
    # is wrong when another story ties with it (e.g. duplicate content) and
    # sorts ahead of it.
    scores = [
        (position, score)
        for position, score in enumerate(similarity[index])
        if position != index
    ]
    # Sort by score, best first; ties keep their original row order.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    # Use the positions of the closest matches to look up their ids.
    sim_indices = [position for position, _ in scores[:count]]
    return stories['id'].iloc[sim_indices].tolist()

In [113]:
# Spot check: neighbour ids for the first story in the filtered dataframe.
get_similar_stories(stories.iloc[0])

[968, 964, 963, 965, 985, 966, 967, 989, 988, 969]

In [114]:
# Compute the list of neighbour ids for every story, one call per row.
stories['nn'] = stories.apply(get_similar_stories, axis=1)

In [115]:
# Leave the text columns out of the exported data.
result = stories.drop(['content', 'title'], axis='columns')

In [116]:
# Export the remaining columns to stories_nn.json, one record per story.
result.to_json("stories_nn.json", orient="records")