# Recommendation Engines
_____

### Finding Similar Documents

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Create the Document Term Matrix.
# Plain term counts are used here; swap in CountVectorizer(stop_words='english')
# to drop English stop words, or a TF-IDF vectorizer to weight the terms.
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary in column order.
# NOTE(review): the index labels assume `documents` holds exactly three
# documents in this order -- confirm against where `documents` is defined.
df = pd.DataFrame(doc_term_matrix,
                  columns=count_vectorizer.get_feature_names_out(),
                  index=['doc_trump', 'doc_election', 'doc_putin'])

# Pairwise cosine similarity between every pair of documents (rows of df).
print(cosine_similarity(df, df))

### Implicit Data Example - Using Cosine Similarity

In [None]:
def build_user_item_matrix(self, max_users: int, item: str) -> np.ndarray:
    """Build a binary user/item interaction matrix for one item column.

    Args:
        max_users: Number of rows to allocate in the matrix.
        item: Name of the column in ``self.data`` that holds the item ids.

    Returns:
        Array of shape ``(max_users, max(self.data[item]))`` holding 1 at
        ``[user_handle - 1, item_id - 1]`` for every row of ``self.data``
        and 0 everywhere else.

    Raises:
        IndexError: If a shifted user or item id falls outside the matrix.
    """
    # Ids are shifted down by one to become zero-based positions, so both
    # columns are presumably 1-based integer ids -- confirm against the data.
    user_idx = self.data["user_handle"].to_numpy() - 1
    item_idx = self.data[item].to_numpy() - 1

    matrix = np.zeros(shape=(max_users, self.data[item].max()))
    # A single fancy-indexed assignment replaces the per-row loop; reading the
    # two columns directly also keeps their own dtypes, whereas iterating rows
    # can upcast integer ids when the frame has other non-integer columns.
    matrix[user_idx, item_idx] = 1
    return matrix

def get_user_item_matrix(self, max_users: int, features: List[str]) -> np.ndarray:
    """Concatenate the per-feature user/item matrices into one matrix.

    Args:
        max_users: Row count passed through to ``build_user_item_matrix``.
        features: Column names; one interaction matrix is built per name.

    Returns:
        The per-feature matrices stacked side by side (column-wise), in the
        order the names appear in ``features``.

    Raises:
        ValueError: If ``features`` is empty, since there is nothing to stack.
    """
    if not features:
        raise ValueError("features must contain at least one column name")
    return np.hstack(
        [self.build_user_item_matrix(max_users, item) for item in features]
    )

def _truncatedSVD(self, threshold: float = 0.90) -> np.ndarray:
    """Reduce ``self.matrix`` with Truncated SVD until ``threshold`` is explained.

    Starting from two components, one component is added per attempt until
    the summed explained-variance ratio reaches ``threshold`` or the number
    of components equals the number of columns of ``self.matrix``.

    Args:
        threshold: Target fraction of total variance to explain.

    Returns:
        The reduced matrix, which is also stored back into ``self.matrix``.
    """
    # The component count cannot usefully exceed the column count; capping it
    # here ends the search instead of growing until the estimator rejects the
    # request. NOTE(review): assumes the default (randomized) solver accepts
    # n_components == n_features -- confirm for the installed version.
    max_components = self.matrix.shape[1]
    n_components = min(2, max_components)  # minimum components to begin

    # Always fit at least once so `pc`, `reduced` and `ex_var` are defined
    # even when the threshold is zero or negative.
    while True:
        pc = TruncatedSVD(n_components=n_components)
        reduced = pc.fit_transform(self.matrix)
        ex_var = np.sum(pc.explained_variance_ratio_)
        if ex_var >= threshold or n_components >= max_components:
            break
        n_components += 1

    if ex_var < threshold:
        logging.warning(
            "Variance threshold %0.2f not reached with all %d components",
            threshold,
            max_components,
        )
    logging.info(
        f"Total components {pc.n_components} with {ex_var:0.2f} variance explained"
    )
    # Reuse the projection from the final fit rather than transforming again.
    self.matrix = reduced
    return self.matrix

def compute_similarity(self, metric: str = "cosine") -> np.ndarray:
    """Return the pairwise score matrix for the rows of ``self.matrix``.

    Args:
        metric: Metric name forwarded to ``pairwise_distances``.

    Returns:
        For ``"cosine"``, one minus the pairwise distances (i.e. cosine
        similarity). For any other metric, the pairwise distances unchanged.
    """
    distances = pairwise_distances(self.matrix, metric=metric)
    # Only the cosine distance is flipped into a similarity.
    return 1 - distances if metric == "cosine" else distances

### Rank Users

In [None]:
def custom_udf(X, n=None):
    """
    Custom Pandas function for using index/score to
    generate output results dataframe.

    Args:
        X: Series of scores; positions in the Series are reported as users.
        n: How many entries to return. When omitted, the ``top_n`` name
            from the enclosing scope is used.

    Returns:
        A list of stringified ``{"user": position, "score": value}`` dicts,
        ordered by descending score with scores rounded to 4 decimals. The
        single highest-scoring position is skipped (presumably the entry's
        own self-similarity -- confirm against the caller).
    """
    limit = top_n if n is None else n
    # Reverse the ascending argsort for descending order, then drop the first.
    idx = np.argsort(X.values, axis=0)[::-1][1 : limit + 1]
    # Round once up front instead of once per selected position.
    scores = X.astype(float).round(4).values
    # Cast to built-in int/float so the text is the same on every NumPy
    # version; NumPy 2 renders its scalars as e.g. "np.float64(0.5)".
    return [str({"user": int(i), "score": float(scores[i])}) for i in idx]

# Apply custom_udf across X (assumed to be a DataFrame -- confirm), then
# transpose the result so rows and columns swap places.
ranking = X.apply(custom_udf)
ranking = ranking.transpose()
# Relabel each column as its label plus one, rendered as a string.
ranking.columns = list(map(lambda i: f"{i+1}", ranking.columns))