# Product Recommendations using Semi-Supervised Learning 


In [None]:
def vector_similarities(v1, v2):
    """Count the fields on which ``v1`` and ``v2`` hold the same value.

    Only positions where ``v1`` is filled (value greater than -1) are
    examined, so a missing field (-1) never counts as an agreement.
    """
    matches = 0
    for position, value in enumerate(v1):
        if value > -1:
            # Adding the boolean directly contributes 1 on a match, else 0.
            matches += value == v2[position]
    return matches

def confidence(target, neighbor):
    """Return how many fields are filled in both ``target`` and ``neighbor``.

    A field counts as filled when its value is greater than -1.
    """
    shared = 0
    for position, neighbor_value in enumerate(neighbor):
        if neighbor_value > -1 and target[position] > -1:
            shared += 1
    return shared

def fill_field(index, target, data, k):
    """Estimate the missing field ``index`` of ``target`` from similar rows.

    Rows of ``data`` that have field ``index`` filled (value > -1) are ranked
    by ``vector_similarities`` against ``target``. The k most similar rows
    vote on the value, each vote weighted by ``confidence`` (the number of
    fields filled in both the row and ``target``).

    Args:
        index: Position of the field to estimate.
        target: Vector being completed; -1 marks a missing field.
        data: Iterable of vectors laid out like ``target``.
        k: Maximum number of neighbours to use; expected to be a positive
            integer. It is capped at the number of usable rows.

    Returns:
        A ``(score, neighbours)`` tuple: the confidence-weighted average of
        the neighbours' values, and the (capped) ``k``. When there is nothing
        to base an estimate on -- no row has the field filled, or no selected
        row shares a filled field with ``target`` -- ``(-1, 0)`` is returned,
        i.e. the field stays missing with zero neighbours.
    """
    # Only rows that actually carry a value for this field can vote.
    candidates = [row for row in data if row[index] > -1]
    if not candidates:
        # Previously this fell through to an IndexError on the cutoff lookup.
        return -1, 0

    k = min(k, len(candidates))
    similarities = [vector_similarities(target, row) for row in candidates]
    weights = [confidence(target, row) for row in candidates]

    # Similarity of the k-th best row; rows tied with it are kept as well,
    # so more than k rows may end up voting.
    similarity_cutoff = sorted(similarities, reverse=True)[k - 1]
    selected = [
        (row[index], weight)
        for row, similarity, weight in zip(candidates, similarities, weights)
        if similarity >= similarity_cutoff
    ]

    sum_weights = sum(weight for _, weight in selected)
    if sum_weights == 0:
        # No selected row overlaps with the target, so every vote has zero
        # weight; previously this raised ZeroDivisionError.
        return -1, 0

    sum_votes = sum(value * weight for value, weight in selected)
    return sum_votes / sum_weights, k

def fill_vector(query_vector, data, k):
    """Complete every missing field of ``query_vector`` using ``fill_field``.

    Fields already present (value > -1) are copied through unchanged and
    given a confidence of -1, which stands for "infinite confidence". Missing
    fields are estimated from ``data`` with at most ``k`` neighbours.

    Returns:
        A ``(scores, confidences)`` pair of lists, one entry per field.
    """
    per_field = []
    for position, known_value in enumerate(query_vector):
        if known_value > -1:
            # Observed value: keep it and flag it as fully trusted.
            per_field.append((known_value, -1))
        else:
            estimate, support = fill_field(position, query_vector, data, k)
            per_field.append((estimate, support))

    scores = [value for value, _ in per_field]
    confidences = [support for _, support in per_field]
    return scores, confidences


In [None]:
# Nearest-neighbour example.
# Each row is one item vector; a value of -1 marks a missing field.

data = [
    [1, 1, 1, 1, 1, 1],
    [-1, -1, -1, 0, 1, -1],
    [-1, -1, 1, 0, -1, -1],
    [1, 1, 0, -1, -1, -1],
    [-1, -1, 1, 0, 1, -1],
]

# The vector to complete: only fields 1 and 2 are known.
query_vector = [-1, 1, 1, -1, -1, -1]

In [None]:
# Complete the query vector using up to 10 neighbours, then show the
# per-field results.
scores, confidences = fill_vector(query_vector, data, 10)
for label, values in (("Scores:", scores), ("Confidences: ", confidences)):
    print(label, values)

# Extracting Features from Natural Language

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Short product descriptions used as the toy corpus.
sample_texts = [
    "An epic space fantasy with love and explosions.",
    "A Programming How-To with great anecdotes.",
    "A How-To manual for Python.",
    "Epic battles, a dragon, and a love story.",
    "Train your very own dragon!",
]

In [None]:
# Fit the vectorizer on the sample texts and transform them in one step.
X = vectorizer.fit_transform(sample_texts)

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on old releases.
# Wrapping in list() keeps the printed form identical to the old list output.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(X.toarray())

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the vectors down to 2-D. Here n_components is the number of output
# dimensions, not a number of clusters.
# t-SNE requires perplexity < n_samples; the library default of 30 is too
# large for a handful of texts, so cap it just below the number of rows.
embedding = TSNE(n_components=2, perplexity=min(30.0, X.shape[0] - 1.0))
embedded_data = embedding.fit_transform(X.toarray())  # Creates 2-d embeddings of the original data.
print("Data Shape", embedded_data.shape)

# We can now plot the data:
plt.title("t-SNE embeddings of our data")
plt.scatter(embedded_data[:, 0], embedded_data[:, 1])
plt.show()

# A note on Active Learning

The informational value of making a recommendation will generally depend on the objective of the recommendation and on the way that information is extrapolated to future recommendations. One approach is to compute, for every user/item pair whose value is currently estimated, how much the uncertainty in the estimates would decrease if that cell were filled in. In practice, this may be quite expensive to compute, so a heuristic is to take the product of the square of the uncertainty in a particular pair, the number of non-empty fields in the column vector, and the number of non-empty fields in the row vector.

However, trading off the expected informational gain against the value of making recommendations that are already good is another issue entirely. These issues are addressed more cleanly, and from a slightly different perspective, in Section 5, which discusses Reinforcement Learning.