# TF-IDF Vectorization and Cosine Similarity

In [None]:
import numpy as np
import pandas as pd

## Load Data Files

In [None]:
# Training tables: per-feed metadata and the user action log
feed_info, user_action = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../wechat_algo_data1/feed_info.csv',
        '../wechat_algo_data1/user_action.csv',
    )
)

# Rows to be scored
test_data = pd.read_csv('../wechat_algo_data1/test_a.csv')

## Data Preprocessing

In [None]:
# Replace missing values with the empty string in every text / keyword / tag
# column, so these columns contain no NaN afterwards.
text_columns = (
    'ocr',
    'asr',
    'description',
    'description_char',
    'manual_keyword_list',
    'machine_keyword_list',
    'manual_tag_list',
    'machine_tag_list',
)
for text_column in text_columns:
    feed_info[text_column] = feed_info[text_column].fillna('')

## Define Similarity Function

In [None]:
import pickle
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def calc_similarity(record, attribute: str, feed_info = feed_info, user_action = user_action):
    """Score how similar a target video's text is to a user's history.

    The history is every row of ``user_action`` for ``record['userid']``,
    joined to ``feed_info`` on ``feedid``. A TF-IDF model is fitted on the
    ``attribute`` text of those videos, the target video
    (``record['feedid']``) is projected into the same TF-IDF space, and the
    cosine similarity between the target and each history entry is averaged.

    Args:
        record: Row-like object providing ``'userid'`` and ``'feedid'``.
        attribute: Name of the text column of ``feed_info`` to compare.
        feed_info: DataFrame with a ``'feedid'`` column and ``attribute``.
        user_action: DataFrame with ``'userid'`` and ``'feedid'`` columns.

    Returns:
        The mean of the cosine similarities after rescaling each one with
        ``(sim + 1) / 2``. Returns ``0.5`` when the history texts contain no
        tokens (every similarity would be 0), and ``nan`` when the user has
        no history or the target feed is not present in ``feed_info``.
    """
    # Texts of the videos found in this user's action log. Only the two
    # columns that are actually needed are merged in.
    actions = user_action[user_action['userid'] == record['userid']]
    video_set = actions.merge(feed_info[['feedid', attribute]], on = ['feedid'])
    descriptions = video_set[attribute]

    # Text of the target video (first match if the feedid is repeated)
    target_des = feed_info.loc[feed_info['feedid'] == record['feedid'], attribute].iloc[:1]

    # Nothing to compare: report "unknown" instead of raising mid-apply
    if descriptions.empty or target_des.empty:
        return float('nan')

    vectorizer = TfidfVectorizer()

    # fit_transform raises ValueError on an empty vocabulary, so detect a
    # token-free history up front. Zero similarity rescales to 0.5.
    analyzer = vectorizer.build_analyzer()
    if not any(analyzer(text) for text in descriptions):
        return 0.5

    # Create TF-IDF vectors of the history
    bag_of_words = vectorizer.fit_transform(descriptions)

    # Project the target with the SAME fitted vectorizer so that both sides
    # share one vocabulary and one set of IDF weights
    target_vec = vectorizer.transform(target_des)

    # Compute pairwise similarity: one value per history entry
    similarities = cosine_similarity(bag_of_words, target_vec)

    # Normalize the values and take the average
    return float(((similarities + 1) / 2).mean())

## Make Predictions

In [None]:
# feed_info text column to compare; it is passed to calc_similarity and also
# used to name the output column and the output file.
target_attribute = 'asr'

In [None]:
# Apply calc_similarity to each row of test_data (axis = 1 passes one row at a
# time as `record`); %time is the IPython magic that reports how long it took.
%time description_prob_df = test_data.apply(calc_similarity, attribute = target_attribute, axis = 1)

In [None]:
# Wrap the per-row scores in a one-column frame named '<attribute>_sim'
sim_column = target_attribute + '_sim'
output_data = pd.DataFrame({sim_column: description_prob_df})

## Save Results

In [None]:
# Write the scores to a CSV named after the attribute, without the index column
output_path = '../predictions/2.2_%s_similarity_01.csv' % target_attribute
output_data.to_csv(output_path, index = False)