In [1]:
import numpy as np
import pandas as pd
import math

#### Prepare labeled data (SKIP if already done)

In [1]:
from labeling import Labeler

# NOTE(review): 12 is presumably the label length -- the example labels used in
# this notebook are 12 characters long and the labeled CSVs carry a "-12"
# suffix. Confirm against labeling.Labeler before changing it.
labeler = Labeler(12)

In [6]:
# Label every training check-in from its x/y columns and save the labeled frame,
# so this row-by-row pass can be skipped on later runs.
# place_id is read as str (presumably to keep the ids from being parsed as numbers).
train_df = pd.read_csv(
    'data/train.csv',
    index_col='row_id',
    dtype={"place_id": str},
)
train_row_labels = train_df.apply(
    lambda row: labeler.get_label(row["x"], row["y"]),
    axis=1,
)
train_df["label"] = train_row_labels
train_df.to_csv("data/train-labeled-12.csv", index=True)

In [7]:
# Label every test check-in from its x/y columns and save the labeled frame,
# so this row-by-row pass can be skipped on later runs.
test_df = pd.read_csv(
    'data/test.csv',
    index_col='row_id',
)
test_row_labels = test_df.apply(
    lambda row: labeler.get_label(row["x"], row["y"]),
    axis=1,
)
test_df["label"] = test_row_labels
test_df.to_csv("data/test-labeled-12.csv", index=True)

#### Load the training data

In [2]:
# Reload the labeled training data written by the preparation step.
# place_id is kept as str, matching how it was read when the file was produced.
train_df = pd.read_csv(
    'data/train-labeled-12.csv',
    index_col='row_id',
    dtype={"place_id": str},
)

Introduce some helper functions.

In [3]:
def extract_labels(label, min_label_size, max_label_size):
    """Return the prefixes of `label` for every length in the given range.

    Args:
        label: the full label string.
        min_label_size: shortest prefix length to produce.
        max_label_size: longest prefix length to produce (inclusive).

    Returns:
        A list with one prefix per length, shortest first. The list is empty
        when min_label_size > max_label_size. Lengths beyond len(label) yield
        the whole label again, because slicing does not pad.
    """
    prefixes = []
    for size in range(min_label_size, max_label_size + 1):
        prefixes.append(label[:size])
    return prefixes

# Prefix lengths used throughout the notebook: they are read as globals when the
# `places` mapping is built and inside get_top_3. With min == max == 9, exactly
# one 9-character prefix is extracted per label.
min_label_size = 9
max_label_size = 9

# Example: a 12-character label reduced to its 9-character prefix
extract_labels('dddbdbaaacab', min_label_size, max_label_size)

['dddbdbaaa']

In [4]:
def get_counts(items):
    """Count how many times each distinct item occurs in `items`.

    Args:
        items: an iterable of hashable values.

    Returns:
        A plain dict mapping each distinct item to its number of occurrences.
        An empty iterable gives an empty dict.
    """
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    return tally

# Example: 'a' and 'b' occur twice, every other item once
get_counts(['a', 'b', 'a', 'c', 'd', 'b', 'e'])

{'a': 2, 'b': 2, 'c': 1, 'd': 1, 'e': 1}

#### Compute the TF-IDF score

Build the mapping from each place to all the labels associated with it, based on the known check-ins at that place.

In [5]:
# places[place_id] -> list of every label prefix seen in that place's check-ins
# (one entry per check-in and prefix length, so duplicates are kept for counting).
places = {}
# Use a plain loop rather than DataFrame.apply(axis=1): apply is meant for
# computing values, not for side effects. It would build a throwaway Series of
# None, and older pandas releases could call the callback twice on the first
# row, which would double-count that row's labels.
for place_id, label in zip(train_df["place_id"], train_df["label"]):
    places.setdefault(place_id, []).extend(
        extract_labels(label, min_label_size, max_label_size)
    )

# Filled in by the following cells: tf[label][place] and idf[label].
tf = {}
idf = {}

Build the TF dict, such that TF[label][place] is the fraction of the labels collected for the place that are equal to the label.

In [6]:
# For each place, convert its raw label counts into relative frequencies and
# store them label-first: tf[label][place] = count of label / all labels of place.
for place, place_labels in places.items():
    label_counts = get_counts(place_labels)
    n_labels = sum(label_counts.values())
    for label, n_occurrences in label_counts.items():
        if label not in tf:
            tf[label] = {}
        tf[label][place] = n_occurrences / n_labels

Build the IDF dict, such that IDF[label] is the natural log of the ratio of the total number of places (the "documents") to the number of places whose check-ins contain the label.

In [7]:
# idf[label] = ln(total number of places / number of places that have the label).
# Every label in tf has at least one place, so the denominator is never zero.
n_places = len(places)
for label, places_with_label in tf.items():
    idf[label] = math.log(n_places / len(places_with_label))

Computing the TF-IDF score

In [8]:
def tf_idf(labels, tf, idf):
    """Rank places by their summed TF-IDF score over the given labels.

    Args:
        labels: iterable of label strings to look up.
        tf: nested dict, tf[label][place] -> term frequency.
        idf: dict, idf[label] -> inverse document frequency. It must contain
            every label that has at least one place in `tf`.

    Returns:
        A list of (place, score) tuples sorted by descending score. Labels that
        are missing from `tf` contribute nothing, so the list may be empty.
    """
    totals = {}
    for label in labels:
        for place, frequency in tf.get(label, {}).items():
            contribution = frequency * idf[label]
            if place in totals:
                totals[place] += contribution
            else:
                totals[place] = contribution
    # The sort is stable, so places with equal scores keep first-seen order.
    return sorted(totals.items(), key=lambda entry: entry[1], reverse=True)

# Example from the data: the three best-scoring places for two labels.
# NOTE(review): with min_label_size == max_label_size == 9 the keys of `tf` are
# 9-character prefixes, so the 12-character second label presumably matches
# nothing here -- confirm whether it was meant to be truncated first.
tf_idf(['dddbacabc', 'dddbdacdabca'], tf, idf)[:3]

[('9670606219', 1.281309073019124),
 ('5029991103', 0.08542060486794159),
 ('1879541472', 0.06593871252963912)]

#### Make predictions for the test data

In [9]:
# Load the labeled test check-ins written by the preparation step.
test_df = pd.read_csv(
    'data/test-labeled-12.csv',
    index_col='row_id',
)

Compute the top 3 predictions per check-in:

In [10]:
def get_top_3(label):
    """Return the three best-scoring place ids for a check-in label.

    Reads the notebook-level globals min_label_size, max_label_size, tf and idf.

    Args:
        label: the full label string of one check-in.

    Returns:
        The place ids joined by single spaces, best first. Fewer than three ids
        (possibly an empty string) are returned when fewer places match.
    """
    candidate_labels = extract_labels(label, min_label_size, max_label_size)
    ranked = tf_idf(candidate_labels, tf, idf)
    return ' '.join(place for place, _score in ranked[:3])
    
# Example: the result is one space-separated string of up to three place ids
get_top_3('bddccadadcaa')

'1579952623 3329484070 4624089426'

In [11]:
# Only the "label" column is needed, so map over that column directly rather
# than materialising every row with DataFrame.apply(axis=1). The result is the
# same Series of space-separated place ids, aligned on the row_id index.
test_df["place_id"] = test_df["label"].map(get_top_3)

Make submission

In [12]:
# to_csv opens its target in write mode by default, so an existing
# submission.csv is replaced. No separate `!rm` is needed; it was also
# non-portable and errored on a fresh run when the file did not exist yet.
test_df[["place_id"]].to_csv("submission.csv", index=True)