In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files (train first, then test) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlitreadabilityprize/train.csv',
        '/kaggle/input/commonlitreadabilityprize/test.csv',
    )
)
# Display the (rows, columns) of the training frame as the cell output.
train_df.shape

In [None]:
# ngram model
import math
from collections import Counter
from nltk.util import ngrams  # This is the ngram magic.

# Cosine similarity between two collections of n-grams (compared by occurrence counts)
def cosine_similarity_ngrams(a, b):
    """Return the cosine similarity of two collections of hashable items.

    Each collection is turned into an occurrence-count vector (one dimension
    per distinct item); the result is the dot product of the two vectors
    divided by the product of their Euclidean norms.

    Args:
        a: Collection of hashable items (e.g. n-gram tuples).
        b: Collection of hashable items.

    Returns:
        A float similarity; 0.0 when either input is falsy (None or empty)
        or when the norm product is zero.
    """
    if not (a and b):
        return 0.0

    counts_a, counts_b = Counter(a), Counter(b)

    # Only items present on both sides contribute to the dot product.
    shared_items = counts_a.keys() & counts_b.keys()
    dot_product = sum(counts_a[item] * counts_b[item] for item in shared_items)

    norm_a = math.sqrt(sum(count ** 2 for count in counts_a.values()))
    norm_b = math.sqrt(sum(count ** 2 for count in counts_b.values()))
    magnitude = norm_a * norm_b

    return float(dot_product) / magnitude if magnitude else 0.0

# Character-level n-grams of the lowercased text
def get_character_level_tuples_nosentences(text, ngram=6):
    """Return the character-level n-grams of ``text`` as a list.

    The text is lowercased, split into individual characters, and passed to
    ``ngrams`` (imported from ``nltk.util`` elsewhere in this notebook).

    Args:
        text: Input string; a falsy value (None or "") yields an empty list.
        ngram: Size of each n-gram, 6 by default.

    Returns:
        A list holding whatever ``ngrams`` yields for the character sequence.
    """
    if text:
        characters = list(text.lower())
        return list(ngrams(characters, ngram))
    return []

# Similarity score of two texts for a single n-gram size
def test_similarity(text_a, text_b, ngram=6):
    gram_a = get_character_level_tuples_nosentences(text_a, ngram)
    gram_b = get_character_level_tuples_nosentences(text_b, ngram)
    return cosine_similarity_ngrams(gram_a, gram_b)

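# Combine the similarity scores of every n-gram size from high_ngram down to low_ngram;
# the running score is multiplied by 10 at each step, so larger n-gram sizes weigh more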
def test_accum_similarity(text_a, text_b, low_ngram=4, high_ngram=10):
    score = 0
    for n in range(high_ngram, low_ngram-1, -1):  # from 10 to 4
        s = test_similarity(text_a, text_b, n)
        score = score * 10 + s
    return score

In [None]:
# Position of the candidate text with the highest accumulated similarity score
def get_score_index(raw_text, eval_text):
    """Return the position of the text in ``raw_text`` most similar to ``eval_text``.

    Every candidate is scored against ``eval_text`` with
    ``test_accum_similarity``; the position (0-based, in iteration order) of
    the highest score is returned. When several candidates share the highest
    score, the earliest one wins.

    Args:
        raw_text: Iterable of candidate texts.
        eval_text: Text to compare every candidate against.

    Returns:
        The 0-based position of the best-scoring candidate. This is a
        position in iteration order, not an index label.

    Raises:
        IndexError: If ``raw_text`` yields no candidates.
    """
    score_list = [test_accum_similarity(text, eval_text) for text in raw_text]
    if not score_list:
        # Same exception type the old sorted(...)[0] lookup produced, but with a clear cause.
        raise IndexError("get_score_index: raw_text is empty, no candidate to select")
    # max() returns the first maximal item, so ties resolve to the earliest
    # position without sorting the whole list.
    return max(range(len(score_list)), key=score_list.__getitem__)

In [None]:
from tqdm import tqdm
# For every test excerpt, find the most similar training excerpt and reuse its target.
# NOTE: get_score_index re-scores every training excerpt for each test excerpt,
# so the cost grows with len(test_df) * len(train_df).
test_score = [0] * test_df.shape[0]
# enumerate() has no length, so pass the total explicitly to get a real progress bar.
for i, eval_text in tqdm(enumerate(test_df['excerpt']), total=test_df.shape[0]):
    ind = get_score_index(train_df['excerpt'], eval_text)
    # `ind` is a position, not an index label: .iloc stays correct even if
    # train_df no longer has the default 0..n-1 index.
    test_score[i] = train_df['target'].iloc[ind]
test_score

In [None]:
# Attach the predicted targets to the test frame, then write only the
# id/target columns (without the index) to submission.csv.
test_df['target'] = test_score
test_df.to_csv('submission.csv', columns=['id', 'target'], index=False)
test_df