In [22]:
import pandas as pd
import numpy as np
import tqdm
from matplotlib import pyplot as plt
import nltk

In [104]:
# Read the training table and use each row's `text_id` as its index label.
data = pd.read_csv("data/train.csv", index_col="text_id")
data

Unnamed: 0_level_0,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...
FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


In [107]:
from collections import defaultdict

# TODO: add word corrections: triying -> trying
class TermFrequencyFeaturizer:
    """Describe each text by how common its words are in a reference corpus.

    Every alphabetic token of a text is looked up in a unigram-frequency
    table (columns ``word`` and ``count``). The ``log1p`` of those counts is
    histogrammed over fixed bin edges, and the histogram is normalised by the
    number of words, so each row holds the share of the text's words that
    fall in each frequency band. Words missing from the table count as
    frequency 0 and therefore land in the first bin.
    """

    # Upper bound used for the last histogram edge (in raw counts).
    # NOTE(review): an unused duplicate of this constant that used to live in
    # `_compute_term_frequencies_from_text` read 23135851162 (one digit off).
    # Confirm which value matches the largest `count` in the dataset. Values
    # are clipped into range in `_normalized_log_histogram`, so a slightly low
    # ceiling no longer drops the most frequent words from the histogram.
    MAX_TERM_FREQ = 23135751162

    # Default location of the unigram-frequency CSV, relative to the CWD.
    DEFAULT_DATASET_PATH = "aux_data/unigram_freq.csv"

    def __init__(self, dataset_path: str = DEFAULT_DATASET_PATH):
        """Load the frequency table and build the word -> count lookup.

        Args:
            dataset_path: CSV file with ``word`` and ``count`` columns.

        Raises:
            FileNotFoundError: if the CSV file is missing.
        """
        term_frequencies_dataset = self._load_term_frequency_dataset(dataset_path)
        # Kept as a defaultdict so external `term2freq[word]` lookups still
        # yield 0 for unknown words, as before.
        term2freq = defaultdict(int)
        term2freq.update(
            zip(term_frequencies_dataset["word"], term_frequencies_dataset["count"])
        )
        self.term2freq: dict[str, int] = term2freq

    def _load_term_frequency_dataset(self, path: str = DEFAULT_DATASET_PATH) -> pd.DataFrame:
        """Read the unigram-frequency CSV.

        Args:
            path: location of the CSV file.

        Returns:
            The table exactly as parsed by ``pd.read_csv``.

        Raises:
            FileNotFoundError: if ``path`` does not exist. This is still an
                ``Exception`` subclass, so broad handlers keep working.
        """
        try:
            return pd.read_csv(path)
        except FileNotFoundError as error:
            raise FileNotFoundError(
                "Guys, you need `unigram_freq.csv` dataset in aux_data/ folder "
                f"(looked for {path!r})"
            ) from error

    def featurize(self, texts: pd.Series, n_bins: int = 20) -> pd.DataFrame:
        """Compute one normalised log-frequency histogram per text.

        Args:
            texts: the texts to featurize; the index is reused for the result.
            n_bins: number of bin *edges* between 0 and
                ``log1p(MAX_TERM_FREQ)``. The result has ``n_bins - 1``
                columns.

        Returns:
            A DataFrame with one row per text and one ``bin_<lo>_<hi>`` column
            per histogram bin (edges rounded to one decimal in the name).
            A text without any alphabetic token yields an all-zero row.

        Raises:
            ValueError: if ``n_bins`` is smaller than 2 (no bin can be formed).
        """
        if n_bins < 2:
            raise ValueError("n_bins must be at least 2: it is the number of bin edges")

        bins = np.linspace(0, np.log1p(self.MAX_TERM_FREQ), n_bins)
        feature_names = [
            f"bin_{round(low, 1)}_{round(high, 1)}"
            for low, high in zip(bins[:-1], bins[1:])
        ]
        feature_values = [
            self._normalized_log_histogram(
                self._compute_term_frequencies_from_text(text), bins
            )
            for text in tqdm.tqdm(texts.values)
        ]
        # The explicit reshape keeps the (n_texts, n_features) shape even when
        # `texts` is empty, where np.array([]) alone would be 1-D.
        feature_matrix = np.array(feature_values, dtype=float).reshape(
            len(texts), len(feature_names)
        )
        return pd.DataFrame(feature_matrix, columns=feature_names, index=texts.index)

    @staticmethod
    def _normalized_log_histogram(word_frequencies: list[int], bins: np.ndarray) -> np.ndarray:
        """Histogram ``log1p(word_frequencies)`` over ``bins`` as word shares.

        Args:
            word_frequencies: raw corpus counts, one per word of a text.
            bins: increasing bin edges in log1p space.

        Returns:
            Array of length ``len(bins) - 1`` holding the share of words in
            each bin; all zeros when ``word_frequencies`` is empty.
        """
        n_words = len(word_frequencies)
        if n_words == 0:
            # Avoid 0/0 -> NaN for texts with no alphabetic tokens.
            return np.zeros(len(bins) - 1)
        # Clip so counts beyond the outer edges are counted in the outer bins
        # instead of being silently discarded by np.histogram.
        log_frequencies = np.clip(np.log1p(word_frequencies), bins[0], bins[-1])
        counts, _ = np.histogram(log_frequencies, bins=bins)
        return counts / n_words

    def _compute_term_frequencies_from_text(self, text: str) -> list[int]:
        """Return the corpus count of every alphabetic token in ``text``.

        Tokens are produced by ``nltk.tokenize.word_tokenize``. Only tokens
        for which ``str.isalpha()`` holds are kept, and they are lower-cased
        before lookup. Unknown words map to 0.
        """
        tokens = nltk.tokenize.word_tokenize(text)
        # `.get` avoids inserting every unknown word into the defaultdict,
        # which would grow the lookup table with each featurized text.
        return [
            self.term2freq.get(token.lower(), 0)
            for token in tokens
            if token.isalpha()
        ]

In [108]:
# Try the featurizer on the first ten texts only.
featurizer = TermFrequencyFeaturizer()
featurizer.featurize(data["full_text"].iloc[:10])

100%|████████████████████████████████████████| 10/10 [00:00<00:00, 10320.63it/s]


Unnamed: 0_level_0,bin_0.0_1.3,bin_1.3_2.5,bin_2.5_3.8,bin_3.8_5.0,bin_5.0_6.3,bin_6.3_7.5,bin_7.5_8.8,bin_8.8_10.0,bin_10.0_11.3,bin_11.3_12.6,bin_12.6_13.8,bin_13.8_15.1,bin_15.1_16.3,bin_16.3_17.6,bin_17.6_18.8,bin_18.8_20.1,bin_20.1_21.4,bin_21.4_22.6,bin_22.6_23.9
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0016926B079C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003817,0.019084,0.038168,0.09542,0.122137,0.209924,0.198473,0.160305,0.152672
0022683E9EA5,0.001876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001876,0.003752,0.011257,0.013133,0.031895,0.118199,0.230769,0.183865,0.225141,0.178236
00299B378633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028125,0.025,0.175,0.234375,0.265625,0.140625,0.13125
003885A45F42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004121,0.017857,0.026099,0.049451,0.179945,0.190934,0.175824,0.211538,0.144231
0049B1DF5CCC,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.004274,0.0,0.004274,0.004274,0.008547,0.034188,0.025641,0.17094,0.188034,0.17094,0.196581,0.153846
004AC288D833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003058,0.003058,0.04893,0.119266,0.324159,0.253823,0.103976,0.143731
005661280443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046404,0.027842,0.048724,0.187935,0.12297,0.162413,0.262181,0.141531
008DDDDD8E8D,0.031008,0.0,0.0,0.0,0.0,0.0,0.0,0.005168,0.007752,0.01292,0.005168,0.005168,0.007752,0.023256,0.108527,0.232558,0.175711,0.113695,0.271318
009BCCC61C2A,0.012216,0.0,0.0,0.0,0.0,0.0,0.0,0.001745,0.001745,0.0,0.008726,0.012216,0.057592,0.078534,0.17103,0.144852,0.162304,0.219895,0.129145
009F4E9310CB,0.002532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002532,0.002532,0.010127,0.025316,0.078481,0.192405,0.182278,0.217722,0.124051,0.162025


In [109]:
import catboost

ModuleNotFoundError: No module named 'catboost'