In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, PCA
import lightgbm as lgb
import re
import nltk
from nltk import PunktSentenceTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import transformers
import gensim.downloader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
def concat_train_test(train, test):
    all_df = pd.merge(train, test, how="outer")
    all_df["data_type"] = ""
    for n in range(len(all_df)):
        all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"
    return all_df

In [4]:
all_df = concat_train_test(train, test)
all_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train
...,...,...,...,...,...,...,...,...,...
21084,21084,9001-10000,US,30,food,drinks,"<div class=""contents""><div><p>Its time to get ...",,test
21085,21085,1-1000,US,29,food,small batch,"<div class=""contents""><div><p>I have been roas...",,test
21086,21086,1001-2000,US,27,crafts,pottery,"<div class=""contents""><div><p> I have ...",,test
21087,21087,2001-3000,US,30,design,graphic design,"<div class=""contents""><div><h1 class=""page-anc...",,test


In [5]:
def cleanup_spaces(html):
    sbd = PunktSentenceTokenizer()
    html_no_spaces = " ".join(sbd.sentences_from_text(html, realign_boundaries=True))
    return re.sub(r'\s+', " ", html_no_spaces)

def cleanup_japanese(html):
    return re.sub('[ぁ-ん ァ-ン 一-龥]', '', html)

def number_of_sentence(html):
    sbd = PunktSentenceTokenizer()
    return len(sbd.sentences_from_text(html, realign_boundaries=True))

def add_number_of_sentence(df_origin):
    df = df_origin.copy()
    df["number_of_sentence"] = list(map(number_of_sentence, df["html_raw"]))
    return df

def number_of_char(html):
    return len(html)

In [6]:
def bs_get_text(html):
    res = str(BeautifulSoup(html).get_text())
    if not res:
        res = ""
    return res


def extract_each_paragraphs(html_text):
    paragraphs = []
    para_list = html_text.split("<p>")
    if not "<p>" in html_text[0:4]:
        para_list = para_list[1:]
    for para in para_list:
        if "</p>" in para:
            para_new = para.split("</p>")[0]
        else:
            para_new = ""
            
        #それでも残っているhtmlタグを取り除く(要らないかも)
        para_new = get_text(para_new)
        #para_new = remove_nobreak_space(para_new)
        
        paragraphs.append(para_new)
    try:
        paragraphs.remove("")
    except:
        pass
    try:
        paragraphs.remove(" ")
    except:
        pass
    
    return paragraphs


from transformers import BertTokenizer, BertForSequenceClassification
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def count_number_of_words(html_text):
    text = get_text(html_text)
    token = bert_tokenizer.encode_plus(text)
    ids = token['input_ids']
    return len(ids)-2
    
    
def add_html_raw(df_origin):
    df = df_origin.copy()
    df["html_raw"] = list(map(bs_get_text, df["html_compiled"]))
    return df


def add_number_of_word(df_origin):
    df = df_origin.copy()
    df["number_of_word"] = list(map(count_number_of_words, df["html_compiled"]))
    return df

In [7]:
def is_video_exist(n_video):
    is_video_exist = 1 if n_video != 0 else 0
    return is_video_exist

def improve_n_video(df_origin):
    df = df_origin.copy()
    df["video"] = list(map(is_video_exist, df["number_of_<video>"]))
    return df

In [8]:
def select_paragraphs(para_lis):
    para_len_lis = []
    for para in para_lis:
        n_word = count_number_of_words(para)
        para_len_lis.append(n_word)
    length = sum(para_len_lis)
    while length > 512:
        para_len_lis = para_len_lis[1:]
        length = sum(para_len_lis)
    n_paragraph = len(para_len_lis)
    selected_para = para_lis[-n_paragraph:]
    return selected_para

def choose_para(html):
    para_list = extract_each_paragraphs(html)
    selected_para_list = select_paragraphs(para_list)
    sentence = " ".join(selected_para_list)
    return sentence

def add_each_paragraphs(df_origin):
    df = df_origin.copy()
    df["selected_paragraphs"] = list(map(choose_para, df["html_compiled"]))
    return df

In [9]:
def columnA_pre_columnB(A_B):
    columnA, columnB = A_B
    if columnB == 0:
        return np.inf
    else:
        return columnA / columnB
    
def add_AperB(df_origin, columns):
    """
    columns : tuple
    columnA,B,C: str
    """
    df = df_origin.copy()
    columnA, columnB = columns
    columnC = f'{columnA.split("_")[-1]}_per_{columnB.split("_")[-1]}'
    df[columnC] = list(map(columnA_pre_columnB, zip(df[columnA], df[columnB])))
    return df

In [10]:
def compile_html_tag(html):
    html_tags = ["a", "div", "!", "span", "img", "button", "video", "figure",
                 "figcaption", "h1","h1 ", "h2", "h3", "h4", "h5", "h6", "polygon",
                 "iframe", "style", "svg", "use", "time", "source", "li",
                 "ul", "track", "embed", "input", "param",
                ]
    html = html.replace(u'\xa0', u' ')
    html = html.replace(u'\xc2', u' ')
    spaces = re.compile(r'\s+')
    html = spaces.sub(" ", html)
    url = re.compile(r'https?://\S+|www\.\S+')
    html = url.sub("<URL>", html)
    
    for tag in html_tags:
        compiler = re.compile(r"<{}.*?>".format(tag))
        html = compiler.sub(f"<{tag}>", html)
        
    return html

In [11]:
def extract_html_tag(html_text):
    tag_list = re.findall(r"<.*?>", html_text)
    return tag_list

def extract_and_join_tag(html):
    tag_list = extract_html_tag(html)
    tag_only_content = " ".join(tag_list)
    return tag_only_content

In [12]:
def make_tag_set(df):
    tags = set()
    for html_text in df.html_content:
        compiled_html_text = compile_html_tag(html_text)
        tags |= set(extract_html_tag(compiled_html_text))
    return tags

def count_n_tag(html_tag):
    html, tag = html_tag
    return html.count(tag)

In [13]:
ALL_TAGS = make_tag_set(all_df)
def get_text(html):
    html_tags = ALL_TAGS
    for tag in html_tags:
        html = re.sub(tag, "", html)
    html = cleanup_spaces(html)
    return html

def add_html_raw_text(df_origin):
    df = df_origin.copy()
    df["html_raw"] = list(map(get_text, df["html_compiled"]))
    return df

In [14]:
def add_number_of_tag(df_origin):
    df = df_origin.copy()
    tag_set = make_tag_set(df)
    for tag in tag_set:
        df[f"number_of_{tag}"] = list(map(count_n_tag, zip(df["html_compiled"], [tag]*len(df))))
    return df

In [15]:
def add_compiled_html(df_origin):
    df = df_origin.copy()
    df["html_compiled"] = list(map(compile_html_tag, df["html_content"]))
    return df

def add_tag_only(df_origin):
    df = df_origin.copy()
    df["tag_only"] = list(map(extract_and_join_tag, df["html_compiled"]))
    return df

In [16]:
def make_pair(candidates):
    pairs = []
    N = len(candidates)
    for n, first in enumerate(candidates):
        for k in range(n+1,N):
            second = candidates[k]
            pairs.append([first, second])
    return pairs

In [17]:
def nlp_preprocess(df_origin):
    df = df_origin.copy()
    funcs = [add_compiled_html,
            add_html_raw_text,
            #add_each_paragraphs,
            #add_tag_only,
            #add_number_of_tag,
            #add_number_of_word,
            #add_number_of_sentence,
            ]
    for func in tqdm(funcs):
        df = func(df)
    return df



all_new = nlp_preprocess(all_df)
#all_new.to_csv('data/all_nlp_preprocessed.csv', index=False)
all_new.head()

100%|██████████| 2/2 [00:17<00:00,  8.74s/it]


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type,html_compiled,html_raw
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train,<div><div><span>Mark Saggia</span> is an Itali...,Mark Saggia is an Italian writer who emigrated...
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train,"<div><div><h1>Hello, I am Augustinas. I am a g...","Hello, I am Augustinas. I am a graphic designe..."
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train,<div><div><p> As our society begins to wake up...,As our society begins to wake up from the han...
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train,<div><div><p>My name is Donald Osborne and I a...,My name is Donald Osborne and I am an entrepre...
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train,<div><div><div> <figure> <img> </figure> </div...,"We all love to play, don't we! No matter the ..."


In [18]:
def make_tfidf_svd(df, target_col, stop_words="english", max_gram=2, num_features_tfidf=2000, num_features_svd=128):
    vectorizer = TfidfVectorizer(stop_words="english",
                                 ngram_range=(1,max_gram),
                                max_features=num_features_tfidf,
                                )
    transformer = TruncatedSVD(n_components=num_features_svd)
    
    X = vectorizer.fit_transform(df[target_col])
    matrix = transformer.fit_transform(X)
    
    columns = [f"tfidf_svd_{target_col}_{dim}" for dim in range(num_features_svd)]
    feature_df = pd.DataFrame(matrix, columns=columns)
    ids = [n for n in range(len(df))]
    feature_df["id"] = ids
    return feature_df

In [19]:
#tfidf_svd_raw_64 = make_tfidf_svd(all_new, "html_raw", stop_words="english", max_gram=1, num_features_tfidf=1000, num_features_svd=64)
tfidf_svd_raw_64_2gram = make_tfidf_svd(all_new, "html_raw", stop_words="english", max_gram=2, num_features_tfidf=None, num_features_svd=128)

In [20]:
tfidf_svd_raw_64_2gram.to_csv("data/tfidf_svd_raw_128_2gram.csv", index=False)

In [None]:
def make_tfidf(df_origin, num_features):
    df = df_origin.copy()
    Ngram = 1 if num_features <= 1000 else 2
    vectorizer = TfidfVectorizer(stop_words="english",
                                 ngram_range=(1,Ngram),
                                max_features=num_features,
                                )
    X = vectorizer.fit_transform(df.html_raw)
    matrix = vectorizer.transform(df.html_raw)
    columns = [f"tfidf_{word}" for word in vectorizer.get_feature_names()]
    feature_df = pd.DataFrame.sparse.from_spmatrix(data=matrix, columns=columns)
    ids = [n for n in range(len(df))]
    feature_df["id"] = ids
    return feature_df

In [96]:
tfidf_1000 = make_tfidf(all_new, 1000)
tfidf_1000.to_csv('data/tfidf_1000.csv', index=False)

In [97]:
tfidf_1000.head()

Unnamed: 0,tfidf_00,tfidf_000,tfidf_10,tfidf_100,tfidf_11,tfidf_12,tfidf_13,tfidf_14,tfidf_15,tfidf_16,...,tfidf_york,tfidf_young,tfidf_youth,tfidf_youtube,tfidf_このコンテンツを表示するにはhtml5対応のブラウザが必要です,tfidf_再生,tfidf_動画を再生,tfidf_音ありでリプレイ,tfidf_音声ありで,id
0,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.097765,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1
2,0.239666,0.000000,0.144805,0.0,0.00000,0.0,0.0,0.000000,0.059710,0.0,...,0.0,0.000000,0.0,0.0,0.065161,0.065161,0.065161,0.065161,0.065161,2
3,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,3
4,0.566628,0.026780,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.154056,0.154056,0.154056,0.154056,0.154056,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21084,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,21084
21085,0.000000,0.000000,0.126209,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,21085
21086,0.000000,0.073088,0.000000,0.0,0.09915,0.0,0.0,0.106112,0.085617,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,21086
21087,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,21087


In [22]:
import unicodedata
#from nltk.stem.porter import PorterStemmer
import nltk
#from nltk.corpus import wordnet


def normalize(text):
    #text = cleanup_japanese(text)
    #text = cleanup_spaces(text)
    normalized_text = normalize_unicode(text)
    normalized_text = normalize_number(normalized_text)
    normalized_text = lower_text(normalized_text)
    return normalized_text


def lower_text(text):
    return text.lower()


def normalize_unicode(text, form='NFKC'):
    normalized_text = unicodedata.normalize(form, text)
    return normalized_text


def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)


def normalize_number(text):
    # 連続した数字を除去
    replaced_text = re.sub(r'\d+', '', text)
    return replaced_text

In [23]:
def extract_paragraphs_content(html):
    return " ".join(extract_each_paragraphs(html))

def add_paragraphs_content(df_origin):
    df = df_origin.copy()
    df["paragraph's_content"] = list(map(extract_paragraphs_content, df["html_compiled"]))
    return df

def add_normalized_html(df_origin):
    df = df_origin.copy()
    #df["normalized_html_raw"] = list(map(normalize, df["paragraph's_content"]))
    df["normalized_html_raw"] = list(map(normalize, df["html_raw"]))
    return df

In [24]:
all_df_new = all_df.copy()
funcs = [add_compiled_html,
         add_html_raw_text,
         add_paragraphs_content,
        add_normalized_html,]
for func in tqdm(funcs):
    all_df_new = func(all_df_new)

100%|██████████| 4/4 [00:34<00:00,  8.73s/it]


In [25]:
all_df_new.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,html_compiled,html_raw,paragraph's_content,normalized_html_raw
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,<div><div><span>Mark Saggia</span> is an Itali...,Mark Saggia is an Italian writer who emigrated...,"He is a Nerd, Star Wars, Marvel Comics and vid...",mark saggia is an italian writer who emigrated...
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,"<div><div><h1>Hello, I am Augustinas. I am a g...","Hello, I am Augustinas. I am a graphic designe...",I think that street fashion is not haute coutu...,"hello, i am augustinas. i am a graphic designe..."
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,<div><div><p> As our society begins to wake up...,As our society begins to wake up from the han...,As our society begins to wake up from the han...,as our society begins to wake up from the han...
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,<div><div><p>My name is Donald Osborne and I a...,My name is Donald Osborne and I am an entrepre...,My name is Donald Osborne and I am an entrepre...,my name is donald osborne and i am an entrepre...
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,<div><div><div> <figure> <img> </figure> </div...,"We all love to play, don't we! No matter the ...","We all love to play, don't we! No matter the a...","we all love to play, don't we! no matter the ..."


In [26]:
glove_short_vectors = gensim.downloader.load('glove-twitter-25')

In [27]:
def extract_most_freq_word(df_origin):
    df = df_origin.copy()
    
    num_features = 1
    vectorizer = CountVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    mfw_dict = {}
    for n in range(len(df)):
        try:
            vectorizer.fit_transform([df.normalized_html_raw.iloc[n]])
            feature = vectorizer.get_feature_names()[0]
        except:
            # no text
            feature = ""
        mfw_dict[f"{n}"] = feature
    mfw_df = pd.DataFrame({"id": mfw_dict.keys(), "mfw": mfw_dict.values()})
    
    
    num_features = 2
    vectorizer = CountVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    sfw_dict = {}
    for n in range(len(df)):
        try:
            vectorizer.fit_transform([df.normalized_html_raw.iloc[n]])
            feature = list(vectorizer.get_feature_names())
            feature.remove(mfw_df["mfw"].iloc[n])
            feature = feature[0]
        except:
            # no text
            feature = ""
        sfw_dict[f"{n}"] = feature     
    mfw_df["sfw"] =  sfw_dict.values()
    
    return mfw_df


def word_to_vector2(words):
    try:
        vector = glove_short_vectors[words[0]]
    except:
        try:
            vector = glove_short_vectors[words[1]]
        except:
            vector = np.zeros(25)
    return vector

def add_wordvec2(df_origin, columns):
    df = df_origin.copy()
    col1, col2 = columns
    df[[f"mfw_{n}" for n in range(25)]] = list(map(word_to_vector2, zip(df[col1], df[col2])))
    return df

In [255]:
mfw_df = extract_most_freq_word(all_df_new)
mfw_df.head(10)

Unnamed: 0,id,mfw,sfw
0,0,game,book
1,1,people,street
2,2,complete,aquaponics
3,3,products,business
4,4,bit,micro
5,5,photos,like
6,6,luxury,design
7,7,cuff,wrist
8,8,graphic,borge
9,9,residence,film


In [257]:
mfw_df = add_wordvec(mfw_df)
mfw_df.to_csv('data/mfw.csv', index=False)

In [36]:
def extract_most_informative_word(df_origin):
    df = df_origin.copy()
    
    num_features = 5000
    vectorizer = TfidfVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    X = vectorizer.fit_transform(df.normalized_html_raw)
    matrix = vectorizer.transform(df.normalized_html_raw)
    feature_name = vectorizer.get_feature_names()
    
    miws = []
    ids = df["id"]
    for n in tqdm(range(len(df))):
        miw_column = np.argmax(matrix[n])
        miw = feature_name[miw_column]
        miws.append(miw)
    miw_df = pd.DataFrame({"id": ids, "miw": miws})
    return miw_df

def word_to_vector(words):
    try:
        vector = glove_short_vectors[words]
    except:
        print(words)
        vector = np.zeros(25)
    return vector

def add_wordvec(df_origin):
    df = df_origin.copy()
    df[[f"miw_{n}" for n in range(25)]] = list(map(word_to_vector, df["miw"]))
    df = df.drop("miw", axis=1)
    return df

In [37]:
miw_df = extract_most_informative_word(all_df_new)
miw_df = add_wordvec(miw_df)
miw_df

100%|██████████| 21089/21089 [00:03<00:00, 6875.24it/s]


このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを表示するにはhtml対応のブラウザが必要です
このコンテンツを

Unnamed: 0,id,miw_0,miw_1,miw_2,miw_3,miw_4,miw_5,miw_6,miw_7,miw_8,...,miw_15,miw_16,miw_17,miw_18,miw_19,miw_20,miw_21,miw_22,miw_23,miw_24
0,0,1.14600,0.329100,0.268780,-1.394500,-0.300440,0.77901,1.35370,0.373930,0.504780,...,0.102870,-0.17618,-1.288100,-0.598010,0.261310,-1.261900,0.392020,0.593090,-0.552320,0.005087
1,1,-0.73473,0.120720,0.607640,-0.504740,-0.057365,0.52869,0.59001,-0.237530,1.140600,...,-0.034995,0.10252,0.021756,-0.308370,0.430040,0.002524,-0.504040,0.230500,-1.449700,0.589240
2,2,0.07785,0.065545,-0.349580,-0.378050,0.873780,0.24333,1.49570,-0.562600,0.070204,...,0.355000,0.16465,0.320060,-0.045544,-0.319110,-1.013800,-0.054359,0.200740,0.320610,-0.455840
3,3,0.70687,0.277370,0.291350,-0.326550,0.538730,-0.65520,1.35720,-0.252000,-0.316420,...,-0.149920,0.71925,-0.162080,0.623380,0.380480,-0.045843,0.478960,-0.523910,-0.798070,0.278350
4,4,0.30963,-0.206810,-0.041067,-0.149900,0.843800,0.64058,-0.54087,0.055304,1.194000,...,0.394530,0.79590,0.501160,0.056856,1.026600,-0.793950,-0.804790,1.111700,0.013755,-0.244400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10539,21084,-0.52575,-0.229000,0.317210,1.120200,-0.113580,0.43399,0.94626,-1.406300,0.277020,...,-0.432380,1.08140,-0.507060,0.429500,0.373860,-0.675640,0.441460,0.082967,1.074900,0.846170
10540,21085,-1.45320,0.255940,0.081679,0.490810,-0.059128,0.63656,1.45660,-1.060700,0.479030,...,-0.509060,1.24320,-0.541210,0.193750,0.357030,-0.988500,-0.693350,-0.339980,0.718270,1.070400
10541,21086,-0.49515,1.283500,-1.236000,0.708940,0.295160,-0.48593,0.55181,-1.055300,-0.291370,...,0.068498,0.51722,-0.204610,0.794370,-0.696780,-0.945340,-0.336830,-0.805960,0.542580,-0.736610
10542,21087,0.85912,-0.573080,1.341200,0.021117,0.516680,-0.37890,0.11455,-0.635790,-0.581480,...,1.098900,0.28815,-0.689680,0.233150,0.089295,-0.993910,-0.822220,-0.341300,-0.357680,-0.550620


In [39]:
miw_df.to_csv('data/miw.csv', index=False)

In [20]:
def make_minimal_tfidf(df_origin, num_features):
    df = df_origin.copy()
    Ngram = 1 if num_features <= 1000 else 2
    vectorizer = TfidfVectorizer(
                                 ngram_range=(1,Ngram),
                                max_features=num_features,
                                )
    ids = [n for n in range(len(df))]
    minimal_tfidf = pd.DataFrame({"id": ids})
    for category2 in df.category2.unique():
        mini_df = df[df["category2"] == category2]
        X = vectorizer.fit_transform(mini_df.html_raw)
        matrix = vectorizer.transform(df.html_raw)
        columns = [f"tfidf_{category2}_{n}" for n in range(num_features)]
        feature_df = pd.DataFrame.sparse.from_spmatrix(data=matrix, columns=columns)
        ids = mini_df["id"]
        feature_df["id"] = ids
        
        minimal_tfidf = pd.merge(minimal_tfidf, feature_df, on="id", how="outer")
    return feature_df