In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder
#import category_encoders as ce

import re
import nltk
import unicodedata
from nltk import PunktSentenceTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import transformers
import gensim.downloader

import xfeat
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import lightgbm as lgb
import gensim.downloader
from gensim.models import KeyedVectors

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
def concat_train_test(train, test):
    all_df = pd.merge(train, test, how="outer")
    all_df["data_type"] = ""
    for n in range(len(all_df)):
        all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"
    return all_df

In [4]:
all_df = concat_train_test(train, test)
all_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_df["data_type"][n] = "test" if np.isnan(all_df["state"][n]) else "train"


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train
...,...,...,...,...,...,...,...,...,...
21084,21084,9001-10000,US,30,food,drinks,"<div class=""contents""><div><p>Its time to get ...",,test
21085,21085,1-1000,US,29,food,small batch,"<div class=""contents""><div><p>I have been roas...",,test
21086,21086,1001-2000,US,27,crafts,pottery,"<div class=""contents""><div><p> I have ...",,test
21087,21087,2001-3000,US,30,design,graphic design,"<div class=""contents""><div><h1 class=""page-anc...",,test


In [5]:
def cleanup_spaces(html):
    sbd = PunktSentenceTokenizer()
    html_no_spaces = " ".join(sbd.sentences_from_text(html, realign_boundaries=True))
    return re.sub(r'\s+', " ", html_no_spaces)


def cleanup_japanese(html):
    return re.sub('[ぁ-ん ァ-ン 一-龥]', '', html)


def number_of_sentences(html):
    sbd = PunktSentenceTokenizer()
    return len(sbd.sentences_from_text(html, realign_boundaries=True))

In [6]:
def extract_each_paragraphs(html_text):
    paragraphs = []
    para_list = html_text.split("<p>")
    if not "<p>" in html_text[0:4]:
        para_list = para_list[1:]
    for para in para_list:
        if "</p>" in para:
            para_new = para.split("</p>")[0]
        else:
            para_new = ""
            
        #それでも残っているhtmlタグを取り除く(要らないかも)
        para_new = get_text(para_new)
        #para_new = remove_nobreak_space(para_new)
        
        paragraphs.append(para_new)
    try:
        paragraphs.remove("")
    except:
        pass
    try:
        paragraphs.remove(" ")
    except:
        pass
    
    return paragraphs

def number_of_chars(html):
    return len(" ".join(html.split()))

def number_of_words(html):
    return len(html.split())

def number_of_excmark(html):
    return html.count("!")

def number_of_questmark(html):
    return html.count("?")

def number_of_punctuation(html):
    return sum(html.count(w) for w in ".,;:")

def number_of_symbols(html):
    return sum(html.count(w) for w in "*$%&")

def number_of_unique_words(html):
    return len(set(w for w in html.split()))

In [7]:
def is_video_exist(n_video):
    is_video_exist = 1 if n_video != 0 else 0
    return is_video_exist

In [8]:
def compile_html_tag(html):
    html_tags = ["a", "div", "!", "span", "img", "button", "video", "figure",
                 "figcaption", "h1","h1 ", "h2", "h3", "h4", "h5", "h6", "polygon",
                 "iframe", "style", "svg", "use", "time", "source", "li",
                 "ul", "track", "embed", "input", "param",
                ]
    html = html.replace(u'\xa0', u' ')
    html = html.replace(u'\xc2', u' ')
    spaces = re.compile(r'\s+')
    html = spaces.sub(" ", html)
    url = re.compile(r'https?://\S+|www\.\S+')
    html = url.sub("<URL>", html)
    
    for tag in html_tags:
        compiler = re.compile(r"<{}.*?>".format(tag))
        html = compiler.sub(f"<{tag}>", html)
        
    return html

In [9]:
def extract_html_tag(html_text):
    tag_list = re.findall(r"<.*?>", html_text)
    return tag_list

In [10]:
def make_tag_set(df):
    tags = set()
    for html_text in df.html_content:
        compiled_html_text = compile_html_tag(html_text)
        tags |= set(extract_html_tag(compiled_html_text))
    return tags

def count_n_tag(html_tag):
    html, tag = html_tag
    return html.count(tag)

def add_number_of_tag(df_origin):
    df = df_origin.copy()
    tag_set = make_tag_set(df)
    for tag in tag_set:
        df[f"number_of_{tag}"] = list(map(count_n_tag, zip(df["html_compiled"], [tag]*len(df))))
    return df

In [11]:
ALL_TAGS = make_tag_set(all_df)
def get_text(html):
    html_tags = ALL_TAGS
    for tag in html_tags:
        html = re.sub(tag, "", html)
    html = cleanup_spaces(html)
    return html

In [12]:
def col_is_nonzero(col):
    return 0 if col == 0 else 1

In [None]:
def normalize(text):
    #text = cleanup_japanese(text)
    #text = cleanup_spaces(text)
    normalized_text = normalize_unicode(text)
    normalized_text = normalize_number(normalized_text)
    normalized_text = lower_text(normalized_text)
    return normalized_text


def lower_text(text):
    return text.lower()


def normalize_unicode(text, form='NFKC'):
    normalized_text = unicodedata.normalize(form, text)
    return normalized_text


def normalize_number(text):
    # 連続した数字を除去
    replaced_text = re.sub(r'\d+', '', text)
    return replaced_text

In [13]:
def apply_nlp_preprofuncs(df_origin):
    df = df_origin.copy()
    preprofuncs = [
        (compile_html_tag, "html_compiled", "html_content"),
        (get_text, "html_raw", "html_compiled"),
        (number_of_chars, "number_of_chars", "html_raw"),
        (number_of_words, "number_of_words", "html_raw"),
        (number_of_sentences, "number_of_sentences", "html_raw"),
        (number_of_excmark, "number_of_excmark", "html_raw"),
        (number_of_questmark, "number_of_questmark", "html_raw"),
        (number_of_punctuation, "number_of_punctuation", "html_raw"),
        (number_of_symbols, "number_of_symbols", "html_raw"),
        (number_of_unique_words, "number_of_unique_words", "html_raw"),
    ]
    for func, col_name, target in tqdm(preprofuncs):
        df[col_name] = list(map(func, df[target]))
    df = add_number_of_tag(df)
    
    df = df.drop(["html_content", "html_compiled", "html_raw"], axis=1)
    return df

In [14]:
def apply_nlp_preprofuncs_for_bert(df_origin):
    df = df_origin.copy()
    preprofuncs = [
        (compile_html_tag, "html_compiled", "html_content"),
        (get_text, "html_raw", "html_compiled"),
        (normalize, "normalized_html_raw", "html_raw")
    ]
    for func, col_name, target in tqdm(preprofuncs):
        df[col_name] = list(map(func, df[target]))
    
    return df

100%|██████████| 2/2 [00:19<00:00,  9.74s/it]


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,data_type,html_compiled,html_raw
0,0,4001-5000,CH,29,publishing,young adult,"<div class=""contents""><div><span class=""bold"">...",0.0,train,<div><div><span>Mark Saggia</span> is an Itali...,Mark Saggia is an Italian writer who emigrated...
1,1,3001-4000,NL,34,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0.0,train,"<div><div><h1>Hello, I am Augustinas. I am a g...","Hello, I am Augustinas. I am a graphic designe..."
2,2,19001-20000,US,30,food,spaces,"<div class=""contents""><div><p> As our society ...",0.0,train,<div><div><p> As our society begins to wake up...,As our society begins to wake up from the han...
3,3,2001-3000,US,41,technology,3d printing,"<div class=""contents""><div><p>My name is Donal...",0.0,train,<div><div><p>My name is Donald Osborne and I a...,My name is Donald Osborne and I am an entrepre...
4,4,2001-3000,GB,29,technology,diy electronics,"<div class=""contents""><div><div class=""templat...",1.0,train,<div><div><div> <figure> <img> </figure> </div...,"We all love to play, don't we! No matter the ..."


In [45]:
nlp_preprocessed_df = apply_nlp_preprofuncs(all_df)
text_type_df = apply_nlp_preprofuncs_for_bert(all_df)
text_type_df.head()

100%|██████████| 10/10 [00:26<00:00,  2.66s/it]


Unnamed: 0,id,goal,country,duration,category1,category2,state,data_type,number_of_chars,number_of_words,...,number_of_</figure>,number_of_<polygon>,number_of_<button>,number_of_<ul>,number_of_<track>,number_of_<!>,number_of_</span>,number_of_</i>,number_of_</iframe>,number_of_<svg>
0,0,4001-5000,CH,29,publishing,young adult,0.0,train,5289,961,...,6,0,0,0,0,0,18,0,0,0
1,1,3001-4000,NL,34,fashion,ready-to-wear,0.0,train,1144,202,...,15,0,0,0,0,0,0,0,0,0
2,2,19001-20000,US,30,food,spaces,0.0,train,3316,549,...,5,5,4,0,0,6,5,0,0,3
3,3,2001-3000,US,41,technology,3d printing,0.0,train,1670,293,...,0,0,0,0,0,0,0,0,0,0
4,4,2001-3000,GB,29,technology,diy electronics,1.0,train,7560,1211,...,67,45,36,1,0,54,66,0,0,27


In [15]:
nlp_preprocessed_df.to_csv('data/nlp_prepared_df.csv', index=False)
text_type_df.to_csv('data/nlp_prepared_bert.csv', index=False)

In [None]:
def make_tfidf_svd(df, target_col, stop_words="english", max_gram=2, num_features_tfidf=2000, num_features_svd=128):
    vectorizer = TfidfVectorizer(stop_words="english",
                                 ngram_range=(1,max_gram),
                                max_features=num_features_tfidf,
                                )
    transformer = TruncatedSVD(n_components=num_features_svd)
    
    X = vectorizer.fit_transform(df[target_col])
    matrix = transformer.fit_transform(X)
    
    columns = [f"tfidf_svd_{target_col}_{dim}" for dim in range(num_features_svd)]
    feature_df = pd.DataFrame(matrix, columns=columns)
    ids = [n for n in range(len(df))]
    feature_df["id"] = ids
    return feature_df

In [None]:
def extract_most_freq_word(df_origin):
    df = df_origin.copy()
    
    num_features = 1
    vectorizer = CountVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    mfw_dict = {}
    for n in range(len(df)):
        try:
            vectorizer.fit_transform([df.normalized_html_raw.iloc[n]])
            feature = vectorizer.get_feature_names()[0]
        except:
            # no text
            feature = ""
        mfw_dict[f"{n}"] = feature
    mfw_df = pd.DataFrame({"id": mfw_dict.keys(), "mfw": mfw_dict.values()})
    
    
    num_features = 2
    vectorizer = CountVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    sfw_dict = {}
    for n in range(len(df)):
        try:
            vectorizer.fit_transform([df.normalized_html_raw.iloc[n]])
            feature = list(vectorizer.get_feature_names())
            feature.remove(mfw_df["mfw"].iloc[n])
            feature = feature[0]
        except:
            # no text
            feature = ""
        sfw_dict[f"{n}"] = feature     
    mfw_df["sfw"] =  sfw_dict.values()
    
    return mfw_df

glove_short_vectors = gensim.downloader.load('glove-twitter-25')  # you can use any type of word2vec
def word_to_vector2(words):
    try:
        vector = glove_short_vectors[words[0]]
    except:
        try:
            vector = glove_short_vectors[words[1]]
        except:
            vector = np.zeros(25)
    return vector


def add_wordvec2(df_origin, columns):
    df = df_origin.copy()
    col1, col2 = columns
    df[[f"mfw_{n}" for n in range(25)]] = list(map(word_to_vector2, zip(df[col1], df[col2])))
    return df

In [None]:
def extract_most_informative_word(df_origin):
    df = df_origin.copy()
    
    num_features = 5000
    vectorizer = TfidfVectorizer(stop_words="english",
                                 ngram_range=(1,1),
                                max_features=num_features,)
    X = vectorizer.fit_transform(df.normalized_html_raw)
    matrix = vectorizer.transform(df.normalized_html_raw)
    feature_name = vectorizer.get_feature_names()
    
    miws = []
    ids = df["id"]
    for n in tqdm(range(len(df))):
        miw_column = np.argmax(matrix[n])
        miw = feature_name[miw_column]
        miws.append(miw)
    miw_df = pd.DataFrame({"id": ids, "miw": miws})
    return miw_df


def word_to_vector(words):
    try:
        vector = glove_short_vectors[words]
    except:
        print(words)
        vector = np.zeros(25)
    return vector


def add_wordvec(df_origin):
    df = df_origin.copy()
    df[[f"miw_{n}" for n in range(25)]] = list(map(word_to_vector, df["miw"]))
    df = df.drop("miw", axis=1)
    return df

In [None]:
mfw_df = extract_most_freq_word(text_type_df)
mfw_df = add_wordvec(mfw_df)
mfw_df.to_csv('data/mfw.csv', index=False)

In [None]:
miw_df = extract_most_informative_word(text_type_df)
miw_df = add_wordvec(miw_df)
miw_df.to_csv('data/miw.csv', index=False)
miw_df