In [1]:
import pandas as pd
import re
pd.set_option('max_columns',1000)
pd.set_option('max_row',300)

In [2]:
!ls

fuzz  mdecpp  wsdm_diggsc


In [3]:
train_data = pd.read_csv("wsdm_diggsc/wsdm/data/train_release.csv")
valid_data = pd.read_csv("wsdm_diggsc/wsdm/data/validation.csv")
all_data = pd.read_csv("wsdm_diggsc/wsdm/data/candidate_paper_for_wsdm2020.csv", low_memory=False)

In [4]:
train_data['description_text_type'] = train_data['description_text'].map(lambda s: type(s))

In [5]:
train_data['description_text_type'].value_counts()

<class 'str'>      62975
<class 'float'>        1
Name: description_text_type, dtype: int64

In [6]:
train_data[train_data['description_text_type'] == float]

Unnamed: 0,description_id,paper_id,description_text,description_text_type
40889,,9082af,,<class 'float'>


In [7]:
train_data.drop(index=40889, inplace=True)

In [8]:
train_data['description_text_type'].value_counts()

<class 'str'>    62975
Name: description_text_type, dtype: int64

In [9]:
from pandarallel import pandarallel

In [10]:
pandarallel.initialize(nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [11]:
def text_preprocessing(text):
    try:
        text = str(text)
        text = text.lower()  # lower
        text = text.strip()
        text = re.sub("\d+", "", text)
        text = re.sub("\[\[\*\*##\*\*\]\]", "", text)
        text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                      'url', text, flags=re.MULTILINE)  # replace https://... to url
        text = re.sub('[\u00a3-\ufb04]', "", text)
        text = text.split()
    except AttributeError as e:
        return []
    return text

In [12]:
train_data.head()

Unnamed: 0,description_id,paper_id,description_text,description_text_type
0,77bef2,5c0f7919da562944ac759a0f,Angiogenesis is reflected as newly formed vess...,<class 'str'>
1,42360e,5c1360beda56295a0896fda3,Cardiac fibrosis is a common process in remode...,<class 'str'>
2,9bf5e0,5d1b36e83a55ac0a0e8bb84e,"Agmatine, formed by the decarboxylation of L-a...",<class 'str'>
3,22e485,5d2709fd3a55ac2cfc28108f,The ob gene product leptin has been demonstrat...,<class 'str'>
4,30856c,55a392d1c91b587b095b6fcc,"Lauterbach M et al., have concluded at the end...",<class 'str'>


In [13]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838939 entries, 0 to 838938
Data columns (total 6 columns):
abstract    769687 non-null object
journal     838938 non-null object
keywords    61245 non-null object
paper_id    838938 non-null object
title       838939 non-null object
year        604119 non-null float64
dtypes: float64(1), object(5)
memory usage: 38.4+ MB


In [14]:
all_data.head()

Unnamed: 0,abstract,journal,keywords,paper_id,title,year
0,NO_CONTENT,Journal of economic entomology,,55a38b7f2401aa93797cef61,Anopheles stephensi: effect of gamma-radiation...,1978.0
1,Ecological and population biology issues const...,"Science (New York, N.Y.)",,55a4e85b612c6b12aafdb996,The ecology of genetically modified mosquitoes.,2002.0
2,"Thiotepa and its oxygen analogue tepa, used to...",Bulletin of the World Health Organization,,55a4cc5024011b361acddf6f,Persistence of thiotepa and tepa in pupae and ...,1972.0
3,NO_CONTENT,Journal of economic entomology,,55a56b4824012c2a39230188,Studies on the eradication of Anopheles pharoe...,1967.0
4,NO_CONTENT,Journal of the American Mosquito Control Assoc...,,55a483792401032899721ae4,A method of computing the effectiveness of an ...,1987.0


In [15]:
all_data[all_data['abstract'] == 'NO_CONTENT']

Unnamed: 0,abstract,journal,keywords,paper_id,title,year
0,NO_CONTENT,Journal of economic entomology,,55a38b7f2401aa93797cef61,Anopheles stephensi: effect of gamma-radiation...,1978.0
3,NO_CONTENT,Journal of economic entomology,,55a56b4824012c2a39230188,Studies on the eradication of Anopheles pharoe...,1967.0
4,NO_CONTENT,Journal of the American Mosquito Control Assoc...,,55a483792401032899721ae4,A method of computing the effectiveness of an ...,1987.0
5,NO_CONTENT,Journal of economic entomology,,55a5645424012c2a3920ecb7,Studies on the eradication of Anopheles pharoe...,1966.0
17,NO_CONTENT,The Journal of comparative neurology,,55a539e8612c6b12ab093cb4,A cytoarchitectonic atlas of the spinal cord i...,1954.0
...,...,...,...,...,...,...
604086,NO_CONTENT,The Journal of prosthetic dentistry,,5d285d4c3a55ac2bbd3a6745,Direct procedure for connecting a mandibular i...,2004.0
604088,NO_CONTENT,The Journal of prosthetic dentistry,,5d285d4c3a55ac2bbd3a6771,Adaptation of fiber-reinforced strip using den...,2004.0
604090,NO_CONTENT,The Journal of prosthetic dentistry,,5d74d84947c8f76646cf88bf,Fabricating an inexpensive leaf gauge deprogra...,2019.0
604097,NO_CONTENT,Human psychopharmacology,,55a4eb5a612c6b12aafe4a8a,Cholinergic modulation of cognitive function i...,2001.0


In [16]:
import Levenshtein

In [17]:
all_data['title'].value_counts()[:10]

Invited commentary.                       68
Letter to the editor.                     43
Letters to the editor.                    36
Commentary.                               27
Letters to the Editor.                    25
Introduction.                             25
Case for Diagnosis.                       21
From the Food and Drug Administration.    21
Letter to the Editor.                     18
Diagnostic and therapeutic challenges.    18
Name: title, dtype: int64

In [18]:
def abstract_title_combine(abstract_text, title):
    if abstract_text == 'NO_CONTENT':
        return title
    else:
        try:        
            abstract_text = abstract_text.strip()
            title = title.strip()
            if Levenshtein.distance(abstract_text, title) >= 50:  # 差异比较大
                return title + abstract_text
            else:
                if len(abstract_text.split()) <= 1:
                    return title
                return abstract_text
        except AttributeError:
            return title

In [19]:
all_data['title_abstract'] = all_data.parallel_apply(
        lambda s: abstract_title_combine(s['abstract'], s['title']), axis=1)

all_train_merge = pd.merge(train_data, all_data, on='paper_id')
print("all_train_merge length：", len(all_train_merge))


all_train_merge length： 62975


In [20]:
labeled_data = all_train_merge
labeled_data['title_abstract'] = labeled_data['title_abstract'].parallel_map(
    text_preprocessing)
labeled_data['description_text'] = labeled_data['description_text'].parallel_map(
    text_preprocessing)
labeled_data['description_text'].dropna(inplace=True)
labeled_data['title_abstract'].dropna(inplace=True)
labeled_data = labeled_data[[
    'title_abstract', 'description_text'
]]

In [21]:
labeled_data.columns

Index(['title_abstract', 'description_text'], dtype='object')

In [22]:
labeled_data['title_abstract_len'] = labeled_data['title_abstract'].map(len)
labeled_data['description_text_len'] = labeled_data['description_text'].map(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
labeled_data['title_abstract_len'].describe()

count    62975.000000
mean       157.883970
std         95.456928
min          1.000000
25%         63.000000
50%        170.000000
75%        229.000000
max       1294.000000
Name: title_abstract_len, dtype: float64

In [24]:
labeled_data['description_text_len'].describe()

count    62975.000000
mean       182.988440
std        119.845415
min          1.000000
25%        105.000000
50%        157.000000
75%        229.000000
max       1861.000000
Name: description_text_len, dtype: float64

In [25]:
labeled_data[labeled_data['description_text_len'] == 1].values

array([[list(['combination', 'of', 'lu-psma-', 'and', 'external', 'radiotherapy', 'for', 'the', 'treatment', 'of', 'cerebral', 'metastases', 'in', 'patients', 'with', 'castration-resistant', 'metastatic', 'prostate', 'cancer.two', 'castration-resistant', 'prostate', 'cancer', 'patients,', 'both', 'with', 'cerebral', 'and', 'visceral', 'and', 'lymphatic', 'metastases,', 'received', 'multiple', 'cycles', 'of', 'lu-psma-', 'treatments.', 'the', 'prognosis', 'of', 'both', 'cases', 'is', 'dependent', 'on', 'brain', 'metastases.', 'between', 'lu-psma-', 'treatment', 'cycles,', 'local', 'radiotherapy', 'was', 'also', 'applied', 'to', 'the', 'brain', 'metastases.', 'prior', 'to', 'the', 'combined', 'therapy,', 'all', 'systemic', 'metastases,', 'including', 'cerebral', 'lesions,', 'showed', 'psma', 'expression', 'using', 'ga-psma', 'pet/ct.', 'under', 'the', 'combined', 'therapy,', 'all', 'the', 'metastases,', 'particularly', 'the', 'cerebral', 'lesions,', 'showed', 'significant', 'regression',

In [26]:
labeled_data[labeled_data['title_abstract_len'] == 1]

Unnamed: 0,title_abstract,description_text,title_abstract_len,description_text_len
31761,[optogenetics.],"[the, term, optogenetics, indicates, a, method...",1,118
48169,[serotonin.],"[our, study, has, several, limitations,, mainl...",1,216
52319,[cholinesterases.],"[this, brief, review, focuses, on, unexpected,...",1,337
53857,[attention--grapefruit!],"[another, exception, must, be, made, for, the,...",1,130
55904,[lovesickness.],"[the, arbitrariness, by, which, eros, distribu...",1,191


In [27]:
string1 = 'dsfjksdjs'
string2 = 'dsfiksjsd'
Levenshtein.distance(string1, string2)

3