In [147]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import pandas as pd

In [148]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,id,created_on,title,description,tag
0,6,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,computer-vision
1,7,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,computer-vision
2,9,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",other
3,15,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,other
4,25,2020-03-07 23:04:31,AttentionWalk,"A PyTorch Implementation of ""Watch Your Step: ...",other


### Feature Engineering

In [149]:
df['text'] = df['title'] + ' ' + df['description']
df['text'].head()

0    Comparison between YOLO and RCNN on real world...
1    Show, Infer & Tell: Contextual Inference for C...
2    Awesome Graph Classification A collection of i...
3    Awesome Monte Carlo Tree Search A curated list...
4    AttentionWalk A PyTorch Implementation of "Wat...
Name: text, dtype: object

In [150]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Cleaning

In [151]:
def clean_text(text, stopwords=STOPWORDS):
    """Clean raw text string."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r'\b(' + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub('', text)
 
    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends
    text = re.sub(r"http\S+", "", text)  #  remove links

    return text

In [152]:
# Apply to dataframe

original_df = df.copy()
df.text = df.text.apply(clean_text)

print (f"Original text: {original_df.text.values[0]} \nCleaned text: {df.text.values[0]}")

Original text: Comparison between YOLO and RCNN on real world videos Bringing theory to experiment is cool. We can easily train models in colab and find the results in minutes. 
Cleaned text: comparison yolo rcnn real world videos bringing theory experiment cool easily train models colab find results minutes


In [153]:

df = df.drop(columns=['id', 'created_on', 'description', 'title'], errors='ignore')
df = df.dropna(subset=['tag'])
df = df[['text', 'tag']]
df.head()

Unnamed: 0,text,tag
0,comparison yolo rcnn real world videos bringin...,computer-vision
1,show infer tell contextual inference creative ...,computer-vision
2,awesome graph classification collection import...,other
3,awesome monte carlo tree search curated list m...,other
4,attentionwalk pytorch implementation watch ste...,other


### Encoding

In [154]:
tags = df.tag.unique().tolist()
text_to_idx = {tag:i for i, tag in enumerate(tags)}
text_to_idx

{'computer-vision': 0,
 'other': 1,
 'natural-language-processing': 2,
 'mlops': 3}

In [155]:
# Encode labels -> label encoding

df.tag = df.tag.map(text_to_idx)
df.head()

Unnamed: 0,text,tag
0,comparison yolo rcnn real world videos bringin...,0
1,show infer tell contextual inference creative ...,0
2,awesome graph classification collection import...,1
3,awesome monte carlo tree search curated list m...,1
4,attentionwalk pytorch implementation watch ste...,1


### Decode

In [156]:
def decode(indices, idx_to_label):
    return [idx_to_label[index] for index in indices]


In [157]:
idx_to_label = {k:v for v, k in text_to_idx.items()}
df.tag = decode(df.tag.values, idx_to_label=idx_to_label)
df.head()

Unnamed: 0,text,tag
0,comparison yolo rcnn real world videos bringin...,computer-vision
1,show infer tell contextual inference creative ...,computer-vision
2,awesome graph classification collection import...,other
3,awesome monte carlo tree search curated list m...,other
4,attentionwalk pytorch implementation watch ste...,other


### Tokenizer

In [158]:
import numpy as np
from transformers import BertTokenizer

In [159]:
# Bert tokenizer

tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
text = "Transfer learning with transformers for text classification."
encoded_inputs = tokenizer([text], return_tensors="np", padding="longest")  # pad to longest item in batch
print ("input_ids:", encoded_inputs["input_ids"])
print ("attention_mask:", encoded_inputs["attention_mask"])
print (tokenizer.decode(encoded_inputs["input_ids"][0]))

input_ids: [[  102  2268  1904   190 29155   168  3267  2998   205   103]]
attention_mask: [[1 1 1 1 1 1 1 1 1 1]]
[CLS] transfer learning with transformers for text classification. [SEP]


In [160]:
def tokenize(batch):
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    encoded_inputs = tokenizer(batch["text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["tag"]))

In [161]:
tokenize(df.head(1))

{'ids': array([[  102,  2029,  1778,   609,  6446,  4857,  1332,  2399, 13572,
         19125,  1983,  1954,  6240,  3717,  7434,  1262,   537,   201,
          1040,   545,  4714,   103]]),
 'masks': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'targets': array(['computer-vision'], dtype=object)}

### Perform all above operations inside a function

In [162]:
def preprocess(df, class_to_index):
    """Preprocess the data."""
    df["text"] = df.title + " " + df.description  # feature engineering
    df["text"] = df.text.apply(clean_text)  # clean text
    df = df.drop(columns=["id", "created_on", "title", "description"], errors="ignore")  # clean dataframe
    df = df.dropna(subset=['tag']) # drop na  from tags
    df = df[["text", "tag"]]  # rearrange columns
    df["tag"] = df["tag"].map(class_to_index)  # label encoding
    outputs = tokenize(df)
    return outputs

In [163]:
preprocess(original_df.head(1), class_to_index= text_to_idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df.title + " " + df.description  # feature engineering
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df.text.apply(clean_text)  # clean text


{'ids': array([[  102,  2029,  1778,   609,  6446,  4857,  1332,  2399, 13572,
         19125,  1983,  1954,  6240,  3717,  7434,  1262,   537,   201,
          1040,   545,  4714,   103]]),
 'masks': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'targets': array([0], dtype=int64)}