## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [2]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from datetime import datetime
import nltk
from nltk.stem import WordNetLemmatizer

#### 1. To load the datasets

In [None]:
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


In [None]:
train_data.head()

#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel

In [29]:
# to get the attribute of the 'title', 'year/month/day/hour/minute/second', 'num_img', 'num_video', 'author name', 'topic', 'channel'

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')

    # 1. to find the 'title' (body > h1)
    title = soup.find('h1').string.strip().lower()

    # 2. to find time(body > div > span > time)
    date_string = soup.find('time')['datetime'].strip().lower()
    date = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z")

    year = date.year
    month = date.month
    day = date.day
    hour = date.hour
    minute = date.minute
    second = date.second

    

    # 3. to find the number of images
    num_img  = len(soup.find_all('img'))
    

    # 4. to find the number of videos
    num_video = len(soup.find_all('img'))+len(soup.find_all('iframe'))
    

    # 5. to find the author name
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if (author != None):
        if (author.find('a') != None):
            author = author.find('a')
            author_name = author.get_text().lower()
        else :
            author_name = author.get_text().lower()
    else :
        author_name = 'not found'
    
    

    # 6. to find the article topic
    footer = soup.find('footer', class_='article-topics')
    topics_text = footer.get_text().split(': ')[1]
    topic = [topic.strip() for topic in topics_text.split(',')]

    
    # 7. to find the channel
    channel = soup.find('article')['data-channel'].strip().lower()
    
    '''
    print('title = ', title, type(title))
    print('time = ', year, "/", month, "/",day, " ",hour, ":",minute, ":",second, type(year))
    print('number of images = ', num_img, type(num_img))
    print('number of videos = ', num_video, type(num_video))
    print('author_name = ', author_name, type(author_name))
    print('topic = ', topic, type(topic))
    print('channel = ', channel, type(channel))
    '''
    
    return title, year, month, day, hour, minute, second, num_img, num_video, author_name, topic, channel

In [31]:
feature_list = []

for content in (train_data['Page content']):
    feature_list.append(preprocessor(content))

df = pd.DataFrame(
        feature_list, 
        columns=['title', 'year', 'month', 'day', 'hour' ,'minute', 'second', 'num_imgs', 'num_video', 'author name', 'topics', 'channel'])

df.head()

Unnamed: 0,title,year,month,day,hour,minute,second,num_imgs,num_video,author name,topics,channel
0,nasa's grand challenge: stop asteroids from de...,2013,6,19,15,4,30,1,1,clara moskowitz,"[Asteroid, Asteroids, challenge, Earth, Space,...",world
1,google's new open source patent pledge: we won...,2013,3,28,17,40,55,2,2,christina warren,"[Apps and Software, Google, open source, opn p...",tech
2,ballin': 2014 nfl draft picks get to choose th...,2014,5,7,19,15,20,2,27,sam laird,"[Entertainment, NFL, NFL Draft, Sports, Televi...",entertainment
3,cameraperson fails deliver slapstick laughs,2013,10,11,2,26,50,1,22,sam laird,"[Sports, Video, Videos, Watercooler]",watercooler
4,nfl star helps young fan prove friendship with...,2014,4,17,3,31,43,52,53,connor finnegan,"[Entertainment, instagram, instagram video, NF...",entertainment


### 3-1.  Preprocessing - tokenlization

* tokenlization: to split the text corpora into individual elements.

In [None]:
def tokenlization(text):
    return re.split('\s', text.strip())

### 3-2.  Preprocessing - word stemming

* word stemming:  a process that transforms words into their root forms and allows us to map related words to the same stem.

* There are 2 kinds of word stemming:
    1. PorterStemmer(Stemming): rule-based, will face the problem of overstemming
    2. WordNetLemmatizer(Lemmatization): more precise, but slower

In [None]:
def word_stemming(text):
    lm = WordNetLemmatizer()
    return [lm.stem(word) for word in re.spilt('\s', text.strip())]

### 3-3 Preprocessing - Stop words removal

刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文本分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及數據集的特性來進行決策。

In [None]:
def stop_words_removal(text):
    return

### 4. Model training
