## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from datetime import datetime

#### 1. To load the datasets

In [None]:
# Read the train and test splits (paths are relative to the notebook's
# working directory) into two DataFrames in a single pass.
train_data, test_data = map(
    pd.read_csv,
    ('../dataset/train.csv', '../dataset/test.csv'),
)


In [None]:
# Preview the first rows of the training set as a quick sanity check on the load.
train_data.head()

#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time (year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel

In [29]:
# Extract 'title', 'year/month/day/hour/minute/second', 'num_img', 'num_video', 'author name', 'topic' and 'channel' from an article's HTML

def _extract_author_name(soup):
    """Return the lower-cased author name, or 'not found' when it is absent."""
    article_info = soup.find('div', class_='article-info')
    if article_info is None:
        return 'not found'

    author = (article_info.find('span', class_='author_name')
              or article_info.find('span', class_='byline basic'))
    if author is None:
        return 'not found'

    # Prefer the text of the nested link when the name is wrapped in <a>.
    link = author.find('a')
    if link is not None:
        author = link
    return author.get_text().lower()


def _extract_topics(soup):
    """Return the stripped topic names from the topics footer, or [] if absent."""
    footer = soup.find('footer', class_='article-topics')
    if footer is None:
        return []

    # Split only on the first ': ' so that the label is dropped but a topic
    # that itself contains ': ' does not truncate the rest of the list.
    _, separator, topics_text = footer.get_text().partition(': ')
    if not separator:
        return []
    return [topic.strip() for topic in topics_text.split(',')]


def preprocessor(text):
    """Extract the hand-picked features from one article's HTML.

    Args:
        text: HTML source of a single article page.

    Returns:
        A 12-tuple
        ``(title, year, month, day, hour, minute, second, num_img,
        num_video, author_name, topic, channel)`` where

        * ``title`` is the stripped, lower-cased text of the first ``<h1>``;
        * ``year`` .. ``second`` are the integer fields of the first
          ``<time>`` element's ``datetime`` attribute, as written in that
          attribute (no time-zone conversion is applied);
        * ``num_img`` is the number of ``<img>`` tags;
        * ``num_video`` is the number of ``<video>`` plus ``<iframe>`` tags;
        * ``author_name`` is the lower-cased author text, or ``'not found'``;
        * ``topic`` is a list of stripped topic strings (possibly empty);
        * ``channel`` is the stripped, lower-cased ``data-channel`` attribute
          of the first ``<article>`` element.

    Raises:
        AttributeError: if the page has no ``<h1>`` element.
        TypeError: if the page has no ``<time>`` or ``<article>`` element.
        KeyError: if the required ``datetime`` / ``data-channel`` attribute
            is missing from its element.
        ValueError: if the ``datetime`` attribute does not match
            ``"%a, %d %b %Y %H:%M:%S %z"``.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # 1. title (body > h1). get_text() is used instead of .string because
    #    .string is None whenever the heading contains nested markup.
    title = soup.find('h1').get_text().strip().lower()

    # 2. publication time (body > div > span > time)
    date_string = soup.find('time')['datetime'].strip().lower()
    date = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z")

    # 3. number of images
    num_img = len(soup.find_all('img'))

    # 4. number of videos: native <video> tags plus embedded <iframe> players.
    #    (<img> tags are already counted in num_img and must not be counted
    #    here again.)
    num_video = len(soup.find_all('video')) + len(soup.find_all('iframe'))

    # 5. author name
    author_name = _extract_author_name(soup)

    # 6. article topics
    topic = _extract_topics(soup)

    # 7. channel
    channel = soup.find('article')['data-channel'].strip().lower()

    return (title, date.year, date.month, date.day, date.hour, date.minute,
            date.second, num_img, num_video, author_name, topic, channel)

In [31]:
# Run the extractor over every training article; each call yields one row tuple.
feature_list = [preprocessor(page) for page in train_data['Page content']]

# Assemble the rows into a DataFrame; the column order mirrors the tuple
# returned by preprocessor().
df = pd.DataFrame(
    feature_list,
    columns=[
        'title',
        'year',
        'month',
        'day',
        'hour',
        'minute',
        'second',
        'num_imgs',
        'num_video',
        'author name',
        'topics',
        'channel',
    ],
)

df.head()

### 3. Preprocessing