# Text feature engineering: turning unstructured text into structured features

In [None]:
import re
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
# Download the course data archive and unpack it into the working directory
# (IPython shell escapes; -q keeps both commands quiet).
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part1_2.zip
!unzip -q NLP_part1_2.zip

In [None]:
# Load the preprocessed articles into a DataFrame.
csv_path = 'Data/article_preprocessed.csv'
df = pd.read_csv(csv_path)

In [None]:
# Restore the pickled 'article_cutted' object into `sentences`.
# NOTE: pickle.load can execute arbitrary code; only unpickle files from a
# trusted source.
with open("Data/article_cutted", "rb") as cutted_file:
    sentences = pickle.load(cutted_file)

## define y (push > boo)

In [None]:
# Keep only articles whose push/boo gap is strictly larger than the
# threshold; rows with a smaller gap are dropped.
diff_threshold = 20
vote_gap = abs(df['push'] - df['boo'])
df = df.loc[vote_gap > diff_threshold].copy()

In [None]:
# Target label y: (push - boo) clipped to the range [0, 1], so a negative
# gap becomes 0 and a gap of 1 or more becomes 1.
net_votes = df['push'] - df['boo']
df['type'] = np.clip(net_votes, 0, 1)

# Renumber the rows 0..n-1 and discard the old index.
df = df.reset_index(drop=True)

In [None]:
# Show how many rows fall into each label value.
labels = df['type']
labels.value_counts()

## simple feature

In [None]:
# Word count = number of alphanumeric runs + number of CJK characters
# (each character in U+4E00..U+9FFF counts as one word).
# http://blog.csdn.net/gatieme/article/details/43235791 (Chinese regular expressions)
content = df['content']
alnum_runs = content.str.count('[a-zA-Z0-9]+')
cjk_chars = content.str.count('[\u4e00-\u9fff]')
df['word_count'] = alnum_runs + cjk_chars

In [None]:
# Punctuation count.
# Remove every word character and whitespace character from the content;
# whatever is left (punctuation and other symbols) is stored temporarily in
# 'punctuation' and its length becomes 'punctuation_count'.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text unchanged and make
# the count equal to the full text length. The raw string avoids the invalid
# '\w' escape-sequence warning.
df['punctuation'] = df['content'].str.replace(r'[\w\s]', '', regex=True)
df['punctuation_count'] = df['punctuation'].str.len()

In [None]:
# Number of question marks per article, counting both the ASCII '?' and the
# full-width '？' in the punctuation-only text.
question_marks = df['punctuation'].str.count('[?？]')
df['question_count'] = question_marks

In [None]:
# Remove the temporary 'punctuation' column.
df = df.drop(columns=['punctuation'])

In [None]:
# Preview the first five rows of the last four columns.
df.iloc[:, -4:].head(5)

In [None]:
# Pairwise correlation between the last four columns.
last_four = df.iloc[:, -4:]
last_four.corr()

## bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Define the transformer and build the document-term count matrix.
# Each entry of `sentences` is joined into one space-separated string.
# NOTE: per the scikit-learn docs, the default token_pattern keeps only
# tokens of two or more word characters, so single-character tokens are
# dropped.
documents = [' '.join(tokens) for tokens in sentences]
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(documents)

In [None]:
# Display the matrix returned by fit_transform.
count

In [None]:
# Persist the fitted vectorizer together with the count matrix as a
# two-element list in pickle format.
with open("Data/article_count", "wb") as count_file:
    pickle.dump([vectorizer, count], count_file)

### select the 10 most frequent words

In [None]:
# Invert the vocabulary (word -> id) into a lookup table id -> word.
id2word = {
    word_id: word
    for word, word_id in vectorizer.vocabulary_.items()
}

In [None]:
# Column-wise sum of the count matrix: total occurrences of each word.
# The first row of the 2-D result is taken to get a flat array.
column_totals = count.sum(axis=0)
sum_ = np.array(column_totals)[0]

In [None]:
# Word ids of the 10 largest totals, most frequent first
# (argsort is ascending, so reverse it before taking the first 10).
ascending_ids = sum_.argsort()
most_sum_id = ascending_ids[::-1][:10].tolist()
most_sum_id

In [None]:
# Translate the top-10 word ids back into the words themselves.
features = [id2word[word_id] for word_id in most_sum_id]
features

In [None]:
# Build a dense table of the top-10 word counts for the rows selected by
# df['idx'] and show the first five rows.
# NOTE(review): 'idx' presumably holds each article's row position in
# `count`; confirm against the preprocessing step.
selected_rows = count[df['idx'], :]
top_word_counts = selected_rows[:, most_sum_id]
data = pd.DataFrame(top_word_counts.toarray(), columns=features)
data.head(5)

In [None]:
# Show the content of the first article for comparison with the counts.
df['content'].iloc[0]

In [None]:
# Put the label column in front of the word-count columns and compute the
# pairwise correlations.
labels = df['type']
data = pd.concat([labels, data], axis=1)
data.corr()

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Define the transformer and build the tf-idf matrix.
# norm=None turns off the vectorizer's normalisation step.
documents = [' '.join(tokens) for tokens in sentences]
vectorizer = TfidfVectorizer(norm=None)
tfidf = vectorizer.fit_transform(documents)

In [None]:
# Persist the fitted vectorizer together with the tf-idf matrix as a
# two-element list in pickle format.
with open("Data/article_tfidf", "wb") as tfidf_file:
    pickle.dump([vectorizer, tfidf], tfidf_file)

### select the 10 words with the highest average tf-idf

In [None]:
# Invert the vocabulary (word -> id) into a lookup table id -> word.
id2word = {
    word_id: word
    for word, word_id in vectorizer.vocabulary_.items()
}

In [None]:
# Column-wise average tf-idf of each word, taken over the documents in
# which the word has a non-zero value.
doc_freq = (tfidf != 0).sum(axis=0)
avg = tfidf.sum(axis=0) / doc_freq

# Zero out words whose document frequency (number of documents with a
# non-zero value) is below 20.
avg[doc_freq < 20] = 0

In [None]:
# Convert to a plain ndarray and keep its first row as a flat array.
avg_as_array = np.array(avg)
avg = avg_as_array[0]

In [None]:
# Word ids of the 10 largest average tf-idf values, highest first
# (argsort is ascending, so reverse it before taking the first 10).
ascending_ids = avg.argsort()
most_avg_id = ascending_ids[::-1][:10].tolist()
most_avg_id

In [None]:
# Translate the top-10 tf-idf word ids back into the words themselves.
features = [id2word[word_id] for word_id in most_avg_id]
features

In [None]:
# Build a dense table of the top-10 tf-idf columns for the rows selected by
# df['idx'] and show the first five rows.
# NOTE(review): 'idx' presumably holds each article's row position in
# `tfidf`; confirm against the preprocessing step.
selected_rows = tfidf[df['idx'], :]
top_word_tfidf = selected_rows[:, most_avg_id]
data = pd.DataFrame(top_word_tfidf.toarray(), columns=features)
data.head(5)

In [None]:
# Put the label column in front of the tf-idf columns and compute the
# pairwise correlations.
labels = df['type']
data = pd.concat([labels, data], axis=1)
data.corr()