# Feature Extraction
- Read data
- Preprocess
- Word count (bag of words)
- TF-IDF
- n-grams (TF-IDF with bigrams)
- Word2Vec

### Read data

In [117]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the translated review texts and flatten the frame into a 1-D array of
# documents (assumes the CSV holds a single text column — TODO confirm).
df = pd.read_csv('translated_data.csv')
X_trans = df.to_numpy().flatten()

# Read target data: one [voted_up, early_access] pair per JSON-lines record.
import json_lines
with open('data.txt', 'rb') as f:
    yz = [[item['voted_up'], item['early_access']]
          for item in json_lines.reader(f)]

# A single 70/30 split keeps the texts and both targets aligned row by row.
X_train, X_test, yz_train, yz_test = train_test_split(X_trans, yz, test_size=0.3)

# Separate the two target columns of each split (y = voted_up, z = early_access).
yz_train_split = np.hsplit(np.array(yz_train), 2)
y_train, z_train = yz_train_split

yz_test_split = np.hsplit(np.array(yz_test), 2)
y_test, z_test = yz_test_split

# Persist the targets next to the feature matrices.
for path, target in (
    ('./features/y_train', y_train),
    ('./features/y_test', y_test),
    ('./features/z_train', z_train),
    ('./features/z_test', z_test),
):
    np.save(path, target)

### Preprocess

In [104]:
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

def preprocess(text_data):
    """Normalise raw review texts into space-separated stemmed tokens.

    Every document is lower-cased, has its emoticons replaced by placeholder
    words, is stripped of punctuation, tokenized, filtered for English stop
    words (except 'very' and 'not', which are kept) and Porter-stemmed.

    Args:
        text_data: iterable of document strings.

    Returns:
        A list with one string per input document, in input order. Each
        string holds the document's remaining stems joined by single spaces.
        An empty input yields an empty list.
    """
    # custom punctuation filter does not include ':', '(' and ')' for emojis
    PUNCTUATION = '!"#$%&\'*+,-./;<=>?@[\\]^_`{|}~'

    documents = list(text_data)
    if not documents:
        # Nothing to process: skip loading the stop-word corpus and stemmer.
        return []

    # Loop-invariant resources: build them once per call instead of once per
    # document, and use sets so membership tests are O(1).
    punctuation = frozenset(PUNCTUATION)
    stop_words = set(stopwords.words('english')) - {'very', 'not'}
    porter = PorterStemmer()

    def encode_emojis(s):
        # Must run before punctuation removal: '<' is in PUNCTUATION, so
        # '<3' would otherwise be destroyed before it can be encoded.
        s = re.sub(r'♥+', 'profanity', s)
        s = s.replace(':)', 'smiley')
        s = s.replace(':(', 'frowney')
        s = s.replace('<3', 'heart')
        return s

    def remove_punc(s):
        return "".join(char for char in s if char not in punctuation)

    def clean(doc):
        # Tokenize, drop stop words, stem, and re-join in a single pass.
        text = remove_punc(encode_emojis(doc))
        return ' '.join(
            porter.stem(word)
            for word in word_tokenize(text)
            if word not in stop_words
        )

    # Represent the result as a single list of documents, each a string of
    # space separated stemmed tokens.
    return [clean(doc) for doc in map(str.lower, documents)]

### Word Count

In [118]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp

# Clean both splits up front.
X_train_prep = preprocess(X_train)
X_test_prep = preprocess(X_test)

# Bag-of-words counts, capped at 3500 features. The vocabulary is learned
# from the training split only and then applied unchanged to the test split.
vectorizer = CountVectorizer(max_features=3500)
X_train_count = vectorizer.fit_transform(X_train_prep)
X_test_count = vectorizer.transform(X_test_prep)

# Persist both sparse matrices.
sp.save_npz('./features/train_count.npz', X_train_count)
sp.save_npz('./features/test_count.npz', X_test_count)

### TF-IDF encoding

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

# Clean both splits up front.
X_train_prep = preprocess(X_train)
X_test_prep = preprocess(X_test)

# TF-IDF weights, capped at 3500 features. The vocabulary and IDF statistics
# come from the training split only and are then applied to the test split.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train_prep)
X_test_tfidf = tfidf.transform(X_test_prep)

# Persist both sparse matrices.
sp.save_npz('./features/train_tfidf.npz', X_train_tfidf)
sp.save_npz('./features/test_tfidf.npz', X_test_tfidf)

In [122]:
# TF-IDF with bigrams
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean both splits up front.
X_train_prep = preprocess(X_train)
X_test_prep = preprocess(X_test)

# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), capped at 3500
# features; fitted on the training split only, then applied to the test split.
tfidf_bigram = TfidfVectorizer(max_features=3500, ngram_range=(1,2))
X_train_tfidf_bigram = tfidf_bigram.fit_transform(X_train_prep)
X_test_tfidf_bigram = tfidf_bigram.transform(X_test_prep)

# Persist both sparse matrices.
sp.save_npz('./features/train_tfidf_bigram.npz', X_train_tfidf_bigram)
sp.save_npz('./features/test_tfidf_bigram.npz', X_test_tfidf_bigram)

### Word2Vec

In [149]:
from gensim.models import Word2Vec

# Train 50-dimensional word embeddings on the stemmed corpus; min_count=3
# drops tokens that occur fewer than three times.
# NOTE(review): X_stemmed is not defined in any cell shown here — presumably
# per-document token lists left over from an earlier kernel session; confirm
# before re-running from a clean kernel.
# NOTE(review): `size` looks like the gensim 3.x keyword (later releases call
# it `vector_size`) — confirm against the installed gensim version.
w2v = Word2Vec(X_stemmed, min_count=3, size=50)
print(w2v)

Word2Vec(vocab=4614, size=50, alpha=0.025)


In [150]:
def doc_to_vec(tokens, model=None, dim=50):
    """Return the mean word vector of a tokenized document.

    Args:
        tokens: iterable of token strings.
        model: source of word vectors. Either something indexable by token
            that raises KeyError for unknown tokens, or an object exposing
            such a mapping as its ``wv`` attribute. Defaults to the
            module-level ``w2v`` model.
        dim: length of the zero vector returned when no token is known;
            it should match the model's vector size.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all
        tokens found in the model, or ``np.zeros(dim)`` if none was found.
    """
    if model is None:
        model = w2v
    # Look vectors up through the keyed-vectors view when the model has one;
    # plain mappings (or keyed vectors themselves) are used as they are.
    vectors = getattr(model, 'wv', model)

    vecs = []
    for t in tokens:
        try:
            vecs.append(vectors[t])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error is a real
            # problem and must not be silently turned into a zero vector.
            continue

    if not vecs:
        return np.zeros(dim)
    return np.array(vecs).mean(axis=0)

# Embed every document as a single fixed-length vector via doc_to_vec.
# NOTE(review): X_stemmed is not defined in the visible cells — presumably a
# list of per-document token lists; confirm before re-running.
X_w2v = [doc_to_vec(d) for d in X_stemmed]

  """


In [None]:
# Display the document vectors for a quick visual check.
X_w2v

In [154]:
# Persist the document vectors; np.save converts the list of arrays to a
# single array before writing it.
np.save('./features/w2v_50.npy', X_w2v)