# Feature Engineering

This notebook prepares data for model training.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from mlxtend.preprocessing import DenseTransformer
from sklearn.cluster import KMeans
from tqdm import tqdm

## Load Data

In [3]:
# load pre-processed data
df = pd.read_csv('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/data_cleaned.csv')
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2020,1.243352,169.971142,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1051,"['capit structur', 'corpor taxat', 'difference...",5,3
1,2020,1.243352,169.971142,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,584,"['credit spread', 'lbo risk', 'structur model'...",4,3
2,2020,1.243352,169.971142,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sale', 'liquid manag', 'mutual fund']",3,3
3,2020,1.243352,169.971142,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset price', 'leverag constraint', 'lotteri...",5,3
4,2020,1.243352,169.971142,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,744,"['gender gap', 'entrepreneurship', 'angel inve...",4,3


## Hyperparameter Tuning

In [4]:
from tqdm import tqdm
from sklearn.metrics import silhouette_score


lda_n_compo = [20, 30, 40, 50, 60]     # a list of LDA's number of components
lda_topic_word = [None, 15, 20, 25]    # a list of number of terms
svd = [8, 10, 12, 15]                  # a list of SVD's number of components
params_result = []

# iterate through every item in the lda_n_compo list
# run KMeans model for each selected set of parameters
for ldn in tqdm(lda_n_compo):
    for ldt in lda_topic_word: 
        for sn in svd:
            try:
                if sn >= ldn or ldt >= ldn:
                    continue
            except TypeError:
                pass

            # build a data pipeline
            p = [('vect', CountVectorizer()),
                 ('lda', LatentDirichletAllocation(n_components=ldn,random_state=42,topic_word_prior=ldt)),
                 ('tfidf', TfidfTransformer())]
            
            # include dimensionality reduction in the data pipeline
            if sn > 0:
                p.append(('svd', TruncatedSVD(n_components=sn)))
            p.append(('to_dense', DenseTransformer()))
            c_pipe = Pipeline(p)
            
            # create a vector representation of abstracts
            vec = c_pipe.fit_transform(df.Abstract_Cleaned)
            # normalize vector
            vec = Normalizer().fit_transform(vec)
            vec = pd.DataFrame(vec, index=None)
            vec = vec / vec.std()
            vec = vec.to_numpy()
            
            # build a KMeans model and compute Silhouette score
            model = KMeans(n_clusters=15,random_state=42)
            predicted = model.fit_predict(vec)
            score = silhouette_score(vec, predicted)
            params_result.append((ldn, ldt, sn, score))

# create a dataframe of the parameter tuning results
# the best model should have a highest Silhouette score
df_params = pd.DataFrame(params_result, columns=['LDA','LDA_topic_word','SVD','Score'])
df_params.sort_values(['Score'], ascending=False).head(10)

100%|██████████| 5/5 [30:27<00:00, 365.50s/it]


Unnamed: 0,LDA,LDA_topic_word,SVD,Score
40,50,,8,0.426826
42,50,,12,0.426752
10,30,,12,0.426279
24,40,,8,0.425627
8,30,,8,0.425115
58,60,,12,0.424852
56,60,,8,0.424584
41,50,,10,0.420403
25,40,,10,0.418945
11,30,,15,0.415673


## Build Vector Representation of Abstracts

In [5]:
# build a data pipeline
pipeline = Pipeline([('vect', CountVectorizer()),                                 # bag-of-words
                     ('lda', LatentDirichletAllocation(n_components=50,           # topic modeling
                                                       random_state=42,
                                                       topic_word_prior=None)),
                     ('tfidf', TfidfTransformer()),
                     ('svd', TruncatedSVD(n_components=10, random_state=42)),     # components reduction
                     ('to_dense', DenseTransformer())])                           # data transform

# build data vector representation of abstracts
x_vector = pipeline.fit_transform(df.Abstract_Cleaned)

In [6]:
# normalize x_vector
x_vector = Normalizer().fit_transform(x_vector)
df_x_vector = pd.DataFrame(x_vector, index=None)
df_x_vector = df_x_vector / df_x_vector.std()
x_vector = df_x_vector.to_numpy()

# display x_vector
x_vector

array([[ 3.51684492, -1.4231189 , -0.7143623 , ..., -0.22810582,
        -0.09721918, -0.05247422],
       [ 3.46905472, -1.00494984, -0.72849453, ..., -0.26782226,
        -0.20709236, -0.19767628],
       [ 1.80852848,  1.31183921,  0.47936165, ...,  0.70616742,
         2.20730492,  1.20695244],
       ...,
       [ 3.47482323, -1.48093636, -0.7217774 , ..., -0.25914228,
        -0.15969241, -0.08481553],
       [ 0.86386651,  0.34557555,  0.06065085, ...,  4.148572  ,
        -0.4945658 ,  0.08004804],
       [ 1.20078231,  1.19394158,  1.0120293 , ..., -0.26140763,
        -0.22298872, -0.35119085]])

In [7]:
# save x_vector for model development
with open('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/x_vector.npy', 'wb') as file:
    np.save(file, x_vector)

### Build Terms Matrix

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
import json

# create TF-IDF transformer
tfidf = TfidfVectorizer()
tfidf.fit(df.Abstract_Cleaned)

# build terms matrix for entire corpus
terms_sparse_matrix = tfidf.transform(df.Abstract_Cleaned)

# save terms sparse matrix
# to load sparse matrix: sparse_matrix = scipy.sparse.load_npz('data/terms_sparse_matrix.npz')
scipy.sparse.save_npz('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/terms_sparse_matrix.npz', terms_sparse_matrix)

# get term label for each item in the term matrix and save result to a text file
terms_label = tfidf.get_feature_names()

# save term labels
with open("/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/terms_label.txt", "w") as fp:
    json.dump(terms_label, fp)

