# Biased Clustering

This notebook uses time-biased clustering to detect trends in financial journals.

In [33]:
# Colab-only: mount Google Drive at /content/drive so that the later cells
# can read the project files stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
import sys

# Add the project folder to the import path so that project-local modules can
# be imported by the next cell. The folder is spelled 'MyDrive' to match every
# other path in this notebook, and the append is skipped when the cell is
# re-run so that sys.path does not accumulate duplicate entries.
PROJECT_DIR = '/content/drive/MyDrive/Trending-Topics-Dashboard-main'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [35]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from Biased_Clusters import get_clusters_dist, get_clusters_timeline, get_top_keywords, get_silhouette, cal_cluster_bias

In [36]:
# Read the cleaned data table from the project's data folder on Drive,
# then preview its first rows.
cleaned_csv = '/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/data_cleaned.csv'
df = pd.read_csv(cleaned_csv)
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2020,1.243352,169.971142,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capit structur', 'corpor taxat', 'difference...",5,3
1,2020,1.243352,169.971142,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spread', 'lbo risk', 'structur model'...",4,3
2,2020,1.243352,169.971142,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sale', 'liquid manag', 'mutual fund']",3,3
3,2020,1.243352,169.971142,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset price', 'leverag constraint', 'lotteri...",5,3
4,2020,1.243352,169.971142,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3


## Biased Clustering

In [37]:
# build trend score table

# load training data
x_vector = np.load('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/x_vector.npy')

# `data` collects one get_silhouette result per bias amount that succeeded;
# `skipped` collects a (bias, message) pair per bias amount that raised.
data = []
skipped = []

# get Silhouette score, std year for various bias amount (0.01, 0.11, ..., 9.91)
for step in tqdm(range(1, 1000, 10)):
    bias = step * .01

    try:
        data.append(get_silhouette(df, x_vector, bias))
    except ValueError as err:
        # Best-effort sweep: a failing bias amount is still skipped, but it is
        # recorded so that gaps in the result table are visible, not silent.
        skipped.append((bias, str(err)))

# report the bias amounts that were skipped, if any
if skipped:
    print(f"Skipped {len(skipped)} of {len(skipped) + len(data)} bias amounts (ValueError):")
    for skipped_bias, message in skipped:
        print(f"  bias={skipped_bias:.2f}: {message}")

# create a dataframe to store the results
df_result = pd.DataFrame(data)

# save df_result to csv file
#df_result.to_csv('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/table_trend_score.csv', index=False)

# display the first 5 rows of df_result
df_result.head()

100%|██████████| 100/100 [14:08<00:00,  8.48s/it]


Unnamed: 0,step,Silhouette Score,silhouette_clsuter_1,std_year_cluster_1,silhouette_by_std_year_cluster_1,silhouette_clsuter_2,std_year_cluster_2,silhouette_by_std_year_cluster_2,silhouette_clsuter_3,std_year_cluster_3,silhouette_by_std_year_cluster_3,silhouette_clsuter_4,std_year_cluster_4,silhouette_by_std_year_cluster_4,silhouette_clsuter_5,std_year_cluster_5,silhouette_by_std_year_cluster_5,silhouette_clsuter_6,std_year_cluster_6,silhouette_by_std_year_cluster_6,silhouette_clsuter_7,std_year_cluster_7,silhouette_by_std_year_cluster_7,silhouette_clsuter_8,std_year_cluster_8,silhouette_by_std_year_cluster_8,silhouette_clsuter_9,std_year_cluster_9,silhouette_by_std_year_cluster_9,silhouette_clsuter_10,std_year_cluster_10,silhouette_by_std_year_cluster_10,silhouette_clsuter_11,std_year_cluster_11,silhouette_by_std_year_cluster_11,silhouette_clsuter_12,std_year_cluster_12,silhouette_by_std_year_cluster_12,silhouette_clsuter_13,std_year_cluster_13,silhouette_by_std_year_cluster_13,silhouette_clsuter_14,std_year_cluster_14,silhouette_by_std_year_cluster_14,silhouette_clsuter_15,std_year_cluster_15,silhouette_by_std_year_cluster_15,std_tfidf,avg_std,avg_std_year,avg_silhouette_by_std_year
0,0.01,0.426976,0.639289,1.051124,0.608196,0.111964,0.961931,0.116395,0.469448,1.039308,0.451693,0.536934,0.737362,0.728183,0.206869,1.048603,0.197281,0.318409,0.941573,0.338167,0.484767,0.842938,0.575092,0.491296,0.827417,0.59377,0.50901,0.832527,0.611403,0.170283,1.258417,0.135315,0.50692,0.933791,0.542863,0.438857,0.995484,0.440848,0.351704,0.805251,0.436764,0.349458,0.866305,0.403389,0.169355,1.166504,0.145181,1.126788,0.153826,0.953902,0.421636
1,0.11,0.387994,0.304572,0.908266,0.335334,-0.023545,1.052159,-0.022378,0.200095,1.158656,0.172696,0.232185,1.184489,0.196021,0.515727,0.838919,0.614752,0.493761,0.941886,0.524226,0.121806,0.936164,0.130112,0.479121,0.839863,0.570475,0.175052,1.166866,0.150019,0.437962,0.995484,0.439949,0.358807,0.808681,0.443694,0.545578,1.061642,0.5139,0.450371,1.048909,0.429371,0.535182,0.737362,0.725806,0.485731,0.823872,0.589571,1.118986,0.170104,0.966881,0.38757
2,0.21,0.428537,0.302718,1.163803,0.260111,0.634955,1.049658,0.604916,0.32905,0.821603,0.400498,0.15686,1.288925,0.121698,0.0895,1.115216,0.080253,0.511574,0.834247,0.613216,0.497876,0.9305,0.535063,0.460629,1.047158,0.439884,0.475062,0.820219,0.579189,0.525566,0.73471,0.715338,0.425032,0.989656,0.429475,0.35243,0.856304,0.411572,0.474541,0.841448,0.563957,0.274199,0.94152,0.29123,0.213754,1.0822,0.197518,1.127641,0.147469,0.967811,0.416261
3,0.31,0.424374,0.307023,0.924397,0.332134,0.46794,0.852444,0.54894,0.638029,1.050148,0.60756,0.341838,0.796989,0.428912,0.171558,1.184607,0.144823,0.52485,0.73471,0.714363,0.106301,1.149564,0.09247,0.439459,0.995484,0.441453,0.49863,0.943511,0.528483,0.240466,1.096561,0.219291,0.472631,0.822033,0.574953,0.508344,0.816247,0.622782,0.477128,1.040679,0.458478,0.232174,1.016728,0.228354,0.197083,1.192212,0.165308,1.123022,0.152079,0.974421,0.40722
4,0.41,0.424526,0.635113,1.050906,0.604348,0.304386,1.171131,0.259907,0.476076,0.852456,0.558476,0.403633,0.765525,0.527263,0.511464,0.809812,0.631584,0.495182,0.827417,0.598468,0.456696,1.049305,0.435237,0.286274,0.924942,0.309505,0.429472,0.989656,0.433961,0.507809,0.932151,0.544771,0.537625,0.737362,0.729119,0.131165,1.106286,0.118563,0.09341,1.261849,0.074026,0.089671,0.941426,0.09525,0.234744,1.052039,0.223133,1.123121,0.166955,0.964817,0.409574


In [38]:
import json
import scipy.sparse

# Paths of the saved terms matrix and its labels in the Drive data folder.
terms_matrix_path = '/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/terms_sparse_matrix.npz'
terms_label_path = "/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/terms_label.txt"

# Read the sparse terms matrix from its .npz file.
terms_sparse_matrix = scipy.sparse.load_npz(terms_matrix_path)

# The labels file is parsed as JSON despite its .txt extension.
with open(terms_label_path, "r") as label_file:
    terms_label = json.load(label_file)

# Expand the matrix into a dense DataFrame; the variable keeps its original
# name even though the data is no longer sparse.
dense_terms = terms_sparse_matrix.toarray()
terms_sparse_matrix = pd.DataFrame(dense_terms)

# Run the clustering with 0.57 as the last argument.
# NOTE(review): presumably the bias amount picked from the sweep above — confirm.
summary, predictions = cal_cluster_bias(
    df, x_vector, terms_sparse_matrix, terms_label, 0.57
)

In [39]:
# Display the summary table returned by cal_cluster_bias (one row per Topic Id).
summary

Unnamed: 0,Topic Id,Terms,Timeline,Number of Articles,Article %,Trend Score,Silhouette Score,Bias Avg Std Year
0,1,"bond, privat, risk, extern, lender, increas, f...","1975, 1979, 1981-2020",212,7.130844,0.562223,0.468861,0.833942
1,2,"bond, firm, rate, predict, find, use, relat, v...",1974-2020,812,27.312479,0.255452,0.25729,1.007196
2,3,"time, tradeoff, invest, shyamsund, bias, close...","1994, 1998, 2000, 2003, 2007, 2011, 2014, 2019...",9,0.302725,1.043938,0.770377,0.737953
3,4,"abnorm, use, option, find, posit, news, repurc...",1974-2020,280,9.418096,0.10298,0.124088,1.204965
4,5,"cds, show, bond, counterparti, good, risk, ori...","2000, 2005, 2008, 2010, 2012-2015, 2017, 2019-...",17,0.571813,1.506494,0.699214,0.464133
5,6,"use, show, flow, effect, model, paper, market,...","1976-1977, 1979-1986, 1989, 1992, 1995-1997, 2...",117,3.935419,0.359014,0.361255,1.00624
6,7,"find, higher, manag, liquid, competit, effect,...","1978, 1997, 2001, 2005, 2007-2008, 2010-2020",27,0.908174,1.011745,0.706459,0.698258
7,8,"equiti, paper, provid, invest, stock, privat, ...","1977, 1981, 1984, 1986, 1989-1990, 1992, 1995-...",129,4.339051,0.595327,0.544959,0.915395
8,9,"model, institut, banker, auction, firm, market...","1986-1987, 1989-1990, 1993, 1995-1999, 2001-20...",57,1.917255,0.595735,0.493749,0.828806
9,10,"paper, increas, volatil, find, effect, share, ...","1974-1977, 1979-1981, 1983-1987, 1989, 1991-19...",104,3.49815,0.047401,0.057251,1.207786


In [40]:
# Preview the predictions table returned by cal_cluster_bias; it carries the
# input columns plus a "Topic Id" column.
predictions.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned,Topic Id
0,2020,1.243352,169.971142,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capit structur', 'corpor taxat', 'difference...",5,3,5
1,2020,1.243352,169.971142,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spread', 'lbo risk', 'structur model'...",4,3,4
2,2020,1.243352,169.971142,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sale', 'liquid manag', 'mutual fund']",3,3,8
3,2020,1.243352,169.971142,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset price', 'leverag constraint', 'lotteri...",5,3,2
4,2020,1.243352,169.971142,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3,1


In [41]:
# Preview the output of get_clusters_dist for the predictions table
# (columns: Year, Topic Id, Number of Articles).
get_clusters_dist(predictions).head()

Unnamed: 0,Year,Topic Id,Number of Articles
0,1974,2,5
1,1974,4,5
2,1974,10,1
3,1974,11,1
4,1974,12,1


In [None]:
# save prediction
# Write into the project's data folder on Drive, like every other path in
# this notebook. The previous relative 'data/' target depended on the kernel's
# working directory, and to_csv fails when that folder does not exist.
summary.to_csv('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/summary_bias_0.57.csv', index=False)
predictions.to_csv('/content/drive/MyDrive/Trending-Topics-Dashboard-main/data/predictions_0.57.csv', index=False)