In [145]:
import re
import nltk
import pyLDAvis
import warnings
warnings.filterwarnings('ignore')
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer 
from gensim.corpora import Dictionary
from gensim import models
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
from gensim.models import LdaModel,LdaMulticore
from functools import reduce
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

###  The data

In [146]:
df1 = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/data_error_list.csv', index_col = 'Unnamed: 0')
df2 = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/data50_new.csv', index_col = 'Unnamed: 0')
df3 = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/data75.csv', index_col = 'Unnamed: 0')
df4 = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/data100.csv', index_col = 'Unnamed: 0')
df5 = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/data25.csv', index_col = 'Unnamed: 0')

In [147]:
sp_df = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Data/sp100.csv', index_col = 'Unnamed: 0')

In [148]:
df = pd.concat([df1, df2, df3, df4, df5, sp_df], axis = 0)

In [149]:
#df = df1

In [150]:
df.shape

(26491, 105)

In [151]:
df.drop(['market_return', 'market_3_days_previous', 'market_5_days_previous'], axis = 1, inplace = True)

In [152]:
df.ticker.nunique()

160

In [153]:
df = df.reset_index().set_index(['index', 'ticker'])

### Dictionaries

In [154]:
oliveira = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Dictionaries/stock_lex_Oliveira.csv')
lm = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Dictionaries/LM/LM-SA-2020.csv')
sentic = pd.read_csv('/Users/polyanaboss/Desktop/Term paper/Dictionaries/senticnet/senticnet.csv', sep = ';')

In [155]:
df.fillna('nan', inplace = True)
df = df.replace('nan', np.NaN).dropna(how = 'all', axis = 0)
df.fillna('nan', inplace = True)

In [156]:
#getting only textual data
news = df[[col for col in df.columns if col.startswith('news')]].values.tolist()

In [157]:
df.index.get_level_values(1)

Index(['PEP', 'SBUX', 'PCAR', 'PEP', 'REGN', 'SWKS', 'SBUX', 'SNPS', 'TMUS',
       'WBA',
       ...
       'VZ', 'DIS', 'F', 'IBM', 'MA', 'MRK', 'PFE', 'T', 'UPS', 'WFC'],
      dtype='object', name='ticker', length=26491)

# Preprocessing

### Cleaning from NAN values

In [158]:
def remove_nan(news_list):
    news_no_na = []
    for i in news_list:
        i = [el for el in i if el != 'nan']
        news_no_na.append(i)
        
    return news_no_na

In [159]:
news_no_na = remove_nan(news)

### Cleaning the data

In [160]:
def clean_news(news_list):
    news_cleaned = []
    for news in news_list:
        news_per_day = []
        for text in news:
            text = re.sub('[^a-zA-Z0-9]+\s*', ' ', text) #not a number or a letter
            text = text.lower() #lowercase
            news_per_day.append(text)
        
        news_cleaned.append(news_per_day)
    
    return news_cleaned

In [161]:
news_cleaned = clean_news(news_no_na)

### Lemmatization

In [162]:
lemmatizer = WordNetLemmatizer()

In [163]:
def lemmatize_news(news_list):
    news_lemmatized = []
    for news in news_list:
        news_per_day = []
        for text in news:
            text = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text)]
            text = [''.join(lemma) for lemma in text]
            text = ' '.join(text)
            news_per_day.append(text)

        news_lemmatized.append(news_per_day)
        
    return news_lemmatized

In [164]:
news_lemmatized = lemmatize_news(news_cleaned)

### Stopwords

In [165]:
stopwords = stopwords.words('english')

In [166]:
def remove_stopwords(news_list):
    news_cleaned = []
    for news in news_list:
        news_per_day = []
        for text in news:
            text = [word for word in text.split(' ') if word not in stopwords]
            text = ' '.join(text)
            news_per_day.append(text)
        
        news_cleaned.append(news_per_day)
    
    return news_cleaned

In [167]:
news_cleaned = remove_stopwords(news_lemmatized)

### Dictionaries preprocessing and cleaning

#### Senticnet

In [168]:
sentic = sentic[['CONCEPT', 'POLARITY VALUE', 'POLARITY INTENSITY']]
sentic.columns = ['CONCEPT', 'POLARITY_VALUE', 'POLARITY_INTENSITY']
sentic.columns = [head.lower() for head in sentic.columns]
sentic.head()

Unnamed: 0,concept,polarity_value,polarity_intensity
0,abandon,negative,-0.391
1,abandon_theater,negative,-0.823
2,abandoned,negative,-0.458
3,abandoned_airstrip,negative,-0.771
4,abandoned_farmland,negative,-0.201


In [169]:
def split_sentic(concept):
    try:
        concept = re.sub('_', ' ', concept)
    except:
        concept = concept
    return concept

In [170]:
sentic.concept = sentic.concept.apply(lambda concept: split_sentic(concept))

In [171]:
sentic.rename(columns = {'concept': 'word'}, inplace = True)

#### Loughran-McDonald

In [172]:
lm.word = lm.word.apply(lambda concept: split_sentic(concept))

In [173]:
lm = lm[(lm['sentiment'] == 'Negative') | (lm['sentiment'] == 'Positive')]

In [174]:
lm['binary_score'] = lm['sentiment'].apply(lambda sent: 1 if sent == 'Positive' else -1)

In [175]:
lm = pd.DataFrame(lm.groupby('word')['binary_score'].mean()).sort_values('word').reset_index()

#### Oliveira

In [176]:
oliveira = oliveira[['Item', 'Aff_Score']]
oliveira.columns = ['word', 'score']

In [177]:
oliveira.drop_duplicates('word', inplace = True)

### Long format

In [178]:
tickers = df.reset_index()['ticker']

In [179]:
day_index = []
news_index = []
word_index = []
corpus = []
tickers_index = []
dates = df.reset_index()['index']
for i, day in enumerate(news_cleaned): #news_cleaned
    for j, news in enumerate(day):
        for k, text in enumerate(news.split(' ')):
            day_index.append(dates[i])
            tickers_index.append(tickers[i])
            news_index.append(j)
            word_index.append(k)
            corpus.append(news.split(' ')[k])

In [180]:
tuples = list(zip(day_index, tickers_index, news_index, word_index))

In [181]:
multindex = pd.MultiIndex.from_tuples(tuples, names = ["day", "ticker", "news", "word_count"])

In [182]:
long_news = pd.DataFrame({'word': corpus}, index = multindex)

In [183]:
long_news.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,word
day,ticker,news,word_count,Unnamed: 4_level_1
2021-03-08,PEP,0,0,mar
2021-03-08,PEP,0,1,est
2021-03-08,PEP,0,2,ce
2021-03-08,PEP,0,3,un
2021-03-08,PEP,0,4,bon


### Loughran-McDonald sentiment

In [184]:
def get_sentiment_lm(long_news):
    lm_sentiment = pd.merge(long_news, lm, how = 'left').set_index(long_news.index)
    lm_sentiment = pd.DataFrame(lm_sentiment.groupby(['day', 'ticker', 'news'])['binary_score'].mean())
    lm_sentiment = lm_sentiment.reset_index().pivot(index = ['day', 'ticker'], 
                                                values = 'binary_score', columns = 'news')
    lm_sentiment.columns = [f'sentiment_{index}' for index in lm_sentiment.columns]
    
    return lm_sentiment

In [185]:
lm_sentiment = get_sentiment_lm(long_news)

### Oliveira Sentiment

In [186]:
def get_sentiment_oliveira(long_news):
    ol_sentiment = pd.merge(long_news, oliveira, how = 'left').set_index(long_news.index)
    ol_sentiment = pd.DataFrame(ol_sentiment.groupby(['day', 'ticker', 'news'])['score'].mean())
    ol_sentiment = ol_sentiment.reset_index().pivot(index = ['day', 'ticker'], 
                                                values = 'score', columns = 'news')
    ol_sentiment.columns = [f'sentiment_{index}' for index in ol_sentiment.columns]
    
    return ol_sentiment

In [187]:
ol_sentiment = get_sentiment_oliveira(long_news)

### Sentic Sentiment

In [188]:
def get_sentiment_sentic(long_news):
    sen_sentiment = pd.merge(long_news, sentic, 
                         how = 'left')[['word', 'polarity_intensity']].set_index(long_news.index)
    sen_sentiment = pd.DataFrame(sen_sentiment.groupby(['day', 'ticker', 'news'])['polarity_intensity'].mean())
    sen_sentiment = sen_sentiment.reset_index().pivot(index = ['day', 'ticker'], 
                                                values = 'polarity_intensity', columns = 'news')
    sen_sentiment.columns = [f'sentiment_{index}' for index in sen_sentiment.columns]
    
    return sen_sentiment

In [189]:
sen_sentiment = get_sentiment_sentic(long_news)

In [190]:
def check_shapes():
    if sen_sentiment.shape == ol_sentiment.shape and sen_sentiment.shape == lm_sentiment.shape:
        return True
    else:
        return False
    
check_shapes()

True

### Imputing 

In [191]:
def imputer(sentiment_table, min_news = 20, max_news = 60):
    sentiment_table = sentiment_table.dropna(axis = 0, thresh = min_news).iloc[:, : max_news]
    impute_values = sentiment_table.mean(axis = 1)
    for i in range(len(sentiment_table)):
        sentiment_table.iloc[i, :].fillna(impute_values[i], inplace = True)
        
    return sentiment_table

### Scaling sentiments

In [192]:
def scale_sentiment(sentiment_df):
    scaler = MinMaxScaler(feature_range = [0, 1])
    scaled = pd.DataFrame(scaler.fit_transform(imputer(sentiment_df)),
                          columns = imputer(sentiment_df).columns,
                          index = imputer(sentiment_df).index)
    
    return scaled, scaler

In [286]:
ol_scaled = imputer(ol_sentiment, min_news = 5, max_news = 40)
ol_scaled, scaler_ol = scale_sentiment(ol_scaled)

lm_scaled = imputer(lm_sentiment, min_news = 5, max_news = 40)
lm_scaled, scaler_lm = scale_sentiment(lm_scaled)

sen_scaled = imputer(sen_sentiment, min_news = 5, max_news = 40)
sen_scaled, scaler_sen = scale_sentiment(sen_scaled)

### Combining sentiments

In [287]:
def intersection_index(arr1, arr2, arr3):
    # Converting the arrays into sets
    s1 = set(arr1)
    s2 = set(arr2)
    s3 = set(arr3)
      
    # Calculates intersection of 
    # sets on s1 and s2
    set1 = s1.intersection(s2)
      
    # Calculates intersection of sets
    # on set1 and s3
    result_set = set1.intersection(s3)
      
    # Converts resulting set to list
    final_list = list(result_set)
    return final_list

In [290]:
def combine_sentiments(sentiment1, sentiment2, sentiment3):
    #common indices
    indices = intersection_index(ol_scaled.index, sen_scaled.index, lm_scaled.index)
    indices = pd.MultiIndex.from_tuples(indices, names = ["day", "ticker"])
    #empty frame
    final_sentiment = pd.DataFrame(index = indices, columns = ol_scaled.columns)
    
    #fulfilling the frame
    final_sentiment = pd.DataFrame(index = indices, columns = ol_scaled.columns)
    for index in indices:
        for column in ol_scaled.columns:
            sent_value = 0.4 * ol_scaled.loc[index, column] +  0.2 * lm_scaled.loc[index, column] + 0.4 * sen_scaled.loc[index, column]
            
            final_sentiment.loc[index, column] = sent_value
            
    return final_sentiment


In [291]:
final_sentiment = combine_sentiments(ol_scaled, lm_scaled, sen_scaled)

In [293]:
final_sentiment.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment_0,sentiment_1,sentiment_2,sentiment_3,sentiment_4,sentiment_5,sentiment_6,sentiment_7,sentiment_8,sentiment_9,...,sentiment_30,sentiment_31,sentiment_32,sentiment_33,sentiment_34,sentiment_35,sentiment_36,sentiment_37,sentiment_38,sentiment_39
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021-09-24,LMT,0.340515,0.291727,0.582527,0.589531,0.707774,0.694412,0.698201,0.583492,0.583731,0.534434,...,0.556586,0.542227,0.620985,0.577153,0.574217,0.608536,0.563171,0.56963,0.55342,0.586994
2022-01-05,MRNA,0.377616,0.43792,0.573703,0.581826,0.53217,0.460641,0.716266,0.602061,0.542229,0.68658,...,0.533275,0.515325,0.596521,0.549575,0.542081,0.580673,0.539469,0.537797,0.524757,0.557752
2021-11-30,CSCO,0.523501,0.56505,0.387739,0.546809,0.6504,0.609743,0.619085,0.57864,0.575597,0.52947,...,0.545689,0.520244,0.604682,0.550966,0.538651,0.581818,0.549925,0.533894,0.524763,0.55481
2021-06-22,TGT,0.440419,0.552679,0.530801,0.313919,0.601868,0.470906,0.482044,0.507302,0.502833,0.457532,...,0.469926,0.442862,0.524441,0.46874,0.464423,0.500213,0.47154,0.457437,0.443932,0.46999
2021-06-11,TSLA,0.647576,0.491208,0.465238,0.466653,0.587661,0.497265,0.506391,0.696608,0.532438,0.412798,...,0.50067,0.476108,0.558808,0.505691,0.497388,0.536829,0.504299,0.491876,0.480297,0.509414


In [292]:
final_sentiment.shape

(4477, 40)

# Topic Modelling

In [199]:
day_index = []
news_index = []
corpus = []
tickers_index = []
for i, day in enumerate(news_cleaned): #news_cleaned
    for j, news in enumerate(day):
            day_index.append(dates[i])
            tickers_index.append(tickers[i])
            news_index.append(j)
            corpus.append(news)



tuples = list(zip(day_index, tickers_index, news_index))

multindex = pd.MultiIndex.from_tuples(tuples, names = ["day", "ticker", "news"])

news_cleaned_df = pd.DataFrame({'text': corpus}, index = multindex)

In [200]:
news_cleaned_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
day,ticker,news,Unnamed: 3_level_1
2021-03-08,PEP,0,mar est ce un bon mois pour acheter de action ...
2021-03-08,PEP,1,e marzo un buen para comprar acciones de pepsi...
2021-03-08,PEP,2,marzo un buon mese per acquistare azioni pepsi...
2021-03-08,PEP,3,maart een goede maand om pepsico aandelen te k...
2021-03-09,PCAR,0,paccar inc stock fall monday underperforms market


In [201]:
news_cleaned_df.shape

(103814, 1)

In [202]:
texts = news_cleaned_df.text.apply(lambda text: text.split(' '))
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
dictionary.filter_extremes(
    no_below = 3,
    no_above = 0.85
)


In [203]:
class TopicModelling():
    def __init__(self):
        pass
    
    def get_texts(self):
        texts = news_cleaned_df.text.apply(lambda text: text.split(' '))
        return texts
    
    #dictionary
    def get_dictionary(self):
        texts = self.get_texts()
        dictionary = Dictionary(texts)
        #filtering dict
        dictionary.filter_extremes(
        no_below = 3,
        no_above = 0.85
        )
        return dictionary
    
    #corpus from dictionary
    def get_corpus(self):
        corpus = [self.get_dictionary().doc2bow(text) for text in texts]
        return corpus
    
    #NMF
    def NMF(self, min_n = 2, max_n = 30):
        topics = range(min_n, max_n + 1, 2)
        scores_nmf = {}
        for i in topics:
            nmf = Nmf(corpus, i, chunksize = 100)
            coherence = CoherenceModel(model = nmf, texts = self.get_texts(), 
                                         dictionary = self.get_dictionary(), coherence='c_v').get_coherence()
            scores_nmf[i] = coherence
        return scores_nmf
    
    #LDA
    def LDA(self, min_n = 2, max_n = 30):
        topics = range(min_n, max_n + 1, 2)
        scores_lda = {}
        for i in topics:
            nmf = LdaMulticore(corpus, i, chunksize = 100)
            coherence = CoherenceModel(model = nmf, texts = self.get_texts(), 
                                         dictionary = self.get_dictionary(), coherence='c_v').get_coherence()
            scores_lda[i] = coherence
        return scores_lda
    
    #LSI
    def LSI(self, min_n = 2, max_n = 30):
        topics = range(min_n, max_n + 1, 2)
        scores_lsi = {}
        for i in topics:
            lsi = models.lsimodel.LsiModel(corpus, i, chunksize = 100)
            coherence = CoherenceModel(model = lsi, texts = self.get_texts(), 
                                         dictionary = self.get_dictionary(), coherence='c_v').get_coherence()
            scores_lsi[i] = coherence
        return scores_lsi
    
    

### Models

In [204]:
nmf = Nmf(corpus, num_topics = 10)

In [None]:
#hdp_model = models.hdpmodel.HdpModel(corpus, id2word = dictionary, T = 20)

In [None]:
CoherenceModel(model = hdp_model, texts = texts, 
                                         dictionary = common_dictionary, coherence='c_v').get_coherence()

In [205]:
CoherenceModel(model = nmf, texts = texts, 
                                         dictionary = dictionary, coherence='c_v').get_coherence()

0.6621356725724845

In [262]:
top_words_tuples = nmf.print_topics(num_words=5)
top_words_dict = {}
for tup in top_words_tuples:
    word_indices = [int(n) for n in re.findall('"([^"]*)"', tup[1])]
    words = [dictionary[n] for n in word_indices]
    word_weights = [float(n) for n in re.findall("\d+\.\d+", tup[1])]
    top_words_dict[tup[0]] = list(map(list, list(zip(words, word_weights))))
    


In [254]:
pd.DataFrame(top_words_dict, index=[f'top_{i}' for i in range(5)]).T

Unnamed: 0,top_0,top_1,top_2,top_3,top_4
0,"[skyworks, 0.061]","[rejected, 0.049]","[june, 0.042]","[17, 0.04]","[recovery, 0.029]"
1,"[agrees, 0.038]","[regn, 0.028]","[anticipate, 0.028]","[200, 0.026]","[already, 0.025]"
2,"[still, 0.028]","[walgreens, 0.017]","[usage, 0.016]","[perfect, 0.013]","[alliance, 0.013]"
3,"[covid, 0.28]","[oncology, 0.015]","[111, 0.013]","[thru, 0.011]","[daily, 0.01]"
4,"[ad, 0.162]","[successful, 0.028]","[nda, 0.027]","[biggest, 0.027]","[united, 0.025]"
5,"[targeting, 0.194]","[pinduoduo, 0.017]","[recovery, 0.017]","[skyworks, 0.017]","[rejected, 0.016]"
6,"[research, 0.076]","[ceo, 0.039]","[move, 0.038]","[financials, 0.027]","[scheme, 0.024]"
7,"[gilead, 0.1]","[role, 0.098]","[inching, 0.055]","[leading, 0.016]","[post, 0.014]"
8,"[competitor, 0.155]","[oncology, 0.04]","[daily, 0.016]","[cheaper, 0.015]","[xcel, 0.015]"
9,"[covid, 0.1]","[trade, 0.072]","[royal, 0.056]","[set, 0.037]","[synopsys, 0.028]"


In [210]:
def distribute_topics(model, news_cleaned_df = news_cleaned_df):
    nmf_topics = [model[corpus[i]] for i in range(len(news_cleaned_df))]
    news_cleaned_df['nmf_topics'] = nmf_topics
    news_cleaned_df.topic = news_cleaned_df.nmf_topics.apply(lambda score: 
                                 sorted(score, key = lambda x:x[1], reverse = True)[0][0])

    news_cleaned_df.nmf_topics = news_cleaned_df.groupby(['day', 'ticker']).agg({'nmf_topics': 
                                                    lambda x: x})

    news_cleaned_df.nmf_topics = news_cleaned_df.nmf_topics.apply(lambda topics:reduce(lambda x, y: x + y, topics))
    
    
    topics_sorted = []
    for i in range(len(news_cleaned_df)):
        length = len(news_cleaned_df.nmf_topics[i])
        if type(news_cleaned_df.nmf_topics[i]) != list:
            topics = []
            probas = []
            for i, value in enumerate(news_cleaned_df.nmf_topics[i]):
                if type(value) == int: 
                    topics.append(value)
                else:
                    probas.append(value)   
            news_cleaned_df.nmf_topics[i] = list(map(tuple, list(zip(topics, probas))))
        if length >= 5:
            n = sorted(news_cleaned_df.nmf_topics[i], key = lambda x:x[1], reverse = True)[:5]
        elif length > 2:
            n = sorted(news_cleaned_df.nmf_topics[i], key = lambda x:x[1], reverse = True)
        else:
            n = news_cleaned_df.nmf_topics[i]

        topics_sorted.append(n)
        
        
    topics_distribution = []
    for news in topics_sorted:
        topics_distr = [topic[0] for topic in news]
        topics_distribution.append(topics_distr)
        
    news_cleaned_df['topics'] = topics_distribution

    news_cleaned_df = news_cleaned_df.groupby(['day', 'ticker']).agg({'topics': lambda x: x[0]})

    for i in range(len(news_cleaned_df)):
        length =  len(news_cleaned_df.topics[i])
        topics = news_cleaned_df.topics[i]
        if length != 5:
            news_cleaned_df.topics[i] = topics + [-1] * (5 - length)

    for i in range(5):
        news_cleaned_df[f'topic_{i}'] = news_cleaned_df['topics'].apply(lambda x: x[i])
        
    news_cleaned_df.drop('topics', axis = 1, inplace = True)
    
    return news_cleaned_df
    

In [211]:
topics = distribute_topics(nmf)

In [212]:
topics

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_0,topic_1,topic_2,topic_3,topic_4
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-04,AAPL,2,2,2,2,2
2021-03-04,AMGN,2,9,2,0,7
2021-03-04,AMZN,2,2,2,2,2
2021-03-04,ANSS,2,3,6,4,7
2021-03-04,BIIB,2,2,1,0,7
...,...,...,...,...,...,...
2022-03-01,MRK,0,6,0,9,6
2022-03-01,PFE,0,0,0,0,0
2022-03-01,T,2,8,6,0,2
2022-03-01,UPS,6,2,3,0,1


### Number of topics

In [266]:
TopicModelling().NMF(2, 10)

{2: 0.690641113620712,
 4: 0.6591180613699776,
 6: 0.6468224956128372,
 8: 0.6730862973189569,
 10: 0.6651128624216931}

In [284]:
for i in range(2, 11,2):
    lsi = models.lsimodel.LsiModel(corpus,num_topics=i)
    score = CoherenceModel(model = lsi, texts = texts, 
                                         dictionary = dictionary, coherence='c_v').get_coherence()
    
    print(score)

0.7200340789289004
0.6766447390767569
0.67859995420348
0.6979219690809666
0.691441548278682


### Sentiments + Topics + Financial Indicators Concatenation

In [213]:
financials = df[[col for col in df.columns if 'news' not in col]]
financials = financials.reset_index().rename(columns = {'index' : 'day'}).set_index(['day', 'ticker'])
financials.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,adjclose,ma_10,ma_20,ma_30,ema_12,ema_26,rsi_6_sma,rsi_6_ema,rsi_12_sma,rsi_24_sma,target,trend,target_3_days_previous,target_5_days_previous
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-03-08,PEP,129.404297,127.362002,129.955677,131.571941,128.28194,130.476961,71.375403,44.685097,48.945097,45.900743,0,14,0.333333,0.2
2021-03-08,SBUX,103.238434,103.153905,103.26861,101.790343,103.37852,102.63326,59.97692,61.102845,52.272718,64.247023,0,13,0.0,0.0
2021-03-09,PCAR,92.200859,90.472639,92.207153,92.008013,90.990975,90.996424,62.827155,52.03783,45.591027,54.200355,0,46,0.0,0.0
2021-03-09,PEP,129.521805,127.478115,129.577964,131.396891,128.454613,130.397044,70.658179,53.309848,43.732052,43.823236,0,13,0.333333,0.2
2021-03-09,REGN,470.630005,456.917001,469.624998,483.584331,461.829836,473.385318,63.941681,47.491796,42.011082,33.531447,1,1,1.0,0.6


In [214]:
final_sentiment.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment_0,sentiment_1,sentiment_2,sentiment_3,sentiment_4,sentiment_5,sentiment_6,sentiment_7,sentiment_8,sentiment_9,...,sentiment_50,sentiment_51,sentiment_52,sentiment_53,sentiment_54,sentiment_55,sentiment_56,sentiment_57,sentiment_58,sentiment_59
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021-09-21,AMZN,0.544879,0.257498,0.408812,0.368385,0.383385,0.416132,0.505084,0.382462,0.351178,0.252757,...,0.432567,0.39017,0.42252,0.399318,0.431274,0.413506,0.360645,0.371753,0.394891,0.384053
2021-09-02,MRNA,0.11725,0.544065,0.593325,0.342374,0.353894,0.364903,0.458443,0.279531,0.437768,0.80797,...,0.394522,0.338536,0.41396,0.376794,0.403526,0.393114,0.323481,0.330641,0.343882,0.351716
2021-08-23,JD,0.257559,0.432068,0.594154,0.38258,0.363372,0.735607,0.431549,0.390586,0.427566,0.317445,...,0.439785,0.390341,0.434928,0.409699,0.439884,0.426104,0.371125,0.373449,0.404135,0.399308
2021-11-16,TSLA,0.259288,0.489863,0.543517,0.36388,0.386552,0.575429,0.386898,0.479094,0.464529,0.43733,...,0.434836,0.385796,0.428566,0.40403,0.434419,0.42043,0.3663,0.368383,0.400037,0.391843
2021-06-11,TSLA,0.623975,0.498603,0.466236,0.469523,0.636301,0.523902,0.564498,0.687757,0.525219,0.421298,...,0.535316,0.4889,0.530391,0.504917,0.535662,0.520131,0.464654,0.472194,0.496923,0.526663


In [215]:
topics.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_0,topic_1,topic_2,topic_3,topic_4
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-04,AAPL,2,2,2,2,2
2021-03-04,AMGN,2,9,2,0,7
2021-03-04,AMZN,2,2,2,2,2
2021-03-04,ANSS,2,3,6,4,7
2021-03-04,BIIB,2,2,1,0,7


In [294]:
financials.shape

(26491, 14)

In [295]:
final_sentiment.shape

(4477, 40)

In [296]:
topics.shape

(26334, 5)

In [297]:
final_df = pd.concat([financials, final_sentiment, topics], axis = 1)

In [298]:
final_df.drop(columns = [column for column in final_df.columns if column.startswith("news")], 
             axis = 1, inplace = True)
final_df.dropna(how = 'any', inplace = True)

In [304]:
#int topic
final_df[[col for col in final_df.columns if 'topic' in col]] = final_df[[col for col in 
                                                                          final_df.columns if 'topic' 
                                                                          in col]].apply(lambda col: col.astype('int'))

In [305]:
final_df.shape

(4477, 59)

In [306]:
final_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,adjclose,ma_10,ma_20,ma_30,ema_12,ema_26,rsi_6_sma,rsi_6_ema,rsi_12_sma,rsi_24_sma,...,sentiment_35,sentiment_36,sentiment_37,sentiment_38,sentiment_39,topic_0,topic_1,topic_2,topic_3,topic_4
day,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021-03-04,AAPL,119.420219,124.660082,129.457326,131.775941,125.732602,128.423225,39.080453,33.325382,25.560048,29.139135,...,0.467013,0.433376,0.424253,0.410886,0.439668,2,2,2,2,2
2021-03-04,AMZN,2977.570068,3150.865967,3226.814465,3247.276636,3148.52956,3196.76747,24.214878,26.776597,27.150401,38.840834,...,0.518054,0.489071,0.472463,0.461305,0.488746,2,2,2,2,2
2021-03-04,NFLX,511.290009,542.648999,548.110004,549.791003,543.116656,543.017493,29.976383,44.271572,32.122175,49.305999,...,0.532548,0.510936,0.481093,0.474581,0.500493,2,7,6,2,2
2021-03-05,AMGN,220.64682,220.581917,224.16998,228.93396,220.952093,224.423269,17.207729,21.321361,25.914517,24.227511,...,0.560588,0.51162,0.527688,0.506599,0.539568,2,2,8,3,8
2021-03-05,AMZN,3000.459961,3115.799976,3210.066467,3237.749642,3122.227793,3180.443527,24.72216,26.121193,26.326743,37.522669,...,0.454547,0.393381,0.453038,0.44455,0.472122,2,2,2,2,2


In [307]:
final_df.to_csv('/Users/polyanaboss/Desktop/Term paper/Data/processed_data_v.1.0.csv')