# Generating the original factor data (under the delayed-reporting assumption) in the format required by Alphalens for analysis

In [1]:
import pickle
from itertools import product
import pandas as pd

In [2]:
# Load the pickled ticker collection that drives every later cell.
# NOTE(review): pickle executes arbitrary code on load; only use a file you created yourself.
with open('nifty_next_tickers.pickle', mode='rb') as ticker_file:
    tickers = pickle.load(ticker_file)

In [3]:
# Price table: the first CSV column becomes the index, which is then parsed
# into timezone-aware (UTC) timestamps.
prices = pd.read_csv('concatenated_prices.csv', index_col=0)
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# now the default behaviour), so it is no longer passed.
prices.index = pd.to_datetime(prices.index, utc=True)

In [4]:
# Configuration shared by the factor-generation cells below.

# Index value found in a ticker's sheet -> date string that replaces it when
# the factor table is built (see generate_factor_df).
# NOTE(review): presumably the date each year's report is assumed to become
# available under the delayed-reporting assumption -- confirm.
year_dict = {
    2011: '2011-06-15',
    2012: '2012-06-15',
    2013: '2013-06-16',
    2014: '2014-06-16',
    2015: '2015-06-15',
    2016: '2016-06-15',
    2017: '2017-06-15',
    2018: '2018-06-15',
    2019: '2019-06-17',
    2020: '2020-01-01',
}

# Column names of the factor table, in output order.
sentiments = [
    'Negative',
    'Positive',
    'Litigious',
    'Uncertainty',
    'StrongModal',
    'WeakModal',
    'Constraining',
]

# Excel workbook that is read one sheet per ticker.
factor_path = 'Sentiment_analysis.xlsx'

# Daily UTC calendar onto which each ticker's values are forward-filled.
idx = pd.date_range('2011-03-01', '2020-01-31', tz='utc')

In [5]:
# forward filling empty cells with the year's sentiment values until new reports are analyzed
def ffill_sentiments(df, date_index=None):
    """Expand each asset's rows onto a date range, forward-filling values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a two-level MultiIndex whose second level is named
        ``'Asset'``; the other level holds the observation dates.
    date_index : pandas.DatetimeIndex, optional
        Dates every asset is re-indexed onto.  Defaults to the module-level
        ``idx`` so existing single-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``('Date', 'Asset')`` with one row per asset and
        date in ``date_index``.  Each row carries the most recent earlier
        observation of that asset; dates before an asset's first observation
        stay NaN.
    """
    if date_index is None:
        date_index = idx

    def _expand_one_asset(asset_rows):
        # Drop the constant 'Asset' level so the frame is indexed by date only.
        by_date = asset_rows.reset_index(level='Asset', drop=True)
        # reindex(method='ffill') needs the asset's dates to be monotonic.
        filled = by_date.reindex(date_index, method='ffill')
        # The result takes the target index's name, so name it explicitly.
        return filled.rename_axis('Date')

    # apply() prepends the group key, giving an (Asset, Date) index.
    per_asset = df.groupby(level='Asset').apply(_expand_one_asset)
    # Swap the two levels to obtain (Date, Asset).
    return per_asset.swaplevel()

In [6]:
# creating factor data 
def generate_factor_df(tickers_list, prices_df, factor_path):
    """Assemble the sentiment factor table indexed by ``('Date', 'Asset')``.

    An all-NaN scaffold is created with one row per (price date, ticker) and
    one column per entry of the module-level ``sentiments`` list.  For each
    ticker the sheet of the same name is read from the workbook, its index
    values are translated through the module-level ``year_dict`` and parsed
    as UTC timestamps, the rows are forward-filled with ``ffill_sentiments``
    and the result is merged into the scaffold.

    Parameters
    ----------
    tickers_list : list
        Ticker symbols; each one must be a sheet name in the workbook.
    prices_df : pandas.DataFrame
        Price table; only its index (the dates) is used.
    factor_path : str
        Path of the Excel workbook holding one sheet per ticker.

    Returns
    -------
    pandas.DataFrame
        Factor table indexed by ``('Date', 'Asset')``.  ``combine_first``
        keeps the union of both indexes, so dates produced by
        ``ffill_sentiments`` that are absent from ``prices_df`` also appear.
    """
    index_names = ['Date', 'Asset']

    # Empty scaffold: every (price date, ticker) pair, one column per sentiment.
    scaffold_index = pd.MultiIndex.from_product(
        [prices_df.index.tolist(), tickers_list], names=index_names
    )
    factor_df = pd.DataFrame(index=scaffold_index, columns=sentiments)

    # Open the workbook once instead of re-parsing the file for every ticker.
    with pd.ExcelFile(factor_path) as workbook:
        for ticker in tickers_list:
            stock_df = pd.read_excel(workbook, sheet_name=ticker, index_col=0)
            if len(stock_df) == 0:
                # No rows for this ticker: its scaffold rows stay NaN.
                continue

            # Index values missing from year_dict are passed to to_datetime
            # unchanged.
            # NOTE(review): a bare integer year would then be parsed as an
            # epoch offset -- confirm every sheet index value is in year_dict.
            report_dates = pd.to_datetime(
                [year_dict.get(year, year) for year in stock_df.index.tolist()],
                utc=True,
            )

            stock_index = pd.MultiIndex.from_product(
                [report_dates, [ticker]], names=index_names
            )
            stock_scores = pd.DataFrame(
                data=stock_df.values,
                index=stock_index,
                columns=stock_df.columns,
            )

            # Spread the ticker's values over the date range, then fill the
            # scaffold's NaN cells from them.
            factor_df = factor_df.combine_first(ffill_sentiments(stock_scores))

    return factor_df

# Build the factor table for every loaded ticker from the sentiment workbook.
factor_path = 'Sentiment_analysis.xlsx'
factor_df = generate_factor_df(
    tickers_list=tickers,
    prices_df=prices,
    factor_path=factor_path,
)

In [7]:
# Flip the sign of the sentiment columns that are treated as negative signals.
for negative_column in ('Negative', 'Uncertainty', 'WeakModal', 'Constraining'):
    factor_df[negative_column] = -factor_df[negative_column]

In [8]:
# Composite factors built from the per-category columns.
# Crude_Sentiment: sum of the Positive and Negative columns.
positive_plus_negative = factor_df['Positive'] + factor_df['Negative']
factor_df['Crude_Sentiment'] = positive_plus_negative

# Sentiment: the crude score plus the remaining categories, added in this
# order (Litigious is not part of the sum).
running_total = positive_plus_negative
for extra_column in ('Uncertainty', 'WeakModal', 'StrongModal', 'Constraining'):
    running_total = running_total + factor_df[extra_column]
factor_df['Sentiment'] = running_total

In [9]:
# Cast every column to float, then display the resulting dtypes as the cell output.
factor_df = factor_df.astype(float)
factor_df.dtypes

Negative           float64
Positive           float64
Litigious          float64
Uncertainty        float64
StrongModal        float64
WeakModal          float64
Constraining       float64
Crude_Sentiment    float64
Sentiment          float64
dtype: object

In [24]:
# Write the finished factor table to disk.
factor_df.to_csv(path_or_buf='delayed_factor_data.csv')