In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('/Users/macbook/Documents/PhD_Documents/sentiment_neg_pos_effect/sentiments/results/sentiment_neg_pos.csv')

# Drop the unnamed column if it exists
data = data.drop('Unnamed: 0', axis=1)

# Convert the 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort the DataFrame by the 'Date' column
data = data.sort_values(by='Date')

# Display the sorted DataFrame
print(data)

          Date  mean_positive_sentiment_crude  mean_negative_sentiment_crude  \
0   2014-01-01                       0.973447                      -0.594065   
121 2014-01-02                       0.753283                      -0.814793   
242 2014-01-03                       0.740272                      -0.861891   
275 2014-01-04                       0.867800                      -0.872911   
286 2014-01-05                       0.686465                      -0.844081   
..         ...                            ...                            ...   
219 2024-01-27                       0.810014                      -0.655089   
230 2024-01-28                       0.705428                      -0.712812   
241 2024-01-29                       0.768802                      -0.736254   
263 2024-01-30                       0.597846                      -0.748356   
274 2024-01-31                       0.644918                      -0.755079   

     mean_positive_sentiment_textblob  

In [3]:
# Load data from CSV file
df = pd.read_csv('/Users/macbook/Documents/PhD_Documents/Second_paper/Brent_data/cb-5m.csv', delimiter=';', names=['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume'])

# Convert 'Date' and 'Time' columns to datetime format
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H:%M:%S')
df['Date'] = df['Datetime'].dt.date  # Keep only the date part

# Calculate returns (log returns)
df['Return'] = np.log(df['Close'] / df['Close'].shift(1))

# Scaling factor
scaling_factor = 100

# Calculate realized volatility (RV)
df['RV'] = (df['Return']**2) * scaling_factor
realized_volatility = df.groupby('Date')['RV'].sum().reset_index()
realized_volatility.columns = ['Date', 'RV']

# Calculate bipower variation (BVP)
df['Abs_Return'] = df['Return'].abs()
df['BVP'] = (df['Abs_Return'].shift(1) * df['Abs_Return']) * scaling_factor
bipower_variation = df.groupby('Date')['BVP'].sum().reset_index()
bipower_variation.columns = ['Date', 'BVP']

# Calculate negative and positive realized volatility (RV_neg, RV_pos)
df['RV_neg'] = np.where(df['Return'] < 0, df['Return']**2 * scaling_factor, 0)
df['RV_pos'] = np.where(df['Return'] > 0, df['Return']**2 * scaling_factor, 0)

neg_realized_volatility = df.groupby('Date')['RV_neg'].sum().reset_index()
neg_realized_volatility.columns = ['Date', 'RV_neg']

pos_realized_volatility = df.groupby('Date')['RV_pos'].sum().reset_index()
pos_realized_volatility.columns = ['Date', 'RV_pos']

# Calculate realized quadratic variation (RQ)
df['RQ'] = (df['Return']**4) * (scaling_factor**2)
realized_quadratic_variation = df.groupby('Date')['RQ'].sum().reset_index()
realized_quadratic_variation.columns = ['Date', 'RQ']

# Merge all the calculations into one DataFrame
result = pd.merge(realized_volatility, bipower_variation, on='Date')
result = pd.merge(result, neg_realized_volatility, on='Date')
result = pd.merge(result, pos_realized_volatility, on='Date')
result = pd.merge(result, realized_quadratic_variation, on='Date')

# Ensure the Date column is in the format y-m-d
result['Date'] = pd.to_datetime(result['Date']).dt.strftime('%Y-%m-%d')

# Print or save the result
print(result)

            Date        RV       BVP    RV_neg    RV_pos        RQ
0     2009-01-26  0.005191  0.003327  0.000677  0.004513  0.000010
1     2009-01-27  0.234837  0.145885  0.150415  0.084422  0.001693
2     2009-01-28  0.264549  0.163445  0.133203  0.131345  0.002055
3     2009-01-29  0.176646  0.111705  0.081444  0.095202  0.000632
4     2009-01-30  0.137579  0.082873  0.072544  0.065035  0.000505
...          ...       ...       ...       ...       ...       ...
4896  2024-10-24  0.029647  0.016122  0.017647  0.012000  0.000022
4897  2024-10-25  0.019963  0.010313  0.007364  0.012599  0.000021
4898  2024-10-27  0.252110  0.008413  0.246393  0.005717  0.059086
4899  2024-10-28  0.031861  0.020121  0.016979  0.014882  0.000021
4900  2024-10-29  0.026351  0.015339  0.014197  0.012155  0.000016

[4901 rows x 6 columns]


In [4]:
result['Date'] = pd.to_datetime(result['Date'])  # 'errors=coerce' will turn invalid parsing into NaT
data['Date'] = pd.to_datetime(data['Date'])

In [5]:
merged_df = pd.merge(data, result, on='Date', how='inner')
merged_df

Unnamed: 0,Date,mean_positive_sentiment_crude,mean_negative_sentiment_crude,mean_positive_sentiment_textblob,mean_negative_sentiment_textblob,mean_positive_sentiment_finbert,mean_negative_sentiment_finbert,mean_positive_sentiment_vader,mean_negative_sentiment_vader,RV,BVP,RV_neg,RV_pos,RQ
0,2014-01-01,0.973447,-0.594065,0.000000,-0.375000,0.000000,-0.487912,0.000000,-0.578400,0.000384,0.000167,0.000081,0.000303,2.217886e-08
1,2014-01-02,0.753283,-0.814793,0.014358,-0.435156,0.154781,-0.753325,0.082689,-0.497700,0.013107,0.007369,0.009998,0.003109,6.724529e-06
2,2014-01-03,0.740272,-0.861891,0.020514,-0.441167,0.145103,-0.794251,0.056030,-0.403758,0.007223,0.004348,0.004271,0.002952,1.059259e-06
3,2014-01-05,0.686465,-0.844081,0.034091,-0.700000,0.081360,-0.878195,0.000000,-0.446837,0.000858,0.000538,0.000447,0.000411,7.489021e-08
4,2014-01-06,0.691253,-0.752363,0.031435,-0.419083,0.157888,-0.790936,0.107577,-0.393614,0.005761,0.003246,0.003174,0.002588,8.342243e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3130,2024-01-26,0.694053,-0.756394,0.048078,-0.457264,0.113659,-0.794788,0.125216,-0.449918,0.025413,0.015731,0.011267,0.014146,2.097258e-05
3131,2024-01-28,0.705428,-0.712812,0.076291,-0.346561,0.073942,-0.710787,0.102755,-0.476406,0.021604,0.007506,0.005933,0.015671,1.669870e-04
3132,2024-01-29,0.768802,-0.736254,0.041382,-0.356612,0.110573,-0.821255,0.117357,-0.462254,0.032732,0.018419,0.020283,0.012448,2.846975e-05
3133,2024-01-30,0.597846,-0.748356,0.043063,-0.412404,0.064214,-0.672898,0.100153,-0.414560,0.025793,0.014901,0.011995,0.013798,1.249416e-05


In [6]:
merged_df.to_csv('sentiment_pos_neg_Brent.csv')