<h1>Regression</h1>

<h2>Import Libraries</h2>

In [1]:
import csv
import numpy as np
import pandas as pd
import re
import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

<h2>Load the Datasets</h2>

In [117]:
#Load the German sentiment CSV file.
df_de = pd.read_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/Sentiment_Analysis/Dataset_DE_Bert_Vader.csv", sep=";",
                parse_dates=["created_at"])

#Reduce the creation timestamp to a plain calendar date
df_de["Date"] = pd.to_datetime(df_de.created_at).apply(lambda x: x.date())

#Harmonise the BERT labels with the English data set (NEU/NEG/POS) and derive a numeric score from them
df_de["sentiment_bert"] = df_de["sentiment_bert"].replace(["neutral", "negative", "positive"], ["NEU", "NEG", "POS"])
df_de["sentiment_bert_value"] = df_de["sentiment_bert"].replace({"NEU": 0, "NEG": -1, "POS": 1})

#Discretise the VADER score: 1 for >= 0.05, -1 for <= -0.05, otherwise 0
df_de["sentiment_vader_value"] = [1 if i >= 0.05 else -1 if i <= -0.05 else 0 for i in df_de["sentiment_vader"]]

#ISO week, year and weekday of every GERMAN tweet.
#The table has to be built from df_de itself: using another frame's dates would attach
#weeks that belong to different rows (and fails when that frame is not loaded yet).
df_de_week_year = pd.DataFrame(
    [(iso.week, iso.year, iso.weekday) for iso in (day.isocalendar() for day in df_de["Date"])],
    columns=["Week", "Year", "Weekday"])

#Recalculate the week: tweets from Saturday (6) or Sunday (7) are moved to the previous week
#number; week 1 wraps around to 52. Both masks are computed before any write so that a row
#is shifted at most once. The assignment goes through .loc on the frame itself - writing via
#.iloc[row]["Week"] targets an intermediate row object and is not guaranteed to reach the frame.
#NOTE(review): "Year" is left unchanged when week 1 wraps to 52 - confirm this is intended.
de_is_weekend = df_de_week_year["Weekday"] > 5
de_is_first_week = df_de_week_year["Week"] == 1
df_de_week_year.loc[de_is_weekend & ~de_is_first_week, "Week"] -= 1
df_de_week_year.loc[de_is_weekend & de_is_first_week, "Week"] = 52

#Add the week and year information to the df
df_de[["Week", "Year"]] = df_de_week_year[["Week", "Year"]]

In [118]:
#Load the English sentiment CSV file.
df_en = pd.read_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/Sentiment_Analysis/Dataset_Bert_Vader.csv", sep=";",
                parse_dates=["created_at"])

#Reduce the creation timestamp to a plain calendar date
df_en["Date"] = pd.to_datetime(df_en.created_at).apply(lambda x: x.date())

#Numeric BERT score derived from the label (NEU -> 0, NEG -> -1, POS -> 1)
df_en["sentiment_bert_value"] = df_en["sentiment_bert"].replace({"NEU": 0, "NEG": -1, "POS": 1})

#Discretise the VADER score: 1 for >= 0.05, -1 for <= -0.05, otherwise 0
df_en["sentiment_vader_value"] = [1 if i >= 0.05 else -1 if i <= -0.05 else 0 for i in df_en["sentiment_vader"]]

#ISO week, year and weekday of every English tweet
df_en_week_year = pd.DataFrame(
    [(iso.week, iso.year, iso.weekday) for iso in (day.isocalendar() for day in df_en["Date"])],
    columns=["Week", "Year", "Weekday"])

#Recalculate the week: tweets from Saturday (6) or Sunday (7) are moved to the previous week
#number; week 1 wraps around to 52. Both masks are computed before any write so that a row
#is shifted at most once. The assignment goes through .loc on the frame itself - writing via
#.iloc[row]["Week"] targets an intermediate row object and is not guaranteed to reach the frame.
#NOTE(review): "Year" is left unchanged when week 1 wraps to 52 - confirm this is intended.
en_is_weekend = df_en_week_year["Weekday"] > 5
en_is_first_week = df_en_week_year["Week"] == 1
df_en_week_year.loc[en_is_weekend & ~en_is_first_week, "Week"] -= 1
df_en_week_year.loc[en_is_weekend & en_is_first_week, "Week"] = 52

#Add the week and year information to the df
df_en[["Week", "Year"]] = df_en_week_year[["Week", "Year"]]

In [119]:
#Stack the English and German tweets into one frame with a fresh 0..n-1 index
df = pd.concat([df_en, df_de], ignore_index=True)

In [120]:
#Map the BERT label of every tweet to its numeric score
bert_label_to_score = {"NEU": 0, "NEG": -1, "POS": 1}
df["sentiment_bert_value"] = df["sentiment_bert"].replace(bert_label_to_score)

#ISO week and year of every tweet date in the combined frame
#NOTE(review): this replaces the "Week"/"Year" columns that df_en and df_de brought along,
#so the weekend shift applied while loading them is not part of the combined frame - confirm.
df_week_year = pd.DataFrame(
    [(day.isocalendar().week, day.isocalendar().year) for day in df["Date"]],
    columns=["Week", "Year"],
)

#Write the week and year information back to the df
df[["Week", "Year"]] = df_week_year[["Week", "Year"]]

In [121]:
def _read_price_csv(csv_path):
    """Read a price CSV, parse its "Date" column and rename "Adj Close" to "Adj_Close"."""
    prices = pd.read_csv(csv_path, parse_dates=["Date"])
    return prices.rename(columns={"Adj Close": "Adj_Close"})


#Wirecard
df_wdi = _read_price_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/WDI.HM-2.csv")

#Euro Stoxx 50
df_stoxx = _read_price_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/^STOXX50E.csv")

#Visa
df_visa = _read_price_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/V.csv")

#Master Card
df_ma = _read_price_csv("/Users/tobias/Dev/FOM/Master_Thesis/data/MA.csv")

<h2>Data Preparation</h2>

In [122]:
#Reduce the "Date" column of every price table to plain calendar dates
for price_table in (df_wdi, df_stoxx, df_visa, df_ma):
    price_table["Date"] = pd.to_datetime(price_table.Date).apply(lambda stamp: stamp.date())

In [123]:
def create_finance_df(df):
    """Collapse a daily price table into one row per calendar week.

    For every run of consecutive rows that share an ISO week number, the last
    row of the run is kept (its date and adjusted close) together with the
    summed volume of the run.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "Date", "Adj_Close" and "Volume". The "Date" values
        must offer ``isocalendar()`` with ``week``/``year`` attributes (e.g.
        ``datetime.date``). Rows are read in their stored order; they are
        presumably sorted chronologically - verify against the caller.

    Returns
    -------
    pandas.DataFrame
        Columns "Date", "Week", "Year", "Adj_Close", "Volume", "returns_week".
        "returns_week" is the week-over-week ratio of "Adj_Close"
        (``pct_change() + 1``), so the first row is NaN.
    """
    columns = ["Date", "Week", "Year", "Adj_Close", "Volume"]

    #Read the columns positionally so the function does not depend on a 0..n-1 index
    dates = df["Date"].tolist()
    closes = df["Adj_Close"].tolist()
    volumes = df["Volume"].tolist()
    last_position = len(dates) - 1

    #Collect the weekly rows first and build the frame once (no concat per row)
    weekly_rows = []
    volume = 0
    for position, (each_date, adj_close, day_volume) in enumerate(zip(dates, closes, volumes)):
        #Accumulate the volume of all trading days of the current week
        volume = volume + day_volume
        iso = each_date.isocalendar()

        #A week is complete when the data ends or when the next row is in another week.
        #The end of the data is detected by position, not by a fixed calendar date,
        #so tables with any end date are handled without running past the last row.
        week_is_complete = (position == last_position
                            or dates[position + 1].isocalendar().week != iso.week)
        if week_is_complete:
            weekly_rows.append((each_date, iso.week, iso.year, adj_close, volume))
            #Set volume back to 0 for the next week
            volume = 0

    df_week = pd.DataFrame(weekly_rows, columns=columns)

    #Add the yield from one week to another (float cast keeps the empty case numeric)
    df_week["returns_week"] = df_week["Adj_Close"].astype("float64").pct_change() + 1

    #Return the new DF
    return df_week

In [124]:
def concat_finance_df(df_wdi = df_wdi, df_stoxx = df_stoxx, df_ma = df_ma, df_v = df_visa):
    """Build the weekly Wirecard table and append the weekly returns of the comparison series.

    Every input is aggregated with ``create_finance_df``. The Wirecard result is
    kept completely; from the other three only "returns_week" is taken and
    appended as "returns_stoxx", "returns_ma" and "returns_visa".

    The columns are joined side by side on the row index (``axis=1``).
    NOTE(review): this presumes all four weekly tables list the same weeks in
    the same order - confirm against the input data.
    """
    #Weekly Wirecard table is the base of the result
    df_week = create_finance_df(df_wdi)

    #Target column name -> daily table it is derived from (order = column order)
    comparison_series = (
        ("returns_stoxx", df_stoxx),
        ("returns_ma", df_ma),
        ("returns_visa", df_v),
    )
    for column_name, daily_prices in comparison_series:
        weekly_returns = create_finance_df(daily_prices)["returns_week"].rename(column_name)
        df_week = pd.concat([df_week, weekly_returns], axis=1)

    return df_week

In [125]:
def create_final_df(df_week, df, meta_data = False):
    """Attach weekly tweet sentiment totals and tweet counts to a weekly table.

    Parameters
    ----------
    df_week : pandas.DataFrame
        Weekly table with "Year" and "Week" columns.
    df : pandas.DataFrame
        One row per tweet with the columns "Week", "Year",
        "sentiment_bert_value", "sentiment_vader_value" and "conversation_id"
        (plus "retweet_count" when ``meta_data`` is true). It is not modified.
    meta_data : bool, default False
        If true, every sentiment value is multiplied by ``retweet_count + 1``
        before it is summed.

    Returns
    -------
    pandas.DataFrame
        ``df_week`` inner-joined on ("Year", "Week") with the per-week sums
        "sentiment_bert_value" and "sentiment_vader_value" and the number of
        non-null "conversation_id" values as "count_all_tweets". Weeks without
        tweets are dropped by the inner join.
    """
    bert_values = df["sentiment_bert_value"]
    vader_values = df["sentiment_vader_value"]

    #If meta data like retweets should be used, weight the values here.
    #The weighting works on new Series, so the caller's frame keeps its raw values and
    #calling the function again does not apply the weight a second time.
    if meta_data:
        weight = df["retweet_count"] + 1
        bert_values = bert_values * weight
        vader_values = vader_values * weight

    tweets = pd.DataFrame({
        "Year": df["Year"],
        "Week": df["Week"],
        "sentiment_bert_value": bert_values,
        "sentiment_vader_value": vader_values,
        "conversation_id": df["conversation_id"],
    })

    #One pass over the tweets yields all three weekly figures
    weekly_sentiment = tweets.groupby(by=["Year", "Week"], as_index=False).agg(
        sentiment_bert_value=("sentiment_bert_value", "sum"),
        sentiment_vader_value=("sentiment_vader_value", "sum"),
        count_all_tweets=("conversation_id", "count"),
    )

    return df_week.merge(weekly_sentiment, on=["Year", "Week"])

In [126]:
def create_df(df_twitter = df.copy()):
    """Assemble the weekly regression table from the finance data and the tweets.

    The weekly finance table from ``concat_finance_df()`` is combined with the
    tweet aggregates from ``create_final_df``; afterwards the week-over-week
    change of both sentiment sums is added, rows containing missing values are
    removed and the index is renumbered.

    Note that the default for ``df_twitter`` is a copy of ``df`` taken when
    this definition is executed, not when the function is called.
    """
    weekly = create_final_df(concat_finance_df(), df_twitter)

    #New column -> sentiment sum whose first difference it holds
    difference_columns = {
        "dif_sentiment_bert_value": "sentiment_bert_value",
        "dif_sentiment_vader_value": "sentiment_vader_value",
    }
    for target, source in difference_columns.items():
        weekly[target] = weekly[source].diff()

    #Drop incomplete rows (e.g. the first week has no difference) and renumber
    return weekly.dropna().reset_index(drop=True)

<h2>Linear Regression</h2>

In [127]:
def regression(df, regressor, output):
    """Fit an OLS model with statsmodels, print its summary and return the fit.

    Parameters
    ----------
    df : table holding the regressor and output columns.
    regressor : column label(s) used as explanatory variables.
    output : column label(s) used as dependent variable.

    Returns
    -------
    tuple
        ``(predictions, model)`` - the model's predictions for the rows it was
        fitted on and the fitted results object.
    """
    #Explanatory variables extended by a constant column via sm.add_constant
    design = sm.add_constant(df[regressor])
    target = df[output]

    model = sm.OLS(target, design).fit()
    predictions = model.predict(design)

    print(model.summary())

    return predictions, model

<h2>Execute</h2>

In [18]:
#Explanatory variables of the model
regressor = [
    "dif_sentiment_bert_value",
    "returns_stoxx",
    "returns_ma",
    "returns_visa",
    "Volume",
    "count_all_tweets",
]

#Dependent variable
regressand = ["returns_week"]

#Build the weekly table and fit the model on it
df_week_reg = create_df()
reg_predictions, reg_model = regression(df_week_reg, regressor, regressand)

                            OLS Regression Results                            
Dep. Variable:           returns_week   R-squared:                       0.765
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     123.1
Date:                Wed, 12 Oct 2022   Prob (F-statistic):           1.73e-68
Time:                        17:45:02   Log-Likelihood:                 308.42
No. Observations:                 234   AIC:                            -602.8
Df Residuals:                     227   BIC:                            -578.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

<h2>Test</h2>

In [113]:
#Scratch table: ISO week, year and weekday of every English tweet date
df_test = pd.DataFrame(
    [(iso.week, iso.year, iso.weekday) for iso in (day.isocalendar() for day in df_en["Date"])],
    columns=["Week", "Year", "Weekday"])

In [114]:
#Move weekend rows (Saturday = 6, Sunday = 7) to the previous week number; week 1 wraps to 52.
#Both masks are computed before any write so that a row is shifted at most once, and the
#write goes through .loc on the frame itself instead of a chained .iloc[row]["Week"] assignment,
#which is not guaranteed to reach the frame.
test_is_weekend = df_test["Weekday"] > 5
test_is_first_week = df_test["Week"] == 1
df_test.loc[test_is_weekend & ~test_is_first_week, "Week"] -= 1
df_test.loc[test_is_weekend & test_is_first_week, "Week"] = 52

In [115]:
#Show the first 15 rows of df_test
df_test.head(15)

Unnamed: 0,Week,Year,Weekday
0,8,2016,5
1,5,2016,4
2,5,2016,4
3,14,2016,5
4,14,2016,5
5,14,2016,5
6,14,2016,4
7,14,2016,4
8,21,2016,1
9,19,2016,7


In [84]:
#Show the first 15 rows of df_test
df_test.head(15)

Unnamed: 0,Week,Year,Weekday
0,8,2016,5
1,5,2016,4
2,5,2016,4
3,14,2016,5
4,14,2016,5
5,14,2016,5
6,14,2016,4
7,14,2016,4
8,21,2016,1
9,20,2016,7


In [116]:
#Number of rows whose week number is 0
int((df_test["Week"] == 0).sum())

0

In [103]:
#Show the first 15 rows of df_test
df_test.head(15)

Unnamed: 0,Week,Year,Weekday
0,8,2016,5
1,5,2016,4
2,5,2016,4
3,14,2016,5
4,14,2016,5
5,14,2016,5
6,14,2016,4
7,14,2016,4
8,21,2016,1
9,20,2016,7
