In [None]:
# Making necessary imports
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
import random
import time
from hmmlearn import hmm
import numpy as np
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import nltk
nltk.download('words')
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from abc import ABC, abstractmethod

In [None]:
# Load the Dow Jones series from a CSV in the working directory.
# Later cells read the "DJIA" column of this frame.
djia_dataset = pd.read_csv("DJA.csv")

In [None]:
# Drop every row that has a missing value in any column (the name is rebound in place).
djia_dataset = djia_dataset.dropna()

In [None]:
# Display the cleaned frame as the cell output.
djia_dataset

In [None]:
# Load the SNP and NASDAQ close-price CSVs from the working directory.
snp = pd.read_csv("SNP.csv")
# Keep only the row slice 1075:1829 of the SNP frame.
# NOTE(review): presumably this window lines the SNP rows up with the other two series -- confirm.
snp = snp[1075:1829]
nasdaq = pd.read_csv("NASDAQCOM.csv")

In [None]:
# Plot the raw close-price series of the three indices on a single figure.
# Each series is passed as a plain list, so the x-axis is the list position
# rather than a calendar date (the "Date" label is nominal).
fig = plt.figure(figsize=(10,8))
plt.plot(djia_dataset["DJIA"].tolist(), label='DJIA')
plt.plot(nasdaq["Close"].tolist(), label='NASDAQ')
plt.plot(snp["Close/Last"].tolist(), label='SNP500')
plt.legend(loc='upper left')
plt.xlabel("Date")
plt.ylabel("Stock Close Price")

In [None]:
# Function to calculate the step-to-step percentage variation of a price series
def find_variation(close_prices):
    """Return the percentage change between consecutive prices.

    Args:
        close_prices: Sequence of prices (list-like, supports ``len`` and slicing).

    Returns:
        A list with one entry per input price. The first entry is ``0`` (there
        is no earlier price to compare with); entry ``i`` is
        ``(p[i] - p[i-1]) / p[i] * 100``. An empty input yields an empty list so
        the result always has the same length as the input.

    Raises:
        ZeroDivisionError: If any price after the first is ``0``.
    """
    # An empty series has no variation at all; returning [0] here would make
    # the result one element longer than the input.
    if len(close_prices) == 0:
        return []
    variation = [0]
    for previous, current in zip(close_prices, close_prices[1:]):
        # NOTE(review): the change is divided by the *current* price; the usual
        # percent-change definition divides by the previous one. Kept as-is so
        # downstream thresholds are unaffected -- confirm which is intended.
        variation.append(((current - previous) / current) * 100)
    return variation

In [None]:
# Compute the per-step percentage variation of each index and plot the three
# series together. The x-axis is the list position of each observation.
djia_variation = find_variation(djia_dataset["DJIA"].tolist())
snp_variation = find_variation(snp["Close/Last"].tolist())
nasdaq_variation = find_variation(nasdaq["Close"].tolist())
fig = plt.figure(figsize=(10,6))
plt.plot(djia_variation, label = 'DJIA Close Price Variation')
plt.plot(nasdaq_variation, label = 'NASDAQ Close Price Variation')
plt.plot(snp_variation, label='SNP Close Price Variation')
plt.legend(loc='upper right')
plt.xlabel('Time Period/Date')
plt.ylabel('Variation')

In [None]:
# Function to generate random transition matrix and starting probability

def generate_random_transition_matrix(k=8):
    """Build a random, strongly diagonal ``k`` x ``k`` row-stochastic matrix.

    Every entry is first drawn uniformly from ``[0, 0.1 / k]``; the diagonal
    entry of each row is then topped up so that the row sums to 1. The
    off-diagonal mass of a row is therefore below 0.1 and each diagonal entry
    is above 0.9.

    Args:
        k: Number of states (matrix dimension). Defaults to 8.

    Returns:
        ``numpy.ndarray`` of shape ``(k, k)`` whose rows sum to 1.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    ceiling = 0.1 / k
    result = [[random.uniform(0, ceiling) for _ in range(k)] for _ in range(k)]
    for j, r in enumerate(result):
        # Put the remaining probability mass on the self-transition.
        r[j] += 1 - sum(r)
    return np.asarray(result)
def generate_random_start_prob(k=8):
    """Draw a random start-probability vector over ``k`` states.

    ``k`` values are drawn uniformly from ``[0, 1)`` with ``numpy.random`` and
    rescaled so that they sum to 1.

    Args:
        k: Number of states. Defaults to 8.

    Returns:
        One-dimensional ``numpy.ndarray`` of length ``k`` that sums to 1.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    weights = np.random.uniform(0, 1, size=k)
    return weights / np.sum(weights)

In [None]:
# CSP Solver
from queue import PriorityQueue
from abc import ABC, abstractmethod


class Constraint(ABC):
    """Abstract base for a constraint over a group of CSP variables."""

    def __init__(self, variables):
        # The variables this constraint is defined over.
        self.variables = variables

    @abstractmethod
    def satisfied(self, assignment):
        """Tell whether ``assignment`` is compatible with this constraint.

        Concrete subclasses must override this method.
        """
        ...


class CSP():
    """Minimal constraint-satisfaction problem solved by plain backtracking.

    Attributes are set in ``__init__``:
        variables:   the variables to assign.
        domains:     mapping variable -> iterable of candidate values.
        constraints: mapping variable -> list of constraints that mention it.
    """

    def __init__(self, variables, domains):
        """Store the problem and check that every variable has a domain.

        Raises:
            LookupError: If a variable has no entry in ``domains``.
        """
        self.variables = variables
        self.domains = domains
        self.constraints = {}
        for variable in self.variables:
            self.constraints[variable] = []
            if variable not in self.domains:
                raise LookupError(
                    'Every variable should have a domain assigned to it.')

    def add_constraint(self, constraint):
        """Register ``constraint`` under each variable it mentions.

        All variables are validated before anything is stored, so a rejected
        constraint leaves the CSP unchanged.

        Raises:
            LookupError: If the constraint mentions an unknown variable.
        """
        for variable in constraint.variables:
            if variable not in self.variables:
                raise LookupError("Variable in constraint not in CSP")
        for variable in constraint.variables:
            self.constraints[variable].append(constraint)

    def consistent(self, variable, assignment):
        """Return True if every constraint on ``variable`` accepts ``assignment``."""
        for constraint in self.constraints[variable]:
            if not constraint.satisfied(assignment):
                return False
        return True

    def backtracking_search(self, assignment=None):
        """Depth-first search for a complete, consistent assignment.

        Args:
            assignment: Optional partial assignment to start from. It is never
                modified. ``None`` (the default) starts from a fresh empty dict.

        Returns:
            A dict mapping every variable to a value, or ``None`` when no
            consistent assignment exists.
        """
        # A fresh dict per call: a shared ``{}`` default would be handed back to
        # the caller when there are no variables and could then be mutated.
        if assignment is None:
            assignment = {}
        # assignment is complete if every variable is assigned (our base case)
        if len(assignment) == len(self.variables):
            return assignment
        # get all variables in the CSP but not in the assignment
        unassigned = [v for v in self.variables if v not in assignment]
        first = unassigned[0]
        for value in self.domains[first]:
            local_assignment = assignment.copy()
            local_assignment[first] = value
            # if we're still consistent, we recurse (continue)
            if self.consistent(first, local_assignment):
                result = self.backtracking_search(local_assignment)
                if result is not None:
                    return result
        return None


class Constraint(Constraint):
    """Binary constraint requiring two variables to take different values.

    NOTE(review): this class reuses the name of the abstract base it derives
    from, so after this definition ``Constraint`` refers to this concrete class.
    """

    def __init__(self, place1, place2):
        self.place1, self.place2 = place1, place2
        super().__init__([place1, place2])

    def satisfied(self, assignment):
        """Return False only when both variables are assigned the same value."""
        both_assigned = self.place1 in assignment and self.place2 in assignment
        if both_assigned:
            return assignment[self.place1] != assignment[self.place2]
        # With either variable still unassigned there is nothing to violate yet.
        return True

In [None]:
# Trump Tweet Sentiment Analysis
# VADER scorer plus the NLTK English word list consulted by cleaner() further down.
sid = SentimentIntensityAnalyzer()
words = set(nltk.corpus.words.words())
real_donald_trump = pd.read_csv("realdonaldtrump.csv")
# NOTE(review): trump_tweets is not referenced again in the visible cells -- confirm it is still needed.
trump_tweets = pd.read_csv("trumptweets.csv")
# Keep tweets dated 2017-01-20 .. 2020-01-20.
# NOTE(review): 'date' is compared against plain strings; if the column carries a time
# component, tweets sent on 2020-01-20 itself would compare greater than "2020-01-20" -- confirm.
cond1 = real_donald_trump['date'] >= "2017-01-20"
cond2 = real_donald_trump['date'] <= "2020-01-20"
# where() blanks every non-matching row to NaN and dropna() then removes any row that
# holds a NaN, so in-range tweets with a missing value in any column are discarded too.
real_donald_trump = real_donald_trump.where((cond1 & cond2)).dropna()
def cleaner(tweet):
    """Normalise one tweet before sentiment scoring.

    Steps, in order: strip ``@handle`` mentions, strip link-like tokens,
    collapse whitespace, drop ``#`` while keeping the tag text, turn ``_`` into
    a space, then keep only tokens that are non-alphabetic or appear
    (lower-cased) in the module-level ``words`` set.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text with the surviving tokens joined by single spaces.
    """
    # Mentions first, then anything starting with @, http(s):// or www.
    without_mentions = re.sub("@[A-Za-z0-9]+","",tweet)
    without_links = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", without_mentions)
    collapsed = " ".join(without_links.split())
    dehashed = collapsed.replace("#", "").replace("_", " ")
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(dehashed):
        # Punctuation and numbers always survive; alphabetic tokens must be known words.
        if not token.isalpha() or token.lower() in words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)
    

# Cleaned tweet text, one entry per tweet.
real_donald_trump['content_clean'] = real_donald_trump['content'].apply(cleaner)
# Extra lexicon entries merged into VADER below.
# NOTE(review): most keys look unrelated to this dataset, and VADER presumably scores
# lower-cased single tokens, so 'MAGA' and 'cancel culture' may never match -- confirm.
word_dict = {'manipulate':-1,'manipulative':-1,'jamescharlesiscancelled':-1,'jamescharlesisoverparty':-1,
            'pedophile':-1,'pedo':-1,'cancel':-1,'cancelled':-1,'cancel culture':0.4,'teamtati':-1,'teamjames':1,
            'teamjamescharles':1,'liar':-1,'MAGA':-1}


# Fresh analyzer (replaces the instance created earlier in this cell) with the custom entries added.
sid = SentimentIntensityAnalyzer()
sid.lexicon.update(word_dict)

# Score every cleaned tweet and keep only the 'compound' value of the VADER result.
list1 = []
for i in real_donald_trump['content_clean']:
    list1.append((sid.polarity_scores(str(i)))['compound'])
real_donald_trump['sentiment'] = list1

In [None]:
# Human-readable label for each of the eight state ids.
# Ids 0-3 are rises of increasing size, ids 4-7 are drops of increasing size;
# classify_state_variation() emits ids in this same numbering.
states_dict = {
    0: 'very-small-rise',
    1: 'small-rise',
    2: 'large-rise',
    3: 'very-large-rise',
    4: 'very-small-drop',
    5: 'small-drop',
    6: 'large-drop',
    7: 'very-large-drop',
}

In [None]:
# Function to train HMM without CSP
def training_hmm_without_csp(training_data):
    """Fit an 8-state Gaussian-mixture HMM on ``training_data``.

    The model relies on hmmlearn's own parameter initialisation. The previous
    version also drew a random start vector and transition matrix but never
    handed them to the model, so those dead draws were removed.

    Args:
        training_data: Observations passed straight to ``GMMHMM.fit``
            (callers in this notebook pass an ``(n_samples, 1)`` array).

    Returns:
        The fitted ``hmm.GMMHMM`` instance.
    """
    model = hmm.GMMHMM(n_components=8, n_mix = 7, covariance_type="diag", n_iter=10)
    model.fit(training_data)
    return model

In [None]:
# Function to train HMM with CSP
def training_hmm_with_csp(training_data, tweet_dataset):
    """Solve the tweet-derived CSP, then fit an 8-state Gaussian-mixture HMM.

    Each tweet sentiment score selects one pair of state variables that must
    take different values:

        score >= 0.9          -> ("4", "3")
        0.6 <= score < 0.9    -> ("1", "2")
        0.3 <= score < 0.6    -> ("4", "5")
        anything else         -> ("6", "7")

    Fixes over the previous version: the third range was written as
    ``>= 0.3 and < 0.1`` and could never match; the trailing ``else`` was bound
    to that third ``if`` only, so it fired for every tweet; one constraint
    object was registered per tweet although only four distinct pairs exist;
    the domain table had an entry for a non-existent variable "8"; and two
    random draws were computed but never used.

    NOTE(review): the upper bound 0.6 of the third range is inferred from the
    neighbouring ranges -- confirm the intended thresholds.
    NOTE(review): the CSP solution is not fed into the HMM, so the fitted model
    depends on ``training_data`` only -- confirm how the solution should
    constrain the model.

    Args:
        training_data: Observations passed straight to ``GMMHMM.fit``.
        tweet_dataset: Mapping/frame whose ``"sentiment"`` column supports
            ``.tolist()``.

    Returns:
        The fitted ``hmm.GMMHMM`` instance.
    """
    variables = [str(state) for state in range(8)]
    domains = {variable: ["0", "1"] for variable in variables}
    csp = CSP(variables, domains)

    # Collect each distinct pair once; duplicates add no information and only
    # slow down every consistency check during the search.
    pairs = []
    for sentiment in tweet_dataset["sentiment"].tolist():
        if sentiment >= 0.9:
            pair = ("4", "3")
        elif sentiment >= 0.6:
            pair = ("1", "2")
        elif sentiment >= 0.3:
            pair = ("4", "5")
        else:
            pair = ("6", "7")
        if pair not in pairs:
            pairs.append(pair)
    for place1, place2 in pairs:
        csp.add_constraint(Constraint(place1, place2))

    # Solved for parity with the original flow; see the NOTE in the docstring.
    solution = csp.backtracking_search()

    model = hmm.GMMHMM(n_components=8, n_mix = 7, covariance_type="diag", n_iter=10)
    # NOTE(review): with hmmlearn's default init_params, fit() presumably
    # re-initialises transmat_, so this identity seed has no effect -- confirm.
    model.transmat_ = np.identity(8)
    model.fit(training_data)

    return model
    

In [None]:
# Decode the most likely hidden-state index for every observation
def calculate_index_states(model, X_test):
    """Return ``model.predict`` applied to ``X_test`` reshaped to one column.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Flat sequence of observations.

    Returns:
        Whatever ``model.predict`` returns for the ``(n, 1)`` observation array.
    """
    observations = np.reshape(np.asarray(X_test), (-1, 1))
    return model.predict(observations)

In [None]:
# Map each percentage variation to one of the eight state ids
def classify_state_variation(variations):
    """Bucket percentage variations into state ids 0-7.

    Non-negative values map to ids 0-3 and negative values to ids 4-7; within
    each half the id grows with the magnitude, using the cut points 0.1, 1 and
    2 (a magnitude equal to a cut point falls in the higher bucket).
    Values that are neither ``>= 0`` nor ``< 0`` (NaN) produce no entry, so the
    result can be shorter than the input.

    Args:
        variations: Iterable of numeric percentage changes.

    Returns:
        List of integer state ids.
    """
    cut_points = (0.1, 1, 2)
    states = []
    for change in variations:
        if change >= 0:
            offset, magnitude = 0, change
        elif change < 0:
            offset, magnitude = 4, -change
        else:
            # NaN compares false both ways; it is skipped.
            continue
        bucket = 0
        for cut in cut_points:
            if magnitude >= cut:
                bucket += 1
        states.append(offset + bucket)
    return states

In [None]:
# Classify every DJIA variation into its state id (0-7)

djia_states = classify_state_variation(djia_variation)

In [None]:
# Choosing the best out of n_models (default 100) models without CSP
def choose_best_model_without_csp(data, n_models=100):
    """Train several HMMs without CSP and keep the best-scoring one.

    The first 80% of ``data`` is used for training and the remainder for
    scoring. Every candidate is trained once and scored once; on a tie the
    later candidate wins.

    Args:
        data: Flat sequence of observations.
        n_models: Number of candidates to train. Defaults to 100.

    Returns:
        Tuple ``(best_model, best_model_score)``.

    Raises:
        ValueError: If ``n_models`` is smaller than 1.
    """
    if n_models < 1:
        raise ValueError("n_models must be at least 1")
    split = int(0.8*len(data))
    X_train = np.asarray(data[:split]).reshape(-1, 1)
    X_test = np.asarray(data[split:]).reshape(-1, 1)

    best_model = None
    best_model_score = None
    for i in range(1, n_models + 1):
        model = training_hmm_without_csp(X_train)
        # Scoring is expensive; evaluate it once per candidate.
        score = model.score(X_test)
        print("Model " + str(i) + ": Score = " + str(score))
        if best_model is None or score >= best_model_score:
            best_model_score = score
            best_model = model

    print("Best Model Found - Score = " + str(best_model_score))
    return best_model, best_model_score

In [None]:
# Choosing the best out of n_models (default 100) models with CSP
def choose_best_model_with_csp(data, tweet_dataset, n_models=100):
    """Train several HMMs with CSP and keep the best-scoring one.

    The first 80% of ``data`` is used for training and the remainder for
    scoring. Every candidate, including the first, is trained with the CSP
    trainer (the previous version seeded the search with a model from the
    non-CSP trainer, which could then be returned as the "with CSP" winner).
    Each candidate is trained once and scored once; on a tie the later
    candidate wins.

    Args:
        data: Flat sequence of observations.
        tweet_dataset: Tweet frame forwarded to ``training_hmm_with_csp``.
        n_models: Number of candidates to train. Defaults to 100.

    Returns:
        Tuple ``(best_model, best_model_score)``.

    Raises:
        ValueError: If ``n_models`` is smaller than 1.
    """
    if n_models < 1:
        raise ValueError("n_models must be at least 1")
    split = int(0.8*len(data))
    X_train = np.asarray(data[:split]).reshape(-1, 1)
    X_test = np.asarray(data[split:]).reshape(-1, 1)

    best_model = None
    best_model_score = None
    for i in range(1, n_models + 1):
        model = training_hmm_with_csp(X_train, tweet_dataset)
        # Scoring is expensive; evaluate it once per candidate.
        score = model.score(X_test)
        print("Model " + str(i) + ": Score = " + str(score))
        if best_model is None or score >= best_model_score:
            best_model_score = score
            best_model = model

    print("Best Model Found - Score = " + str(best_model_score))
    return best_model, best_model_score

In [None]:
# Scale the DJIA values so that they sum to 1 (each value divided by the series
# total), then run both model searches on the scaled series.
# NOTE(review): the models are trained on scaled price levels, while djia_states
# is derived from percentage variations -- confirm that this is intended.
djia_dataset_normalized = np.asarray(djia_dataset["DJIA"].tolist())
djia_dataset_normalized = djia_dataset_normalized/sum(djia_dataset_normalized)
djia_dataset_normalized = djia_dataset_normalized.tolist()

# Each call trains and scores many HMMs, so this cell is slow to run.
best_model_without_csp, best_model_without_csp_score = choose_best_model_without_csp(djia_dataset_normalized)
best_model_with_csp, best_model_with_csp_score = choose_best_model_with_csp(djia_dataset_normalized, real_donald_trump)

In [None]:
# Decode hidden-state indices for the last 20% of the scaled series (the same
# slice the model searches used for scoring), once per best model.
index_states_without_csp = calculate_index_states(best_model_without_csp, djia_dataset_normalized[int(0.8*len(djia_dataset_normalized)):])
index_states_with_csp = calculate_index_states(best_model_with_csp, djia_dataset_normalized[int(0.8*len(djia_dataset_normalized)):])

In [None]:
# Variation-derived state ids for the same last-20% slice, used as the reference labels.
djia_states_testing = djia_states[int(0.8*len(djia_dataset_normalized)):]

In [None]:
# Count predictions that fall within `tolerance` states of the reference
def correct_predictions(actual_states, calculated_states, tolerance=3):
    """Count positions where the two state sequences differ by at most ``tolerance``.

    NOTE(review): with eight states the default tolerance of 3 accepts most
    pairs, and HMM state indices are presumably arbitrary labels rather than
    the ids produced by the variation classifier -- confirm that this metric
    is meaningful before relying on it.

    Args:
        actual_states: Reference state ids.
        calculated_states: Predicted state ids; indexed at every position of
            ``actual_states``.
        tolerance: Largest absolute difference still counted as correct.
            Defaults to 3.

    Returns:
        Number of matching positions as an ``int``.

    Raises:
        IndexError: If ``calculated_states`` is shorter than ``actual_states``.
    """
    return sum(
        1
        for i in range(len(actual_states))
        if abs(actual_states[i] - calculated_states[i]) <= tolerance
    )

In [None]:
# Accuracy = share of test positions counted as correct by correct_predictions()
accuracy_without_csp = correct_predictions(djia_states_testing, index_states_without_csp)/len(djia_states_testing)
accuracy_with_csp = correct_predictions(djia_states_testing, index_states_with_csp)/len(djia_states_testing)

In [None]:
# Display the accuracy of the CSP-trained model as the cell output.
accuracy_with_csp

In [None]:
# Mean absolute percentage error between two state sequences
def calculate_mape(actual_states, calculated_states):
    """Return a MAPE-style error, in percent, between two state sequences.

    Each term is ``|actual - calculated| / (actual + 100)``; the terms are summed
    over the positions of ``actual_states`` and divided by the length of
    ``calculated_states``.

    NOTE(review): the ``+ 100`` in the denominator presumably avoids dividing
    by a state id of 0 -- confirm, since it also shrinks every term.

    Args:
        actual_states: Reference state ids.
        calculated_states: Predicted state ids.

    Returns:
        The error as a percentage.

    Raises:
        ZeroDivisionError: If ``calculated_states`` is empty.
    """
    total = 0
    for position, truth in enumerate(actual_states):
        guess = calculated_states[position]
        total = total + abs((truth - guess) / (truth + 100))
    return (total / len(calculated_states)) * 100

In [None]:
# Display the MAPE of the CSP-trained model's states against the reference states.
calculate_mape(djia_states_testing, index_states_with_csp)

In [None]:
# Predict Prices
def predict_prices(data, index_states):
    """Roll a price path forward from the predicted state sequence.

    The path starts at the last training value (``data[split - 1]`` with
    ``split = int(0.8 * len(data))``). Each following step multiplies the
    previous predicted value by ``1 + pct / 100``, where ``pct`` is the
    percentage move assigned to that step's state id.

    Fixes over the previous version: the percentage table is now ordered like
    ``states_dict`` (ids 0-3 are rises, ids 4-7 are drops) instead of running
    from -3 to 3, and the path has one entry per element of the test slice
    ``data[split:]`` rather than ``int(0.2 * len(data))`` entries, which could
    be one short.

    Args:
        data: Full price series (training part followed by test part).
        index_states: State id for each element of the test slice; element 0
            is not used because the path is seeded with the last training value.

    Returns:
        List of predicted prices, one per element of the test slice.
    """
    # Percentage move per state id: very-small/small/large/very-large rise,
    # then the same four magnitudes as drops.
    price_variation = [0.1, 1, 2, 3, -0.1, -1, -2, -3]
    split = int(0.8*len(data))
    horizon = len(data) - split
    predicted_prices = [data[split - 1]]
    for i in range(1, horizon):
        previous = predicted_prices[i-1]
        predicted_prices.append(previous + previous*price_variation[index_states[i]]/100)
    return predicted_prices

In [None]:
# Build one predicted price path per model from its decoded state sequence.
predict_prices_without_csp = predict_prices(djia_dataset_normalized, index_states_without_csp)
predict_prices_with_csp = predict_prices(djia_dataset_normalized, index_states_with_csp)

In [None]:
# State Transition: decoded state index over time, one panel per model
# (top: without CSP, bottom: with CSP). Only the top panel carries a title.
#fig = plt.figure(figsize=(10,5))
fig, ax = plt.subplots(2)
ax[0].plot(index_states_without_csp, label = 'State transition without CSP')
ax[1].plot(index_states_with_csp,label = 'State transition with CSP', color='green')
ax[0].set_xlabel("Time")
ax[0].set_ylabel("State")
ax[1].set_xlabel("Time")
ax[1].set_ylabel("State")
ax[0].set_title('State transition without and with CSP')

In [None]:
# Plot both predicted price paths against the actual last-20% slice.
# All three series are on the scaled (sum-to-1) DJIA values, not raw index points.
fig = plt.figure(figsize=(10,10))
plt.plot(predict_prices_without_csp, label = 'Predicted Values - Without CSP')
plt.plot(predict_prices_with_csp, label = 'Predicted Values - With CSP')
plt.plot(djia_dataset_normalized[int(0.8*len(djia_dataset_normalized)):], label = 'Actual Trend')
plt.legend(loc = 'upper left')
plt.xlabel("Time")
plt.ylabel("DJIA Index Value")

In [None]:
#import pickle
#with open("bestfitdjiamodeln.pkl", "wb") as f: 
#    pickle.dump(best_model, f)

In [None]:
# 80/20 chronological split of the raw DJIA values, reshaped to one column.
# NOTE(review): this uses the raw series, whereas the model searches above ran
# on the scaled series -- confirm that this is intended for the timing run.
X = djia_dataset["DJIA"].tolist()
X_train = X[:int(0.8*len(X))]
X_test = X[int(0.8*len(X)):]
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [None]:
# Measure the wall-clock training time of one model without and one with CSP
def finding_model_time(X_train, tweet_dataset):
    """Print how long a single training run takes, without and then with CSP.

    Two lines are printed, in this order: the elapsed seconds for
    ``training_hmm_without_csp`` and the elapsed seconds for
    ``training_hmm_with_csp``. ``time.perf_counter`` is used because it is a
    monotonic clock meant for measuring intervals.

    Args:
        X_train: Training observations forwarded to both trainers.
        tweet_dataset: Tweet frame forwarded to the CSP trainer.

    Returns:
        None.
    """
    start_time = time.perf_counter()
    training_hmm_without_csp(X_train)
    print("--- %s seconds ---" % (time.perf_counter() - start_time))
    start_time = time.perf_counter()
    training_hmm_with_csp(X_train, tweet_dataset)
    print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Time one training run of each kind on the raw-value training split.
finding_model_time(X_train, real_donald_trump)