# Master in Science in Data Analytics for Business
## CA2 - Integrated CA2 

- Student: Wendy Paola Espinoza Potoy
- ID: 2021133

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read data from file specifying header names
tweets = pd.read_csv('ProjectTweets.csv', header=None, names=['ID', 'unknown', 'date', 'flag', 'user', 'text'])
tweets.head()

FileNotFoundError: [Errno 2] No such file or directory: 'ProjectTweets.csv'

In [None]:
tweets.shape


In [None]:
tweets.info()

In [None]:
# Convert the 'date' column to a datetime object without timezone information
tweets['date'] = pd.to_datetime(tweets['date'].str.replace('PDT', ''), errors='coerce')

# Display the DataFrame
tweets.head()

In [None]:
# Remove unnecessary columns
tweets.drop(['ID', 'unknown', 'flag'], axis=1, inplace=True)

In [None]:
tweets.head()

# Data Cleaning and text processing to apply sentimental analyst

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re


In [None]:
def remove_special_characters(text):
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove punctuation and numbers
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = text.lower()   # Convert to lowercase
    
    return text

In [None]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    # Join filtered tokens back into text
    text = ' '.join(filtered_tokens)
    return text

In [None]:
#nltk.download('wordnet')
#nltk.download('punkt')

In [None]:
def lemmatized(text):
    
    # Initialize the WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Lemmatize each word and join back into a string
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in words])

    return text

In [None]:
def clean_text(text):
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = lemmatized(text)
    text = remove_stopwords(text)
    return text

# Apply clean_text function to 'text' column
tweets['clean_text'] = tweets['text'].apply(clean_text)

In [None]:
tweets.head(10)

## Sentimental Analyst

In [None]:
#pip install textblob vaderSentiment nltk
#pip install textblob nltk

In [None]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
#Function to calculate sentiment using TextBlob

from textblob import TextBlob

# Defining the function to analyze sentiment using TextBlob
def textblob_sentiment(text):
    # Perform sentiment analysis
    blob = TextBlob(text)
    sentiment = blob.sentiment
    
    # Determine sentiment label
    if sentiment.polarity > 0:
        sentiment_label = 'positive'
    elif sentiment.polarity < 0:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'
    
    # Return sentiment label and score
    return sentiment_label, sentiment.polarity

# Apply sentiment analysis to the tweets DataFrame
tweets[['TextBlob sentiment', 'TextBlob score']] = tweets['clean_text'].apply(lambda text: pd.Series(textblob_sentiment(text)))



In [None]:
# Download the VADER lexicon 
#nltk.download('vader_lexicon')

In [None]:
# Instantiate the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Define a function to analyze sentiment using VADER
def analyze_sentiment(text):
    # Perform sentiment analysis
    scores = sid.polarity_scores(text)
    
    # Classify the sentiment
    if scores['compound'] >= 0.05:
        sentiment = 'positive'
    elif scores['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    
    # Return the sentiment label and scores
    return sentiment, scores

# Apply sentiment analysis to the tweets DataFrame
tweets[['VADER sentiment', 'VADER compound', ]] = tweets['text'].apply(lambda text: pd.Series(analyze_sentiment(text)))


In [None]:
tweets.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot TextBlob sentiment
sns.countplot(data=tweets, x='TextBlob sentiment', order=['positive', 'neutral', 'negative'], ax=axes[0])
axes[0].set_title('TextBlob Sentiment Analysis')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')

# Plot VADER sentiment
sns.countplot(data=tweets, x='VADER sentiment', order=['positive', 'neutral', 'negative'], ax=axes[1])
axes[1].set_title('VADER Sentiment Analysis')
axes[1].set_xlabel('Sentiment')
axes[1].set_ylabel('Count')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
tweets.head()

In [None]:
# Grouping by date and sentiment
tweets_sentiment_daily = tweets.groupby(tweets['date'].dt.date)['TextBlob sentiment'].value_counts().unstack(fill_value=0).reset_index()

# Ensuring the 'date' column is in datetime format
tweets_sentiment_daily['date'] = pd.to_datetime(tweets_sentiment_daily['date'])

# Display the first 10 rows of the resulting DataFrame
tweets_sentiment_daily.head(10)

In [None]:
tweets_sentiment_daily.shape

In [None]:
# Sum the values for each sentiment category
total_sentiments = tweets_sentiment_daily[['negative', 'neutral', 'positive']].sum()

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))

# Bar plot
total_sentiments.plot(kind='bar', ax=ax, color=['red', 'gray', 'green'])

# Setting labels and title
ax.set_xlabel('Sentiment')
ax.set_ylabel('Total Count')
ax.set_title('Total Counts of Each Sentiment Category')
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'], rotation=0)

# Show plot
plt.show()

In [None]:
# Plot histograms for each sentiment column
tweets_sentiment_daily[['negative', 'neutral', 'positive']].hist(bins=50, figsize=(12, 6), layout=(1, 3))
plt.suptitle('Distribution of Sentiment Counts')
plt.show()


In [None]:
# Plot box plots for each sentiment column
tweets_sentiment_daily[['negative', 'neutral', 'positive']].plot(kind='box', subplots=True, layout=(1, 3), figsize=(12, 6))
plt.suptitle('Box Plot of Sentiment Counts')
plt.show()


In [None]:
# plotting Sentiment Analysis Over Time 

plt.figure(figsize=(12, 6))
width = 0.2  # Width of the bars
dates = tweets_sentiment_daily['date']

# Creating bar positions
x = dates
x1 = [d - pd.Timedelta(days=0.2) for d in dates]
x2 = [d + pd.Timedelta(days=0.2) for d in dates]

plt.bar(x1, tweets_sentiment_daily['negative'], width=width, label='Negative', color='red')
plt.bar(x, tweets_sentiment_daily['neutral'], width=width, label='Neutral', color='blue')
plt.bar(x2, tweets_sentiment_daily['positive'], width=width, label='Positive', color='green')

plt.xticks(dates, rotation=45, fontsize=8)
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Sentiment Analysis Over Time')
plt.grid(True, linewidth=0.2)
plt.tight_layout()
plt.legend()
plt.show()

note: There are gaps in dates to apply time series is required to fill missing dates. 

In [None]:
# Generate date range to fill missing days
date_range = pd.date_range(start='2009-04-06', end='2009-06-25', freq='D')

# Create DataFrame
completed_dates = pd.DataFrame(date_range, columns=['date'])

# Display the size and first few rows
print('Size:', completed_dates.size)
print(completed_dates.head())

In [None]:
#Merging actual data with complete dates
dates_new = pd.merge(completed_dates, tweets_sentiment_daily,on='date', how='left')  
dates_new.head()

In [None]:
dates_new.shape

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Converting to DataFrame
dates_new = pd.DataFrame(data)

# Ensure 'date' column is in the DataFrame and convert to datetime
if 'date' in dates_new.columns:
    dates_new['date'] = pd.to_datetime(dates_new['date'])
else:
    print("Error: 'date' column not found in the DataFrame")

# Mean Imputation
mean_imputed = dates_new.fillna(dates_new.mean(numeric_only=True))


# K-Nearest Neighbors (KNN) Imputation
imputer = KNNImputer(n_neighbors=2)
knn_imputed = dates_new.copy()
knn_imputed[['negative', 'neutral', 'positive']] = imputer.fit_transform(dates_new[['negative', 'neutral', 'positive']])


# Interpolation
interp_imputed = dates_new.copy()
interp_imputed = interp_imputed.interpolate(method='linear', limit_direction='forward')


# Function to compare and determine the best method
def compare_imputations(original, mean_imputed, knn_imputed, interp_imputed):

  
    # In this specific case, interpolation seems to follow the trend of the data better.
    print("Conclusion: Interpolation seems to be the best method for this dataset") 
# Compare the imputed DataFrames
compare_imputations(dates_new, mean_imputed, knn_imputed, interp_imputed)



In [None]:
# Fill NA values with linear interpolation 
dates_new_int = dates_new.interpolate(method='linear', axis=0) 

# rounding to INT values
for col in columns:
    dates_new[col] = round(dates_new[col])

dates_new_int.head()

In [None]:
# Plot before imputation
plt.figure(figsize=(12, 6))
for col in columns:
    plt.plot(dates_new['date'], dates_new[col], label=col)

plt.title('Sentiment Counts Over Time (Before Imputation)')
plt.xlabel('Date')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plot after imputation
plt.figure(figsize=(12, 6))
for col in columns:
    plt.plot(dates_new_int['date'], dates_new_int[col], label=col)

plt.title('Sentiment Counts Over Time (After Imputation)')
plt.xlabel('Date')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
#Sentiment Analysis Over Time  with imputations

plt.figure(figsize=(12, 6))
width = 0.2  # Width of the bars
dates = dates_new_int['date']

# Creating bar positions
x = dates
x1 = [d - pd.Timedelta(days=0.2) for d in dates]
x2 = [d + pd.Timedelta(days=0.2) for d in dates]

plt.bar(x1, dates_new_int['negative'], width=width, label='Negative', color='red')
plt.bar(x, dates_new_int['neutral'], width=width, label='Neutral', color='blue')
plt.bar(x2, dates_new_int['positive'], width=width, label='Positive', color='green')

plt.xticks(dates, rotation=45, fontsize=8)
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Sentiment Analysis Over Time with imputations ')
plt.grid(True, linewidth=0.2)
plt.tight_layout()
plt.legend()
plt.show()

# Check time series data 

In [None]:
# Augmented Dickey-Fuller Test for stationarity
from statsmodels.tsa.stattools import adfuller

for col in ['negative', 'neutral', 'positive']:
    result = adfuller(dates_new_int[col])
    print(f'ADF Statistic for {col}: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Stationary: Yes' if result[1] < 0.05 else 'Stationary: No')


Stationarity means that the statistical properties of a time series (such as mean, variance, and autocorrelation) do not change over time

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
import numpy as np

for col in ['negative', 'neutral', 'positive']:
    fig, ax = plt.subplots(figsize=(10, 5))
    
    # Plot autocorrelation
    plot_acf(dates_new_int[col], lags=30, ax=ax)
    plt.title(f'Autocorrelation Plot for {col}')
    plt.xlabel('Lag')
    plt.ylabel('Autocorrelation')
    
    # Annotate significant correlations
    conf_level = 1.96 / np.sqrt(len(dates_new_int[col]))  # 95% confidence interval
    ax.axhline(y=conf_level, linestyle='--', color='gray')  # Upper confidence interval
    ax.axhline(y=-conf_level, linestyle='--', color='gray')  # Lower confidence interval
    
    # Perform ADF test for stationarity
    result = adfuller(dates_new_int[col])
    adf_stat = result[0]
    p_value = result[1]
    critical_values = result[4]
    
    # Determine if the series is stationary
    is_stationary = p_value < 0.05
    
    # Add text annotation for ADF test result at the bottom of the plot
    adf_text = f'ADF Statistic: {adf_stat:.2f}\n' \
               f'p-value: {p_value:.2f}\n' \
               f'Critical Values: {critical_values}\n' \
               f'Stationary: {"Yes" if is_stationary else "No"}'
    
    ax.text(0.5, -0.25, adf_text, transform=ax.transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.5),
            verticalalignment='top', horizontalalignment='center')

    plt.tight_layout()
    plt.show()

    

In [None]:
#Seasonal Decomposition:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the time series for each sentiment category
for col in ['negative', 'neutral', 'positive']:
    decomposition = seasonal_decompose(dates_new_int[col], model='additive', period=7)  # Assuming weekly seasonality
    decomposition.plot()
    plt.suptitle(f'Seasonal Decomposition of {col.capitalize()} Sentiment')
    plt.show()


# Modelling

In [None]:
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
# Convert 'date' column to datetime
dates_new_int['date'] = pd.to_datetime(dates_new_int['date'])


In [None]:
# Extract only the sentiment columns for modeling
data = dates_new_int[['negative', 'neutral', 'positive']].values

# Normalize the data to a small range 
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Split the data into training and testing sets (70% training, 30% testing)
train_size = int(len(scaled_data) * 0.7)
train_data = scaled_data[:train_size]
test_data = scaled_data[train_size:]

# Applying Sarima

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import itertools

# Define the hyperparameters to test
p = [1, 2]  # Autoregressive order
d = [1]     # Differencing
q = [1, 2]  # Moving average order
P = [1]     # Seasonal autoregressive order
D = [1]     # Seasonal differencing
Q = [1]     # Seasonal moving average order
s = [12]    # Seasonal period

# Generate all possible combinations of hyperparameters
hyperparameters_grid = list(itertools.product(p, d, q, P, D, Q, s))

# Dictionary to store SARIMA models and their corresponding AIC values
sarima_models_aic = {}

# Loop through each combination of hyperparameters
for hyperparameters in hyperparameters_grid:
    # Fit SARIMA model with current hyperparameters to each sentiment category
    for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
        model_sarima = SARIMAX(train_data[:, i], order=(hyperparameters[0], hyperparameters[1], hyperparameters[2]), 
                               seasonal_order=(hyperparameters[3], hyperparameters[4], hyperparameters[5], hyperparameters[6]), 
                               initialization='approximate_diffuse')
        results_sarima = model_sarima.fit(enforce_invertibility=True)
        sarima_models_aic[(sentiment, hyperparameters)] = results_sarima.aic

# Find the hyperparameters with the lowest AIC for each sentiment category
best_hyperparameters = {}
for sentiment in ['negative', 'neutral', 'positive']:
    min_aic = min(sarima_models_aic[(sentiment, hyperparameters)] for hyperparameters in hyperparameters_grid)
    best_hyperparameters[sentiment] = [hyperparameters for hyperparameters in hyperparameters_grid 
                                       if sarima_models_aic[(sentiment, hyperparameters)] == min_aic]

# Print the best hyperparameters for each sentiment category
for sentiment in ['negative', 'neutral', 'positive']:
    print(f"Best hyperparameters for {sentiment}: {best_hyperparameters[sentiment]}")


In [None]:
#working with time series data adding sentiments and date and the scaling data

columns = ['negative', 'neutral', 'positive']
dates = pd.date_range(start='2009-04-06', periods=len(scaled_data), freq='D') 
data = pd.DataFrame(scaled_data, columns=columns, index=dates)


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Best hyperparameters for each sentiment category
best_hyperparameters = {
    'negative': {'order': (2, 1, 1), 'seasonal_order': (1, 1, 1, 12)},
    'neutral': {'order': (1, 1, 1), 'seasonal_order': (1, 1, 1, 12)},
    'positive': {'order': (1, 1, 1), 'seasonal_order': (1, 1, 1, 12)}
}

# Dictionary to store Sarima models
sarima_models = {}

# Fitting a SARIMA model to each sentiment category with best parameters
for sentiment, params in best_hyperparameters.items():
    model_sarima = SARIMAX(train_data[:, i], order=params['order'], seasonal_order=params['seasonal_order'],
                           initialization='approximate_diffuse')
    results_sarima = model_sarima.fit(enforce_invertibility=True)
    sarima_models[sentiment] = results_sarima  # Store the fitted model
    print(f"Summary of SARIMA model for {sentiment}:")
    print(results_sarima.summary())
    print("\n")


In [None]:
# Function to generate forecasts for each sentiment
def forecast_sarima(models, steps):
    forecasted = {}
    for sentiment in models:
        forecast = models[sentiment].get_forecast(steps=steps)
        forecasted[sentiment] = forecast.predicted_mean
    return pd.DataFrame(forecasted)

# Forecast 1, 3, and 7 days ahead
forecast_1day_sarima = forecast_sarima(sarima_models, 1)
forecast_3days_sarima = forecast_sarima(sarima_models, 3)
forecast_7days_sarima = forecast_sarima(sarima_models, 7)

print("Forecast for 1 day ahead:")
print(forecast_1day_sarima)
print("\nForecast for 3 days ahead:")
print(forecast_3days_sarima)
print("\nForecast for 7 days ahead:")
print(forecast_7days_sarima)


In [None]:
# Combine forecasts into DataFrame
forecast_1day_array = forecast_1day_sarima.values
forecast_3days_array = forecast_3days_sarima.values
forecast_7days_array = forecast_7days_sarima.values


# Inverse transform the forecasted values
forecast_1day_sarima_rescaled = scaler.inverse_transform(forecast_1day_array)
forecast_3days_sarima_rescaled = scaler.inverse_transform(forecast_3days_array)
forecast_7days_sarima_rescaled = scaler.inverse_transform(forecast_7days_array)

#printing results

print("Forecast for 1 day ahead:")
print(forecast_1day_sarima_rescaled)
print("\nForecast for 3 days ahead:")
print(forecast_3days_sarima_rescaled)
print("\nForecast for 7 days ahead:")
print(forecast_7days_sarima_rescaled)



In [None]:
# Plotting the results in separate subplots

fig, axs = plt.subplots(3, 1, figsize=(12, 18), sharex=True)

# Define color palette for each sentiment
sentiment_colors = {
    'negative': '#6baed6',  # Blue
    'neutral': '#fd8d3c',   # Orange
    'positive': '#31a354'   # Green
}

# Plot actual values
for sentiment, color in sentiment_colors.items():
    for ax in axs:
        ax.plot(dates_new_int['date'], dates_new_int[sentiment], label=f'Actual ({sentiment})', color=color)

# Prepare dates for forecast periods
last_date = dates_new_int['date'].iloc[-1]
forecast_dates_1day = [last_date + pd.DateOffset(days=1)]# Generate forecast dates for 1 days
forecast_dates_3days = [last_date + pd.DateOffset(days=i) for i in range(1, 4)]# Generate forecast dates for 3 days
forecast_dates_7days = [last_date + pd.DateOffset(days=i) for i in range(1, 8)]# Generate forecast dates for 7 days

# Plot forecasted values
for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
    axs[0].plot(forecast_dates_1day, forecast_1day_sarima_rescaled[:, i], marker='o', markersize=8, label=f'1 Day Forecast ({sentiment})', color=sentiment_colors[sentiment])
    axs[1].plot(forecast_dates_3days, forecast_3days_sarima_rescaled[:, i], marker='o', markersize=8, label=f'3 Days Forecast ({sentiment})', color=sentiment_colors[sentiment])
    axs[2].plot(forecast_dates_7days, forecast_7days_sarima_rescaled[:, i], marker='o', markersize=8, label=f'7 Days Forecast ({sentiment})', color=sentiment_colors[sentiment])

# Set titles for subplots
axs[0].set_title('1 Day Forecasting Results')
axs[1].set_title('3 Days Forecasting Results')
axs[2].set_title('7 Days Forecasting Results')

# Set common labels
for ax in axs:
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


# Evaluation of the model

In [None]:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Define the number of folds for cross-validation
n_splits = 5  # Adjust as needed

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize dictionaries to store performance metrics
performance_metrics = {'MAE': {}, 'MSE': {}}

# Perform cross-validation for each sentiment category
for sentiment in best_hyperparameters:
    # Initialize lists to store performance metrics for each fold
    performance_metrics['MAE'][sentiment] = []
    performance_metrics['MSE'][sentiment] = []
    
    # Extract sentiment data
    data = train_data[:, i]  # Assuming train_data is the dataset
    
    # Perform cross-validation
    for train_index, test_index in tscv.split(data):
        # Split data into training and testing sets for the current fold
        train_data_fold = data[train_index]
        test_data_fold = data[test_index]
        
        # Fit SARIMA model to training data for the current fold
        model_sarima = SARIMAX(train_data_fold, order=best_hyperparameters[sentiment]['order'], 
                               seasonal_order=best_hyperparameters[sentiment]['seasonal_order'],
                               initialization='approximate_diffuse')
        results_sarima = model_sarima.fit(enforce_invertibility=True)
        
        # Forecast sentiment values for the testing set
        forecast = results_sarima.forecast(steps=len(test_data_fold))
        
        # Calculate forecast accuracy metrics for the current fold
        mae = np.mean(np.abs(forecast - test_data_fold))
        mse = np.mean((forecast - test_data_fold)**2)
        rmse = np.sqrt(mse)
        mape = np.mean(np.abs((test_data_fold - forecast) / test_data_fold)) * 100
        
        # Append performance metrics to lists
        performance_metrics['MAE'][sentiment].append(mae)
        performance_metrics['MSE'][sentiment].append(mse)
       
    

# Compute average performance metrics across all folds
avg_performance_metrics = {metric: {sentiment: np.mean(scores) for sentiment, scores in metrics.items()} 
                           for metric, metrics in performance_metrics.items()}

# Print average performance metrics
for metric, scores in avg_performance_metrics.items():
    print(f"Average {metric} Scores:")
    for sentiment, score in scores.items():
        print(f"{sentiment}: {score:.2f}")
    print()


# Applying LTSM Model

In [None]:
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# Define function to create sequences and labels for LSTM
def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i+seq_length])
        labels.append(data[i+seq_length])
    return np.array(sequences), np.array(labels)

# Define sequence length (number of time steps to look back)
seq_length = 7  

# Create sequences and labels for training
X_train, y_train = create_sequences(train_data, seq_length)

# Define a function to create the LSTM model
def create_model(batch_size, epochs, learning_rate, num_lstm_units):
    model = Sequential([
        LSTM(num_lstm_units, input_shape=(seq_length, 3)),
        Dense(3)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse', metrics=['accuracy'])
    return model

# Define the parameter grid
param_grid = {
    'batch_size': [32, 64, 128],
    'epochs': [50, 100, 150],
    'learning_rate': [0.01, 0.001, 0.0001],
    'num_lstm_units': [32, 64, 128]
}

# Perform Grid Search
best_accuracy = 0
best_params = None
for batch_size in param_grid['batch_size']:
    for epochs in param_grid['epochs']:
        for learning_rate in param_grid['learning_rate']:
            for num_lstm_units in param_grid['num_lstm_units']:
                model = create_model(batch_size, epochs, learning_rate, num_lstm_units)
                model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
                # Evaluate the model on validation set (if available)
                # val_accuracy = model.evaluate(X_val, y_val)[1]  # Assuming you have validation data
                # Uncomment the above line and replace X_val, y_val with your validation data
                # if you have a validation set to evaluate the model performance
                # For demonstration purpose, let's just use training accuracy as a proxy
                val_accuracy = model.evaluate(X_train, y_train)[1]
                print(f'Parameters: batch_size={batch_size}, epochs={epochs}, learning_rate={learning_rate}, num_lstm_units={num_lstm_units}, Accuracy: {val_accuracy}')
                if val_accuracy > best_accuracy:
                    best_accuracy = val_accuracy
                    best_params = {'batch_size': batch_size, 'epochs': epochs, 'learning_rate': learning_rate, 'num_lstm_units': num_lstm_units}


                    
                    
print("Best Parameters:", best_params)




Parameters: batch_size=128, epochs=150, learning_rate=0.001, num_lstm_units=128, Accuracy: 0.4693877696990967
Parameters: batch_size=128, epochs=150, learning_rate=0.0001, num_lstm_units=32, Accuracy: 0.3877550959587097
Parameters: batch_size=128, epochs=150, learning_rate=0.0001, num_lstm_units=64, Accuracy: 0.3877550959587097
Parameters: batch_size=128, epochs=150, learning_rate=0.0001, num_lstm_units=128, Accuracy: 0.3265306055545807
Best Parameters: {'batch_size': 64, 'epochs': 100, 'learning_rate': 0.01, 'num_lstm_units': 128}


In [None]:
# Define function to create sequences and labels for LSTM
def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i+seq_length])
        labels.append(data[i+seq_length])
    return np.array(sequences), np.array(labels)

# Define sequence length (number of time steps to look back)
seq_length = 7  # Example sequence length, tune as needed

# Create sequences and labels for training
X_train, y_train = create_sequences(train_data, seq_length)

# Define the best parameters obtained from grid search
best_params = {'batch_size': 64, 'epochs': 100, 'learning_rate': 0.01, 'num_lstm_units': 128}

# Build an LSTM model with the best parameters
model_lstm = Sequential([
    LSTM(best_params['num_lstm_units'], input_shape=(seq_length, 3)),
    Dense(3)
])

# Compile the model
model_lstm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_params['learning_rate']), loss='mse', metrics=['accuracy', "mse", "mae"])

# Train the model
history = model_lstm.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'])


# Evaluation of the Model

In [None]:
# Extract MAE and MSE values from history
mae_values = history.history['mae']
mse_values = history.history['mse']

# Calculate average MAE and MSE
avg_mae = np.mean(mae_values)
avg_mse = np.mean(mse_values)

# Find the epoch with the best accuracy
best_epoch = np.argmax(history.history['accuracy']) + 1
best_accuracy = history.history['accuracy'][best_epoch - 1]

print(f'Average MAE: {avg_mae}')
print(f'Average MSE: {avg_mse}')
print(f'Best Accuracy: {best_accuracy} at epoch {best_epoch}' )



In [None]:
# Create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Plot training loss
ax1.plot(history.history['loss'], label='Training Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Training Loss Over Epochs')
ax1.legend()

# Plot training accuracy
ax2.plot(history.history['accuracy'], label='Training Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.set_title('Training Accuracy Over Epochs')
ax2.legend()

# Display the plots
plt.tight_layout()
plt.show()


In [None]:
# Generate forecasts for 1 day, 3 days, and 7 days
forecast_1day = model_lstm.predict(np.expand_dims(X_train[-1], axis=0))
forecast_3days = model_lstm.predict(X_train[-3:])
forecast_7days = model_lstm.predict(X_train[-7:])

# Print forecasts
print("Forecast for 1 day ahead:")
print(forecast_1day)
print("\nForecast for 3 days ahead:")
print(forecast_3days)
print("\nForecast for 7 days ahead:")
print(forecast_7days)


In [None]:
# Function to forecast future values
def forecast(model, data, seq_length, days_ahead):
    forecasted = []
    current_seq = data[-seq_length:]  # Start with the last available sequence
    for _ in range(days_ahead):
        pred = model.predict(current_seq[np.newaxis, :, :]).flatten()
        forecasted.append(pred)
        current_seq = np.append(current_seq[1:], [pred], axis=0)
    return np.array(forecasted)

In [None]:
# Rescalind the forecasted values back to original scale
forecast_1day_rescaled = scaler.inverse_transform(forecast_1day)
forecast_3days_rescaled = scaler.inverse_transform(forecast_3days)
forecast_7days_rescaled = scaler.inverse_transform(forecast_7days)
print("Forecast for 1 day ahead:")
print(forecast_1day_rescaled)
print("\nForecast for 3 days ahead:")
print(forecast_3days_rescaled)
print("\nForecast for 7 days ahead:")
print(forecast_7days_rescaled)


In [None]:
# Plotting the results
plt.figure(figsize=(12, 6))

# Plot actual values
plt.plot(dates_new_int['date'], dates_new_int[['negative', 'neutral', 'positive']], label='Actual')

# Prepare dates for forecast periods
last_date = dates_new_int['date'].iloc[-1]
forecast_dates_1day = [last_date + pd.DateOffset(days=1)]
forecast_dates_3days = [last_date + pd.DateOffset(days=i) for i in range(1, 4)]
forecast_dates_7days = [last_date + pd.DateOffset(days=i) for i in range(1, 8)]

# Plot forecasted values
for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
    plt.plot(forecast_dates_1day, forecast_1day_rescaled[:, i], marker='o', markersize=8, label=f'1 Day Forecast ({sentiment})')
    plt.plot(forecast_dates_3days, forecast_3days_rescaled[:, i], marker='o', markersize=8, label=f'3 Days Forecast ({sentiment})')
    plt.plot(forecast_dates_7days, forecast_7days_rescaled[:, i], marker='o', markersize=8, label=f'7 Days Forecast ({sentiment})')

plt.title('Forecasting Results')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Plotting the results in separate subplots
fig, axs = plt.subplots(3, 1, figsize=(12, 18), sharex=True)

# Plot actual values
# Define color palette for each sentiment
sentiment_colors = {
    'negative': '#6baed6',  # Blue
    'neutral': '#fd8d3c',   # Orange
    'positive': '#31a354'   # Green
}

# Plot actual values
for sentiment, color in sentiment_colors.items():
    for ax in axs:
        ax.plot(dates_new_int['date'], dates_new_int[sentiment], label=f'Actual ({sentiment})', color=color)

# Prepare dates for forecast periods
last_date = dates_new_int['date'].iloc[-1]
forecast_dates_1day = [last_date + pd.DateOffset(days=1)]# Generate forecast dates for 1 days
forecast_dates_3days = [last_date + pd.DateOffset(days=i) for i in range(1, 4)]# Generate forecast dates for 3 days
forecast_dates_7days = [last_date + pd.DateOffset(days=i) for i in range(1, 8)]# Generate forecast dates for 7 days

# Plot forecasted values for 1 day ahead
for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
    axs[0].plot(forecast_dates_1day, forecast_1day_rescaled[:, i], marker='o', markersize=8, label=f'1 Day Forecast ({sentiment})', color=sentiment_colors[sentiment])

# Plot forecasted values for 3 days ahead
for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
    axs[1].plot(forecast_dates_3days, forecast_3days_rescaled[:, i], marker='o', markersize=8, label=f'3 Days Forecast ({sentiment})', color=sentiment_colors[sentiment])

# Plot forecasted values for 7 days ahead
for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
    axs[2].plot(forecast_dates_7days, forecast_7days_rescaled[:, i], marker='o', markersize=8, label=f'7 Days Forecast ({sentiment})', color=sentiment_colors[sentiment])

# Setting titles and labels for subplots
axs[0].set_title('1 Day Forecasting Results')
axs[1].set_title('3 Days Forecasting Results')
axs[2].set_title('7 Days Forecasting Results')

# Setting x-axis limits to include the forecast period
axs[0].set_xlim(dates_new_int['date'].iloc[0], forecast_dates_1day[-1])
axs[1].set_xlim(dates_new_int['date'].iloc[0], forecast_dates_3days[-1])
axs[2].set_xlim(dates_new_int['date'].iloc[0], forecast_dates_7days[-1])

# Setting common labels
for ax in axs:
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    ax.label_outer()  # Hide x labels and tick labels for top plots and y ticks for right plots.

axs[2].set_xlabel('Date')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Dashboard


In [None]:
import dash
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_core_components as dcc
import plotly.express as px
from dash.dependencies import Input, Output
import pandas as pd
import plotly.tools as tls
import plotly.graph_objects as go


In [None]:
df = tweets
# Count the occurrences of each TextBlob sentiment
blob_sentiment_counts = df['TextBlob sentiment'].value_counts().reset_index()
blob_sentiment_counts.columns = ['TextBlob sentiment', 'count']


vader_sentiment_counts = df['VADER sentiment'].value_counts().reset_index()
vader_sentiment_counts.columns = ['VADER sentiment', 'count']

df.head()


In [None]:
blob_sentiment_counts.head()

In [None]:
vader_sentiment_counts.head()

In [None]:
df1 = dates_new
df1.head()

In [None]:
df2 = dates_new_int
df2.head()

In [None]:
# Melt the DataFrame to long format
df1_melted = df1.melt(id_vars=["date"], var_name="sentiment", value_name="count")
df1_melted.head()

In [None]:
# Melt the DataFrame to long format
df2_melted = df2.melt(id_vars=["date"], var_name="sentiment", value_name="count")
df2_melted.head()

In [None]:
# Create a Dash application
app = dash.Dash(__name__)

# Define color mapping for sentiments
sentiment_colors = {"negative": "#EF553B", "neutral": "#636EFA", "positive": "#00CC96"}


# Define tab content
tab1_content = html.Div([
    html.H2("LSTM Model"),
    dcc.Dropdown(
        id='dropdown-tab1',
        options=[
            {'label': '1 Day Forecasting', 'value': 'opt1'},
            {'label': '3 Days Forecasting', 'value': 'opt2'},
            {'label': '7 Days Forecasting', 'value': 'opt3'}
        ],
        value='opt1'
    ),
    html.Div(id='output-tab1')
])

tab2_content = html.Div([
    html.H2("Sarima Model"),
    dcc.Dropdown(
        id='dropdown-tab2',
        options=[
           {'label': '1 Day Forecasting', 'value': 'opt4'},
           {'label': '3 Days Forecasting', 'value': 'opt5'},
           {'label': '7 Days Forecasting', 'value': 'opt6'}
        ],
        value='opt4'
    ),
    html.Div(id='output-tab2')
])


# Function to create forecast subplots
def create_forecast_subplot(forecast_dates, forecast_data, title):
    fig = go.Figure()
    
    # Plot original data
    for sentiment in ['negative', 'neutral', 'positive']:
        fig.add_trace(go.Scatter(x=df2_melted["date"], 
                                 y=df2_melted[df2_melted['sentiment'] == sentiment]["count"],
                                 mode='lines', 
                                 name=f'Original Data ({sentiment})',
                                 line=dict(color=sentiment_colors[sentiment])))
    
    # Plot forecasted data
    for i, sentiment in enumerate(['negative', 'neutral', 'positive']):
        fig.add_trace(go.Scatter(x=forecast_dates, 
                                 y=forecast_data[:, i], 
                                 mode='markers+lines', 
                                 name=f'{title} Forecast ({sentiment})',
                                 line=dict(color=sentiment_colors[sentiment]),
                                 marker=dict(color=sentiment_colors[sentiment])))
    
    # Update layout
    fig.update_layout(title=title,
                      xaxis_title="Date",
                      yaxis_title="Count",
                      legend_title="Sentiment",
                      template="plotly_white")
    return fig




#######Page1######## Graphs
# Create a Plotly bar graph for TextBlob
fig_blob = px.bar(
    blob_sentiment_counts,
    x='TextBlob sentiment',
    y='count',
    title='TextBlob Sentiment Counts',
    color='TextBlob sentiment',
    color_discrete_map=sentiment_colors
)

# Create a Plotly bar graph for Vader
fig_vader = px.bar(
    vader_sentiment_counts,
    x='VADER sentiment',
    y='count',
    title='Vader Sentiment Counts',
    color='VADER sentiment',
    color_discrete_map=sentiment_colors
)

#####################
#######Page2######## Graphs
# Create a Plotly bar graph for TextBlob

fig_counts_over_time = px.line(df1_melted, x="date", y="count", color="sentiment", title="Sentiment Analysis over Time Before Imputation",
              labels={"date": "Date", "count": "Count", "sentiment": "Sentiment"},
              color_discrete_map=sentiment_colors)


fig_counts_over_time_with_imputation = px.line(df2_melted, x="date", y="count", color="sentiment", title="Sentiment Analysis over Time After Imputation",
              labels={"date": "Date", "count": "Count", "sentiment": "Sentiment"},
              color_discrete_map=sentiment_colors)

#####################
#######Page3######## Graphs
# Create forecast graphs for each period for LSTM
fig_forecast_1day = create_forecast_subplot(forecast_dates_1day, forecast_1day_rescaled, "1 Day")
fig_forecast_3days = create_forecast_subplot(forecast_dates_3days, forecast_3days_rescaled, "3 Days")
fig_forecast_7days = create_forecast_subplot(forecast_dates_7days, forecast_7days_rescaled, "7 Days")


# Create forecast graphs for each period for SARIMA
fig_forecast_1day_sarima = create_forecast_subplot(forecast_dates_1day, forecast_1day_sarima_rescaled , "1 Day")
fig_forecast_3days_sarima = create_forecast_subplot(forecast_dates_3days, forecast_3days_sarima_rescaled , "3 Days")
fig_forecast_7days_sarima = create_forecast_subplot(forecast_dates_7days, forecast_7days_sarima_rescaled , "7 Days")



###########################################
# Create a Dash application with Bootstrap

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP], suppress_callback_exceptions=True)


# Define the sidebar layout
sidebar = html.Div(
    [
        dbc.NavLink(html.Img(src='https://www.cct.ie/wp-content/uploads/CCT_Logo_New_Aug_17-578x200.png', style={'width': '230px', 'height': 'auto', 'margin': '0 0 0 -15px'}), href="/sentiment-analysis", id="home-link", className="nav-link"),  # Link to sentiment analysis page
        html.Hr(),
        dbc.Nav(
            [
                dbc.NavLink("Sentiment Analysis", href="/sentiment-analysis", id="page-1-link", active="exact"),
                dbc.NavLink("Sentiment Time Series", href="/time-series", id="page-2-link", active="exact"),
                dbc.NavLink("Forecast Models", href="/forecast-models", id="page-3-link", active="exact"),
            ],
            vertical=True,
            pills=True,
        ),
        html.Footer(
            [
                html.P("Master in Data Analytics - CA2 2024", style={"text-align": "center", "margin-bottom": "10px"}),
                html.P("Wendy Paola Espinoza Potoy", style={"text-align": "center", "font-size": "10px", "margin": "0"}),
                html.P("Student ID: 2021133", style={"text-align": "center", "font-size": "10px", "margin": "0"})
            ],
            style={"background-color": "#f8f9fa", "padding": "10px", "margin-top": "auto"}
        )
    ],
    style={"display": "flex", "flex-direction": "column", "min-height": "100vh", "position": "fixed", "top": 0, "left": 0, "bottom": 0, "width": "14rem", "padding": "0.5rem 0.5rem", 'background-color': '#f8f9fa', 'box-shadow': '2px 0 5px rgba(0, 0, 0, 0.1)'},
)

# Define the content layout
content = html.Div(id="page-content", style={"margin-left": "16rem", "padding": "1rem"})

# Define the callback to update the page content based on the URL
@app.callback(
    Output("page-content", "children"),
    [Input("url", "pathname")]
)
def render_page_content(pathname):
    if pathname == "/sentiment-analysis":
        return [
            html.H2("Sentiment Analysis"),
            html.Div("Visualization of sentiment values in the dataset."),
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='sentiment-bar-chart-1', figure=fig_blob), width=6),
                    dbc.Col(dcc.Graph(id='sentiment-bar-chart-2', figure=fig_vader), width=6)
                ]
            )
        ]
    elif pathname == "/time-series":
        return [
            html.H2("Sentiment Count over time Analysis"),
            html.Div("Visualization of sentiment values in the dataset."),
            dbc.Col(dcc.Graph(id='sentiment-bar-chart-1', figure=fig_counts_over_time)),
            dbc.Col(dcc.Graph(id='sentiment-bar-chart-2', figure=fig_counts_over_time_with_imputation)),
        ]
    elif pathname == "/forecast-models":
        return [ 
            dcc.Tabs([
                dcc.Tab(label='LSTM Model', children=tab1_content),
                dcc.Tab(label='Sarima Model', children=tab2_content),
            ])
        ]

    # If the URL is invalid, return a 404 error
    return html.H1("404 - Not Found")



# Define callbacks
@app.callback(
    Output('output-tab1', 'children'),
    [Input('dropdown-tab1', 'value')]
)

def update_output_tab1(selected_value):
    if selected_value == 'opt1':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='1day_forecast', figure=fig_forecast_1day))
                ]
            )
        ]
    elif selected_value == 'opt2':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='3day_forecast', figure=fig_forecast_3days))
                ]
            )
        ]
    elif selected_value == 'opt3':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='7days_forecast', figure=fig_forecast_7days))
                ]
            )
        ]
    else:
        return None

@app.callback(
    Output('output-tab2', 'children'),
    [Input('dropdown-tab2', 'value')]
)
def update_output_tab2(selected_value):
    if selected_value == 'opt4':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='1day_forecast_sarima', figure=fig_forecast_1day_sarima))
                ]
            )
        ]
    elif selected_value == 'opt5':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='1day_forecast', figure=fig_forecast_3days_sarima))
                ]
            )
        ]
    elif selected_value == 'opt6':
        return [
            dbc.Row(
                [
                    dbc.Col(dcc.Graph(id='1day_forecast', figure=fig_forecast_7days_sarima))
                ]
            )
        ]
    else:
        return None
    
    
    
# Define the layout of the app
app.layout = html.Div([
    dcc.Location(id="url", refresh=False, pathname="/sentiment-analysis"),  # Set default URL
    sidebar,
    content
], id="wrapper")

# Run the app
if __name__ == '__main__':
    #app.run(debug=False, host='127.0.0.1', port=8052)
    app.run_server(debug=True, host='127.0.0.1', port=8051)