# Step 1: Importing Required Libraries


# In [None]:

# Basic libraries for data manipulation
import numpy as np
import pandas as pd

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Machine Learning and Deep Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK data used below: tokenizer models, the stop-word list and
# WordNet for lemmatization. 'punkt_tab' is requested in addition to 'punkt'
# because newer NLTK releases load the tokenizer tables from that package;
# without it word_tokenize can fail with a LookupError.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Step 2: Load the Dataset


# In [None]:

# Load the dataset. The path below is a placeholder: point it at the actual
# location of 'Bitcoin_tweets.csv' before running.
df = pd.read_csv('/path/to/Bitcoin_tweets.csv')
print(df.head())

# Report the number of missing values per column
print(df.isnull().sum())

# Drop only the rows that lack the text or the label. A bare dropna() would
# also discard rows whose missing values sit in columns this pipeline never
# reads, which can throw away usable training data.
df = df.dropna(subset=['tweet', 'dominant_emotion'])


# Step 3: Data Preprocessing (Tokenization, Lemmatization, Stop Words Removal)


# In [None]:

# Shared preprocessing helpers: the English stop-word set (a set, so that
# membership checks are cheap) and the WordNet lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Normalize one piece of raw text into a space-separated string of lemmas
def preprocess_text(text):
    """Lower-case, tokenize, filter and lemmatize ``text``.

    Tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are discarded. The remaining tokens are lemmatized
    with the module-level ``lemmatizer`` and joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers and mixed alphanumeric tokens
        if not token.isalpha():
            continue
        # Skip stop words
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Clean every tweet and keep the result in a new 'cleaned_text' column
cleaned_tweets = df['tweet'].map(preprocess_text)
df['cleaned_text'] = cleaned_tweets

# Show the first few cleaned tweets as a sanity check
print(df['cleaned_text'].head())


# Step 4: Tokenization and Padding


# In [None]:

# Fit the word-index vocabulary on the cleaned tweets (num_words=5000 caps
# the vocabulary used when converting texts to sequences)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_text'])

# Convert each tweet to a sequence of word indices, then pad/truncate every
# sequence to exactly 100 entries so they can be batched together
X = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(X, maxlen=100)

# Encode the target variable (emotion or sentiment column) as integer ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['dominant_emotion'])

# One-hot encode the integer ids. Building the matrix from the encoder's own
# ids guarantees that column i corresponds to label_encoder.classes_[i], and
# the explicit float32 dtype avoids handing boolean targets to the loss
# (pd.get_dummies returns bool columns on recent pandas versions).
y = np.eye(len(label_encoder.classes_), dtype='float32')[encoded_labels]


# Step 5: Train-Test Split


# In [None]:

# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


# Step 6: Build the LSTM-GRU Hybrid Model


# In [None]:

# Build the LSTM-GRU model as a single stack of layers
model = Sequential([
    # Embedding layer: input_dim and input_length must stay in sync with the
    # Tokenizer's num_words (5000) and the pad_sequences maxlen (100)
    Embedding(input_dim=5000, output_dim=200, input_length=100),

    # LSTM layer; return_sequences=True passes the full sequence on so the
    # GRU below receives one vector per time step
    LSTM(units=128, return_sequences=True),
    Dropout(0.5),

    # GRU layer reduces the sequence to a single vector
    GRU(units=64),

    # Fully connected layer
    Dense(16, activation='relu'),

    # Output layer: one softmax unit per one-hot label column
    Dense(y.shape[1], activation='softmax'),
])

# Compile the model for multi-class classification on one-hot targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None" line)
model.summary()


# Step 7: Model Training


# In [None]:

# Train the model. Validation metrics are computed on a 10% slice held out of
# the training data rather than on the test set, so the test set stays unseen
# until the final evaluation and its score remains an independent estimate.
# The returned history still exposes the 'val_accuracy' / 'val_loss' series.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
)


# Step 8: Evaluate the Model


# In [None]:

# Score the trained model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Draw the training curves side by side: accuracy on the left, loss on the
# right, each with its training and validation series
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

for ax, (metric, title) in zip(axes, panels):
    ax.plot(history.history[metric], label=f'Train {title}')
    ax.plot(history.history[f'val_{metric}'], label=f'Val {title}')
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(title)
    ax.legend()

plt.show()


# Step 9: Save the Model


# In [None]:

# Persist the trained model to disk under a fixed file name
MODEL_PATH = 'lstm_gru_cryptocurrency_tweet_model.h5'
model.save(MODEL_PATH)


# Step 10: Predict and Visualize Results


# In [None]:

from sklearn.metrics import confusion_matrix, classification_report

# Make predictions on the test set and collapse both the predicted
# probabilities and the one-hot targets to integer class ids
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Pin the full label set explicitly. Without it, a class that happens to be
# absent from the test split shrinks the inferred label set, which makes
# classification_report reject target_names for having the wrong length and
# leaves the confusion matrix with fewer rows/columns than there are classes.
class_names = [str(name) for name in label_encoder.classes_]
class_ids = np.arange(len(class_names))

# Confusion Matrix, drawn on its own figure with readable class labels
cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification Report
print(classification_report(y_true, y_pred_classes, labels=class_ids, target_names=class_names))
