In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the input CSV files. The original machine-specific
# location stays the default so existing runs behave exactly as before; set the
# CRYPTO_TWEET_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get('CRYPTO_TWEET_DATA_DIR',
                               'C:/Users/HP/crypto-tweet-analysis/data'))

# Load the two datasets
tweets_df = pd.read_csv(DATA_DIR / 'Bitcoin_tweets.csv')
market_data = pd.read_csv(DATA_DIR / 'modified_cryptocurrency_prices.csv')

# Convert the 'date' columns to datetime so both frames share a comparable key
tweets_df['date'] = pd.to_datetime(tweets_df['date'])
market_data['date'] = pd.to_datetime(market_data['date'])

# merge_asof requires both frames to be sorted on the join key
tweets_df = tweets_df.sort_values('date')
market_data = market_data.sort_values('date')

# As-of join on 'date': with the default direction ('backward') each tweet is
# paired with the latest market row whose date is not after the tweet's date.
# You may need to round or align them to a common time frame if necessary
merged_data = pd.merge_asof(tweets_df, market_data, on='date')

# Display the first few rows of the merged data
print(merged_data.head())

      id       date                                               text  \
0      2 2018-03-23  RT @tippereconomy: Another use case for #block...   
1   8167 2018-03-23  @akbar_ohi @YoustockProject This article expla...   
2  38675 2018-03-23  RT @Rubingh: In the world of hype, #blockchain...   
3  28785 2018-03-23  Name: INS Ecosystem\nSymbol: INS\n24 hour chan...   
4   8745 2018-03-23  Crypto Collectibles Are Worthless Without a We...   

       Screen_name                                        Source  \
0     hojachotopur  [u'blockchain', u'Tipper', u'TipperEconomy']   
1      JapaMahatma                                            []   
2        racrozier                               [u'blockchain']   
3  moneyblockchain                                            []   
4   CoinbeagleNews                                            []   

                                                Link     Sentiment  \
0  <a href="http://twitter.com" rel="nofollow">Tw...  ['positive']   
1  <a 

In [3]:
import re

# Function to clean the tweet text
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop #hashtags
    (the whole tag, not just the '#'), drop every character that is not an
    ASCII letter or whitespace, then collapse runs of whitespace.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN/None that stands in for a missing value) is treated as an
            empty tweet instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned string; '' when nothing survives cleaning or the input
        is not text.
    """
    # Guard against missing values so one empty row cannot abort a whole .apply()
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters (punctuation, numbers, etc.)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Run clean_text over every entry of the 'text' column and keep the result
# next to the original in a new 'cleaned_tweet' column
tweets_df['cleaned_tweet'] = tweets_df['text'].map(clean_text)

# Show raw and cleaned text side by side for the first few rows
print(tweets_df.head()[['text', 'cleaned_tweet']])

                                                    text  \
0      RT @tippereconomy: Another use case for #block...   
45195  @akbar_ohi @YoustockProject This article expla...   
45196  RT @Rubingh: In the world of hype, #blockchain...   
45197  Name: INS Ecosystem\nSymbol: INS\n24 hour chan...   
45198  Crypto Collectibles Are Worthless Without a We...   

                                           cleaned_tweet  
0      rt another use case for and the can unseat fac...  
45195  this article explains a little about what im d...  
45196  rt in the world of hype ranks highly partly be...  
45197  name ins ecosystem symbol ins hour change pric...  
45198  crypto collectibles are worthless without a we...  


In [5]:
from nltk.tokenize import word_tokenize
import nltk

# Split each cleaned tweet into a list of word tokens with NLTK's word_tokenize
tweets_df['tokenized_tweet'] = tweets_df['cleaned_tweet'].map(word_tokenize)

# Show the cleaned text next to its token list for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'tokenized_tweet']])


                                           cleaned_tweet  \
0      rt another use case for and the can unseat fac...   
45195  this article explains a little about what im d...   
45196  rt in the world of hype ranks highly partly be...   
45197  name ins ecosystem symbol ins hour change pric...   
45198  crypto collectibles are worthless without a we...   

                                         tokenized_tweet  
0      [rt, another, use, case, for, and, the, can, u...  
45195  [this, article, explains, a, little, about, wh...  
45196  [rt, in, the, world, of, hype, ranks, highly, ...  
45197  [name, ins, ecosystem, symbol, ins, hour, chan...  
45198  [crypto, collectibles, are, worthless, without...  


In [7]:
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
# One shared WordNetLemmatizer instance; lemmatize_tokens below reads it as a global.
lemmatizer = WordNetLemmatizer()

# Function to apply lemmatization
def lemmatize_tokens(tokens):
    """Return a new list holding the lemma of every token, in order.

    Each token is passed to the module-level ``lemmatizer`` on its own,
    without an explicit part-of-speech argument. The input is not modified.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Lemmatize every token list and store the result in a new column
tweets_df['lemmatized_tweet'] = tweets_df['tokenized_tweet'].map(lemmatize_tokens)

# Compare the tokens before and after lemmatization for the first few rows
print(tweets_df.head()[['tokenized_tweet', 'lemmatized_tweet']])

                                         tokenized_tweet  \
0      [rt, another, use, case, for, and, the, can, u...   
45195  [this, article, explains, a, little, about, wh...   
45196  [rt, in, the, world, of, hype, ranks, highly, ...   
45197  [name, ins, ecosystem, symbol, ins, hour, chan...   
45198  [crypto, collectibles, are, worthless, without...   

                                        lemmatized_tweet  
0      [rt, another, use, case, for, and, the, can, u...  
45195  [this, article, explains, a, little, about, wh...  
45196  [rt, in, the, world, of, hype, rank, highly, p...  
45197  [name, in, ecosystem, symbol, in, hour, change...  
45198  [crypto, collectible, are, worthless, without,...  


In [9]:
from nltk.corpus import stopwords

# Define stop words
# NLTK's English stop-word list, held as a set so remove_stopwords below can
# test membership with a hash lookup instead of scanning a list.
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words``.

    Order and duplicates of the surviving tokens are preserved; the input
    list is left untouched.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Drop stop words from every lemmatized token list
tweets_df['final_tokens'] = tweets_df['lemmatized_tweet'].map(remove_stopwords)

# Compare the token lists before and after stop-word removal
print(tweets_df.head()[['lemmatized_tweet', 'final_tokens']])

                                        lemmatized_tweet  \
0      [rt, another, use, case, for, and, the, can, u...   
45195  [this, article, explains, a, little, about, wh...   
45196  [rt, in, the, world, of, hype, rank, highly, p...   
45197  [name, in, ecosystem, symbol, in, hour, change...   
45198  [crypto, collectible, are, worthless, without,...   

                                            final_tokens  
0      [rt, another, use, case, unseat, facebook, cha...  
45195                    [article, explains, little, im]  
45196  [rt, world, hype, rank, highly, partly, hard, ...  
45197  [name, ecosystem, symbol, hour, change, price,...  
45198  [crypto, collectible, worthless, without, webs...  


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer
# The vocabulary is learned from the already-tokenized 'final_tokens' lists;
# num_words=5000 caps the vocabulary used when texts are turned into sequences.
tokenizer = Tokenizer(num_words=5000)  # Set max vocabulary size
tokenizer.fit_on_texts(tweets_df['final_tokens'])

# Convert tokens to sequences
# Each row's token list becomes a list of integer word indices.
tweets_df['sequences'] = tokenizer.texts_to_sequences(tweets_df['final_tokens'])

# Pad sequences to ensure uniform length (padding at the end)
# maxlen=100 fixes every row of X at 100 entries; shorter rows are filled with
# zeros on the right, as the printed sample shows.
X = pad_sequences(tweets_df['sequences'], maxlen=100, padding='post')

# View the final sequences
print(X[:5])

[[   1  180   88  147  842  472    8  407   74   48    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [  45  288  199  121    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0

In [13]:
from textblob import TextBlob

# Function to get the sentiment polarity of a tweet
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text`` (between -1 and 1)."""
    return TextBlob(text).sentiment.polarity

# Score the polarity of every cleaned tweet and keep it in a 'sentiment' column
tweets_df['sentiment'] = tweets_df['cleaned_tweet'].map(get_sentiment)

# Label tweets as Positive, Negative, or Neutral based on polarity score
def label_sentiment(polarity):
    """Map a polarity score to a coarse sentiment label.

    Returns 'Positive' for scores above zero, 'Negative' for scores below
    zero, and 'Neutral' for everything else (exactly zero, or a value such
    as NaN that compares neither above nor below zero).
    """
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Turn each polarity score into its Positive / Negative / Neutral label
tweets_df['sentiment_label'] = tweets_df['sentiment'].map(label_sentiment)

# Show the cleaned text with its score and label for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'sentiment', 'sentiment_label']])

                                           cleaned_tweet  sentiment  \
0      rt another use case for and the can unseat fac...   0.136364   
45195  this article explains a little about what im d...  -0.187500   
45196  rt in the world of hype ranks highly partly be...  -0.065833   
45197  name ins ecosystem symbol ins hour change pric...  -0.400000   
45198  crypto collectibles are worthless without a we...  -0.800000   

      sentiment_label  
0            Positive  
45195        Negative  
45196        Negative  
45197        Negative  
45198        Negative  


In [15]:
# Function to get subjectivity
def get_subjectivity(text):
    """Return the TextBlob subjectivity of ``text``: 0 (objective) to 1 (subjective)."""
    return TextBlob(text).sentiment.subjectivity

# Score the subjectivity of every cleaned tweet
tweets_df['subjectivity'] = tweets_df['cleaned_tweet'].map(get_subjectivity)

# Show the cleaned text with its subjectivity score for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'subjectivity']])

                                           cleaned_tweet  subjectivity
0      rt another use case for and the can unseat fac...      0.500000
45195  this article explains a little about what im d...      0.500000
45196  rt in the world of hype ranks highly partly be...      0.540833
45197  name ins ecosystem symbol ins hour change pric...      0.825000
45198  crypto collectibles are worthless without a we...      0.900000


In [None]:
import pandas as pd
from transformers import pipeline

# Load the pre-trained emotion detection model using Hugging Face
# NOTE(review): newer transformers releases deprecate return_all_scores in
# favour of top_k=None; confirm against the installed version before switching.
emotion_classifier = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base', return_all_scores=True)

# Define batch size for processing
batch_size = 5000  # Adjust the batch size based on your system's capabilities
# Ceiling division. The previous `len(tweets_df) // batch_size + 1` scheduled
# one extra, empty batch whenever the row count was an exact multiple of
# batch_size (or zero), sending an empty list to the classifier.
num_batches = (len(tweets_df) + batch_size - 1) // batch_size

# Initialize an empty list to store results
all_emotion_scores = []

# Process the data in batches
for i in range(num_batches):
    # Positional slice, so the batches follow the frame's current row order
    batch_texts = tweets_df['cleaned_tweet'].iloc[i*batch_size : (i+1)*batch_size].tolist()

    # Apply emotion detection on the batch
    batch_emotions = emotion_classifier(batch_texts)

    # Append the results for the batch (one entry per tweet)
    all_emotion_scores.extend(batch_emotions)

# Add the emotion scores to the DataFrame; the list is assigned positionally,
# so it must contain exactly one entry per row
tweets_df['emotion_scores'] = all_emotion_scores

# Function to extract key emotions
def extract_key_emotions(scores):
    """Flatten classifier output into a ``{label: score}`` dictionary.

    ``scores`` is an iterable of dicts that each carry a 'label' and a
    'score' key. If a label occurs more than once, the last score wins.
    """
    return {entry['label']: entry['score'] for entry in scores}

# Collapse each tweet's raw classifier output into a {label: score} dictionary
tweets_df['key_emotions'] = tweets_df['emotion_scores'].map(extract_key_emotions)

# Show the cleaned text with its emotion scores for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'key_emotions']])

# Persist the enriched frame (all columns, without the index) to a CSV file
tweets_df.to_csv('tweets_with_emotions_optimized.csv', index=False)


In [None]:
from transformers import pipeline

# Load a pre-trained emotion detection model using Hugging Face
# NOTE(review): this is the same pipeline(...) call as in the batched emotion
# cell earlier in the notebook; running both cells builds the classifier twice.
emotion_classifier = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base', return_all_scores=True)

# Function to get emotion scores
def get_emotions(text):
    """Classify one text and return the first element of the classifier result.

    Presumably that element is the list of per-label score dicts, since the
    module-level ``emotion_classifier`` is built with return_all_scores=True.
    """
    return emotion_classifier(text)[0]

# Classify every cleaned tweet one at a time and store the raw scores
tweets_df['emotion_scores'] = tweets_df['cleaned_tweet'].map(get_emotions)

# Extract key emotions
def extract_key_emotions(scores):
    """Build a ``{label: score}`` dictionary from a sequence of score dicts.

    Every element of ``scores`` must provide a 'label' and a 'score' key;
    a repeated label keeps the score seen last. This repeats the definition
    from the batched emotion cell earlier in the notebook.
    """
    return dict((item['label'], item['score']) for item in scores)

# Collapse each tweet's raw classifier output into a {label: score} dictionary
tweets_df['key_emotions'] = tweets_df['emotion_scores'].map(extract_key_emotions)

# Show the cleaned text with its emotion scores for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'key_emotions']])




In [23]:
# Function to extract the most prominent emotion
def get_top_emotion(emotions):
    """Return the label with the highest score in an ``{label: score}`` dict.

    On a tie the label that appears first in the dictionary is returned.
    An empty dictionary raises ``ValueError`` (from ``max``).
    """
    top_label, _ = max(emotions.items(), key=lambda pair: pair[1])
    return top_label

# Pick the highest-scoring emotion label for every tweet
tweets_df['dominant_emotion'] = tweets_df['key_emotions'].map(get_top_emotion)

# Show the cleaned text with its dominant emotion for the first few rows
print(tweets_df.head()[['cleaned_tweet', 'dominant_emotion']])


                                           cleaned_tweet dominant_emotion
25808  rt big congratulations to the organisers of th...              joy
22468  rt we already have an ios prototype of the bet...          neutral
17938  rt cryptonsleeps icx krw trading market just w...          neutral
15084  rt hi everybody we want to remind you that onl...          neutral
23325                       rt join the gym rewards beta          neutral


In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout

# Hyper-parameters of the classifier. max_len and vocab_size repeat the
# maxlen=100 and num_words=5000 values used when the padded sequences were
# built earlier in the notebook.
max_len = 100  # Example input sequence length
vocab_size = 5000  # Example vocabulary size
embedding_dim = 128  # Size of the embedding vectors

# LSTM-GRU stack, declared as one ordered list of layers:
#   word indices -> embeddings -> LSTM (full sequence) -> GRU (last state) -> 3-way softmax
model = Sequential([
    # input_dim matches the vocabulary size, output_dim is the embedding size
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    # return_sequences=True hands the GRU one vector per time step
    LSTM(units=128, return_sequences=True),
    Dropout(0.3),  # regularization
    GRU(units=64),
    Dropout(0.3),  # regularization
    # Assuming 3 classes for sentiment (positive, neutral, negative)
    Dense(units=3, activation='softmax'),
])

# Adam optimizer with categorical crossentropy loss for multi-class classification
# NOTE(review): this loss presumably needs one-hot encoded targets; confirm when labels are prepared
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the summary of the model architecture
model.summary()


