In [None]:
import pandas as pd
import numpy as np
import os
import json
import csv
import re

# Removing noise from the tweet data, such as hashtags, mentions, links, and non-ASCII characters

In [None]:

def contains_https_link(tweet):
    """Return True when the tweet text contains the substring ``https://``.

    Only secure links are detected; a plain ``http://`` link yields False.
    """
    match = re.search(r'https://', tweet)
    return match is not None

def clean_tweet(tweet):
    """Strip links, hashtags, mentions and non-ASCII characters from a tweet.

    Args:
        tweet: Raw tweet text.

    Returns:
        The tweet with noise removed. Letters, digits and ASCII punctuation
        are left untouched.
    """
    # Remove URLs first: a URL may itself contain '#' or '@', and stripping those
    # first would split the URL and make the URL pattern swallow the next word.
    all_text = re.sub(r'https?://\S+', '', tweet)
    # Hashtags and mentions are removed together with one trailing whitespace
    # character, or up to the end of the string when they are the last token.
    all_text = re.sub(r'#\S*(?:\s|$)', '', all_text)
    all_text = re.sub(r'@\S*(?:\s|$)', '', all_text)
    # NOTE: the previous implementation also ran re.sub("W+", "", ...), which deleted
    # every capital letter 'W' (e.g. "World" -> "orld"). The likely intended r"\W+"
    # would have glued all words together, so the step is dropped here.
    all_text = re.sub(r'[^\x00-\x7F]+', '', all_text)
    return all_text

from datetime import datetime

def standard_time(timeST):
    """Convert an ISO-like timestamp string into seconds elapsed since midnight.

    The input must match ``%Y-%m-%dT%H:%M:%S.%fZ`` (for example
    ``2023-05-01T14:24:27.000Z``); the date part and the fractional seconds
    are parsed but do not contribute to the result.
    """
    parsed = datetime.strptime(timeST, "%Y-%m-%dT%H:%M:%S.%fZ")
    seconds_from_hours = parsed.hour * 3600
    seconds_from_minutes = parsed.minute * 60
    return seconds_from_hours + seconds_from_minutes + parsed.second

def like_to_label(likes):
    """Bucket a like count into an ordinal class label from 0 to 4.

    Each label corresponds to an order of magnitude: fewer than 10 likes maps
    to 0, 10-99 to 1, 100-999 to 2, 1000-9999 to 3 and 10000 or more to 4.
    """
    # Checked from the highest bucket down; the first threshold reached wins.
    buckets = ((10000, 4), (1000, 3), (100, 2), (10, 1))
    for threshold, label in buckets:
        if likes >= threshold:
            return label
    return 0

# Importing Data From File

In [None]:
# Load the raw tweet dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../Datasets/tweet_data3.csv')
# Show the columns as loaded, before the derived columns below are added.
print(df.columns)

# Clean the tweet column
# NOTE: despite its name, 'embedded_video' is True whenever the raw text contains "https://".
df['embedded_video'] = df['Tweet Text'].apply(contains_https_link)
# Tweet text with hashtags, mentions, links and non-ASCII characters stripped.
df['cleaned_tweet'] = df['Tweet Text'].apply(clean_tweet)
# Posting time expressed as seconds since midnight (the date itself is discarded).
df['converted_time'] = df['Time of Tweet'].apply(standard_time)
# Ordinal class 0-4 derived from the like count; used later as the training target.
df['label'] = df['Like Count'].apply(like_to_label)

# Save the cleaned data to a new CSV file
# df.to_csv('cleaned_dataset.csv', index=False)

Index(['Tweet Text', 'Like Count', 'Followers Count', 'Time of Tweet'], dtype='object')


# Creating Stop Words

In [None]:
import spacy
import nltk

from nltk.corpus import stopwords
# English stop words shipped with NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stopwords1 = stopwords.words('english')

# Load the large English spaCy pipeline; `en` is reused later to embed keywords.
en = spacy.load("en_core_web_lg")
stopwords2 = en.Defaults.stop_words

# Concatenate both sources into one list. No de-duplication is performed,
# so a word present in both sources appears twice.
stop_words = stopwords1 + list(stopwords2)
# Prints the complete combined list (long output).
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Extracting Keywords Using the RAKE Algorithm

In [None]:
from collections import Counter

def preprocess_text(text):
    """Lower-case the text and drop every character that is not a letter or whitespace.

    Digits, punctuation and other special characters are deleted outright
    (not replaced by a space), so "covid-19" becomes "covid".
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

def calculate_word_scores(sentences):
    """Score every word as its accumulated degree divided by its frequency.

    For each fragment in ``sentences`` the words of two or more characters are
    collected. A word's frequency is the number of times it occurs overall;
    its degree grows, per occurrence, by the number of other words in the
    same fragment.

    Returns:
        A ``Counter`` mapping each word to ``degree / frequency``.
    """
    frequency = Counter()
    degree = Counter()
    for fragment in sentences:
        # Single-character tokens are ignored entirely.
        tokens = [tok for tok in re.findall(r'\w+', fragment) if len(tok) > 1]
        neighbours = len(tokens) - 1
        frequency.update(tokens)
        for tok in tokens:
            degree[tok] += neighbours

    scores = Counter()
    for word, occurrences in frequency.items():
        scores[word] = degree[word] / occurrences
    return scores

def calculate_phrase_scores(sentences, word_scores):
    """Score each candidate phrase as the sum of the scores of its words.

    Args:
        sentences: Text fragments, each treated as one candidate phrase.
        word_scores: Mapping from word to score, as produced by
            ``calculate_word_scores``.

    Returns:
        A ``Counter`` mapping the normalised phrase (its words of two or more
        characters joined by single spaces) to its score. Fragments that
        contain no such word are skipped, so the empty string is never
        returned as a phrase.
    """
    phrase_scores = Counter()
    for sentence in sentences:
        # Keep only words of two or more characters (same rule as the word scorer).
        words = [word for word in re.findall(r'\w+', sentence) if len(word) > 1]
        if not words:
            # Previously this stored a '' key with score 0, i.e. an empty "keyword".
            continue
        phrase_scores[' '.join(words)] = sum(word_scores[word] for word in words)
    return phrase_scores

def extract_keywords(text, num_keywords=5):
    """Extract the top-scoring phrases and words from ``text``.

    The text is first cut at sentence punctuation (``.``, ``!``, ``?``); each
    piece is then normalised with ``preprocess_text`` and cut again at every
    stop word from the module-level ``stop_words`` list. The resulting
    fragments are the candidate phrases.

    Args:
        text: Input text.
        num_keywords: Maximum number of phrases and of words to return.

    Returns:
        A tuple ``(keywords, most_words)``; each element is a list of
        ``(string, score)`` pairs sorted by descending score.
    """
    stop_word_pattern = r'(?:\s|^)(?:{})\b'.format('|'.join(map(re.escape, stop_words)))

    sentences = []
    # Split on sentence punctuation BEFORE preprocessing: preprocess_text deletes
    # punctuation, so splitting afterwards (as before) could never find a delimiter.
    for raw_sentence in re.split(r'[.!?]', text):
        normalised = preprocess_text(raw_sentence)
        sentences.extend(re.split(stop_word_pattern, normalised))
    sentences = [sentence for sentence in sentences if sentence.strip()]

    word_scores = calculate_word_scores(sentences)
    phrase_scores = calculate_phrase_scores(sentences, word_scores)
    keywords = phrase_scores.most_common(num_keywords)
    most_words = word_scores.most_common(num_keywords)
    return keywords, most_words

text = "BOMBSHELL Proof COVID Antiviral Pill Molnupirivar By Merck Causes SARS-CoV-2 Mutations"
keywords, words = extract_keywords(text)
print("Top keywords:", keywords)
print("Top Words: ", words)


Top keywords: [('bombshell proof covid antiviral pill molnupirivar', 30.0), ('merck causes sarscov mutations', 12.0)]
Top Words:  [('bombshell', 5.0), ('proof', 5.0), ('covid', 5.0), ('antiviral', 5.0), ('pill', 5.0)]


# Converting Top Keywords to Embeddings

In [None]:
def keyword_to_embeddings(top_keywords, num_keywords=5, embedding_dim=300):
    """Turn scored keywords into a fixed-length list of embedding vectors.

    Args:
        top_keywords: Sequence of ``(keyword, score)`` pairs; only the keyword
            string (element 0) is used.
        num_keywords: Exact number of vectors to return. Extra keywords are
            dropped and missing ones are padded with zero vectors, so every
            tweet yields the same shape.
        embedding_dim: Length of the zero vectors used for padding and for
            keywords without a vector. It should match the vector width of
            the spaCy pipeline ``en``.

    Returns:
        A list of exactly ``num_keywords`` NumPy vectors.
    """
    word_embeddings = []
    # Truncate so that an over-long input cannot produce a ragged result.
    for keyword in list(top_keywords)[:num_keywords]:
        doc = en(keyword[0])
        if doc.has_vector:
            word_embeddings.append(doc.vector)
        else:
            word_embeddings.append(np.zeros(embedding_dim))

    # Pad with zero vectors up to the fixed length.
    while len(word_embeddings) < num_keywords:
        word_embeddings.append(np.zeros(embedding_dim))

    return word_embeddings

#print(keyword_to_embeddings([('bombshell', 5.0), ('proof', 5.0), ('covid', 5.0)]))

In [None]:
# Build one list of keyword embeddings per cleaned tweet.
keyword_embeddings = []
count = 0
for count, tweet in enumerate(df['cleaned_tweet'], start=1):
    # Only the top single words are embedded; the phrase ranking is discarded.
    _, tweet_top_keywords = extract_keywords(tweet)
    embeddings = keyword_to_embeddings(tweet_top_keywords)
    keyword_embeddings.append(embeddings)
    # Progress indicator: report every 1000 processed tweets.
    if count % 1000 == 0:
        print(count)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000


In [None]:
# Inspect the embeddings built for the first tweet (a list of per-keyword vectors).
print(keyword_embeddings[0])

[array([-1.2047e+00, -1.8841e+00, -4.1402e+00, -3.0751e+00,  1.8830e+00,
       -1.8872e+00,  2.5263e-01,  5.3627e+00, -1.7144e+00, -1.5174e+00,
        7.1173e+00,  1.1453e+00, -3.9868e+00, -8.1233e-01,  1.7725e+00,
        8.0080e-01,  1.1651e+00, -1.9638e+00, -1.6211e+00,  1.8036e+00,
       -1.4354e+00,  1.3000e+00,  8.8791e-01, -1.8825e+00,  4.1976e-01,
        8.8296e-01,  7.6003e-01, -4.9952e-01,  5.6729e-01,  4.0458e-01,
        3.6411e+00, -4.8239e+00, -4.1680e-01, -4.6972e+00,  2.0365e+00,
        1.0195e+00,  1.9232e+00,  1.2530e+00,  5.9611e-01,  9.5434e-01,
       -4.4911e+00, -1.6139e-01, -2.4083e+00, -1.2584e+00, -2.8906e+00,
        2.5896e+00,  2.8413e+00, -3.4578e+00, -1.7886e+00,  5.0010e+00,
        2.6081e+00,  5.1367e+00, -2.5518e+00, -3.3774e+00,  3.2658e-01,
        2.2195e+00,  2.4562e+00,  2.5930e+00,  9.9261e-01,  1.3204e+00,
        3.6505e-01, -5.6079e-03,  3.2209e+00, -2.6231e+00,  4.4062e+00,
        1.8690e+00, -5.4214e+00, -4.1069e+00, -3.8010e-01,  7.8

In [None]:
# Convert the list of embeddings into a NumPy array
# (one entry per tweet, each holding that tweet's keyword vectors).
keyword_embeddings_array = np.array(keyword_embeddings)
# Sanity check: show the block of keyword vectors for the first tweet.
print(keyword_embeddings_array[0])

[[-1.20469999 -1.88409996 -4.14020014 ... -1.68139994 -3.06890011
   0.83428001]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


# Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Assemble the feature matrix, one row per tweet:
# [seconds since midnight, follower count, link flag, flattened keyword embeddings].
X_time = df['converted_time'].values.reshape(-1, 1)
X_followers = df['Followers Count'].values.reshape(-1, 1)
X_video_flag = df['embedded_video'].values.reshape(-1, 1)
# Flatten each tweet's block of keyword vectors into a single row. The row count
# comes from the data instead of a hard-coded 18050, so other dataset sizes work.
X_text = keyword_embeddings_array.reshape(len(keyword_embeddings_array), -1)
X = np.hstack((X_time, X_followers, X_video_flag, X_text))

# One-hot encode the labels 0-4 by picking rows of a 5x5 identity matrix.
v = np.array(df['label'])
y = np.eye(5, dtype=int)[v]
print(y)

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train[99])
print(y_train[99])

[[1 0 0 0 0]
 [1 0 0 0 0]
 [0 0 0 1 0]
 ...
 [0 0 1 0 0]
 [0 0 1 0 0]
 [0 0 1 0 0]]
[ 51867. 340824.      0. ...      0.      0.      0.]
[0 0 0 1 0]


# Training Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.metrics import Precision

# Assuming you have multi-class labels
num_classes = 5  # Number of classes in your dataset

# Small fully connected classifier. The input width is taken from the training
# data rather than a hard-coded 1503, so it follows the feature matrix.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))  # First hidden layer
model.add(Dense(32, activation='relu'))  # Second hidden layer
model.add(Dense(num_classes, activation='softmax'))  # Output layer for multi-class classification
# Compile the model. Precision is registered as a metric because evaluate() returns
# the loss followed by one value per compiled metric; with only 'accuracy' the
# three-way unpacking below raised a ValueError.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', Precision(name='precision')],
)

# Train the model. 10 epochs matches the recorded run; the previous value of
# 12000 epochs at batch size 8 is not feasible to execute.
model.fit(X_train, y_train, epochs=10, batch_size=8, validation_split=0.2, verbose=0)

# Evaluate the model on test data
loss, accuracy, precision = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)
print("Loss:", loss)
print("Precision", precision)

Epoch 1/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 5s 4s/step - accuracy: 0.0286 - loss: 6189.642

Epoch 2/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 5s 4s/step - accuracy: 0.2654 - loss: 178.401

Epoch 3/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.3427 - loss: 15.3853

Epoch 4/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.3744 - loss: 13.9610

Epoch 5/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.4753 - loss: 13.6031

Epoch 6/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.5362 - loss: 9.2451

Epoch 7/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.6387 - loss: 6.5475

Epoch 8/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.6548 - loss: 3.4587

Epoch 9/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.7406 - loss: 2.6970

Epoch 10/10[92m ━━━━━━━━━━━━━━━━━━━━[0m 3s 4s/step - accuracy: 0.7102 - loss: 0.6011

Accuracy: 0.7102
Loss: 0.6011
Precision: 0.604
