In [None]:
!pip install transformers
!pip install xformers
!pip install yahooquery
!pip install yfinance
!pip install yahoofinancials
!pip install tensorflow_addons

In [None]:
import pandas as pd
from numpy import NaN
import spacy
from transformers import pipeline
import re
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Attention, Dropout, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import plot_model
import tensorflow as tf
from transformers import TFRobertaModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from textblob import TextBlob

from yahoofinancials import YahooFinancials
from google.colab import drive

#drive.mount('/content/drive')

# Fetch the NLTK data packages this notebook relies on, in a fixed order.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'brown'):
    nltk.download(nltk_resource)

## **Preprocessing**

In [None]:
# Shared resources for text preprocessing (built once, reused for every row)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a piece of text for downstream analysis.

    For string input the steps are, in order: strip every character that is
    neither a word character nor whitespace, lowercase, tokenize with
    ``nltk.word_tokenize``, drop English stop words, lemmatize each remaining
    token, and re-join the tokens with single spaces.

    Non-string input (e.g. NaN) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Punctuation/special characters are removed before tokenizing.
    normalised = re.sub(r'[^\w\s]', '', text).lower()
    content_words = (
        word for word in nltk.word_tokenize(normalised) if word not in stop_words
    )
    return ' '.join(lemmatizer.lemmatize(word) for word in content_words)

file_path = 'dataset_50-person-from-2021-02-05_2023-06-12_21-34-17-266.csv'

# Read the CSV export, discard rows that have no 'full_text', and lowercase
# the remaining text. The fillna('') is kept as a safeguard; after the dropna
# there are no missing values left for it to replace.
df = (
    pd.read_csv(file_path, encoding='ISO-8859-1')
    .dropna(subset=['full_text'])
    .assign(full_text=lambda frame: frame['full_text'].fillna('').str.lower())
)

# Add the fully preprocessed text as a new 'clean_text' column
df = df.assign(clean_text=df['full_text'].apply(preprocess_text))

In [None]:
import datetime  # not used in this cell; retained in case later cells rely on it

# Earliest calendar date (inclusive) to keep, as an ISO "YYYY-MM-DD" string.
MIN_CREATED_AT = '2021-01-01'

# Parse "created_at" first and reduce it to "YYYY-MM-DD" strings. Filtering the
# raw, unparsed strings against the cutoff would be a lexicographic comparison
# that is only meaningful if the raw values already start with an ISO date.
created_dates = pd.to_datetime(df['created_at']).dt.strftime("%Y-%m-%d")

# assign() builds a new frame, so no column is written onto a filtered slice.
df = df.assign(created_at=created_dates)

# ISO dates sort lexicographically in chronological order, so a plain string
# comparison is now correct. Rows whose date is missing compare as False and
# are dropped.
df = df[df['created_at'] >= MIN_CREATED_AT]

#### **Add an importance coefficient per tweet**

In [None]:
# Engagement-weighted importance of each tweet:
#   retweets count once, favorites twice, replies half.
df = df.assign(
    importance_coefficient=(
        df['retweet_count'] + 2 * df['favorite_count'] + 0.5 * df['reply_count']
    )
)

# Range of the coefficient, used for min-max normalisation
min_value = df['importance_coefficient'].min()
max_value = df['importance_coefficient'].max()
value_range = max_value - min_value

# Min-max normalise to [0, 1]. When every coefficient is identical the range is
# zero and dividing would turn the whole column into NaN (0/0); in that case the
# centred values (all 0, with missing values still missing) are used directly.
centered = df['importance_coefficient'] - min_value
df = df.assign(
    importance_coefficient_normalized=(
        centered / value_range if value_range > 0 else centered
    )
)

# Sort chronologically; a stable sort keeps the existing relative order of
# tweets that share the same date, so the result is reproducible.
df = df.sort_values('created_at', ascending=True, kind='stable')

# Show the first rows of the sorted DataFrame
df.head()