DATA PRE-PROCESSING

In [1]:
#Libs
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [None]:
##nltk resources (uncomment and run once to download them)
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('omw-1.4')

In [None]:
# Load the raw tweets from the Excel workbook into a DataFrame
df_tweets = pd.read_excel(io='Data/tweets_raw.xlsx')

In [3]:
# Instantiate the WordNet lemmatizer once; clean_and_lemmatize() reuses this
# module-level instance for every token it processes.
lemmatizer = WordNetLemmatizer()

In [4]:
# English stop words, loaded a single time into a set: membership tests are
# O(1) and the NLTK word list is no longer rebuilt for every token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_and_lemmatize(text):
    """Normalize a raw tweet and return its lemmatized tokens as one string.

    Steps, in order: lowercase, drop ``$ticker`` symbols, drop URLs, drop
    every character that is not a lowercase ASCII letter or whitespace,
    tokenize, remove English stop words, lemmatize each remaining token with
    the module-level ``lemmatizer``.

    Args:
        text: The raw tweet text. Non-string values (for example the NaN
            pandas produces for an empty cell) are treated as empty text.

    Returns:
        str: The lemmatized tokens joined by single spaces; ``''`` when the
        input is not a string or nothing survives the cleaning.
    """
    # Missing cells arrive as float NaN and have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\$\w+', '', text)  # remove stock tickers such as $aapl
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix matches, not any word that merely starts with "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)  # remove digits and special characters

    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS
    )

In [5]:
# Clean and lemmatize every tweet, storing the result in a 'clean_text' column
df_tweets = df_tweets.assign(
    clean_text=df_tweets['text'].apply(clean_and_lemmatize)
)

In [None]:
# Keep only the cleaned text and its label, then persist the result.
# Selecting the two columns already discards 'text' (and any other column),
# so no separate drop() is needed; this also lets the cell be re-run without
# a KeyError once 'text' has been removed.
df_tweets = df_tweets[['clean_text', 'label']]
df_tweets.to_csv('Data/tweets_cleaned.csv', index=False)

VECTORIZATION

In [None]:
# TF-IDF vectorization

# Fit a TF-IDF model on the cleaned tweets, capping the vocabulary at 2500 terms
tfidf_vectorizer = TfidfVectorizer(max_features=2500)
X_tfidf = tfidf_vectorizer.fit_transform(df_tweets['clean_text'])

# Expand the matrix into a dense DataFrame with one column per vocabulary
# term, then attach the labels positionally.
# NOTE(review): if 'label' is itself a vocabulary term, that feature column is
# overwritten by the labels -- confirm this cannot happen with this corpus.
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
).assign(label=df_tweets['label'].values)

In [23]:
# Bag-of-Words (BOW) vectorization

# Fit a raw term-count model on the cleaned tweets, capping the vocabulary at 2500 terms
bow_vectorizer = CountVectorizer(max_features=2500)
X_bow = bow_vectorizer.fit_transform(df_tweets['clean_text'])

# Expand the matrix into a dense DataFrame with one column per vocabulary
# term, then attach the labels positionally.
# NOTE(review): if 'label' is itself a vocabulary term, that feature column is
# overwritten by the labels -- confirm this cannot happen with this corpus.
df_bow = pd.DataFrame(
    data=X_bow.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
).assign(label=df_tweets['label'].values)

In [None]:
# Persist both vectorized datasets as CSV files (TF-IDF first, then BOW),
# without writing the row index.
for frame, destination in (
    (df_tfidf, 'Data/tweets_tfidf.csv'),
    (df_bow, 'Data/tweets_bow.csv'),
):
    frame.to_csv(destination, index=False)