In [None]:
import re
import string
from pathlib import Path

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.stats import uniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [None]:
# Download the NLTK resources used by the preprocessing step: tokenizer
# models for word_tokenize, the English stop-word list, and the WordNet data
# used by the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab'
# rather than the pickled 'punkt' package; fetching both keeps word_tokenize
# working regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Shared preprocessing resources: the English stop-word list, held as a set
# for fast membership tests, and one reusable WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the tweets dataset from <parent of the working directory>/data/tweets.csv.
# The frame is bound to `tweets_data`, the name every later cell reads.
# `Path` comes from pathlib (imported in the top import cell).
tweets_data = pd.read_csv(Path.cwd().parent / 'data' / 'tweets.csv')

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Values that ``pd.isna`` reports as missing yield an empty string.
    Otherwise the text is lower-cased; URLs, @mentions, punctuation and digit
    runs are removed; the remainder is tokenized with ``word_tokenize``;
    tokens found in ``stop_words`` are dropped and the rest are lemmatized.

    Args:
        text: The raw tweet text, or a missing value.

    Returns:
        The cleaned tokens joined by single spaces ("" for missing input).
    """
    if pd.isna(text):
        return ""

    lowered = text.lower()
    # URLs and mentions go first, while their 'http' / '@' markers are still
    # intact; the punctuation strip below would otherwise break these patterns.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_mentions = re.sub(r'@\w+', '', without_urls)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = without_mentions.translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punct)

    kept = []
    for token in word_tokenize(without_digits):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

In [None]:
# Clean every tweet and store the result in a new `cleaned_text` column.
# The empty-string fill is chained onto the Series being assigned rather than
# calling fillna(..., inplace=True) on a column selection: that form is
# chained assignment, which emits a warning on pandas 2.2+ and has no effect
# on the frame under Copy-on-Write.
tweets_data['cleaned_text'] = tweets_data['text'].apply(preprocess_text).fillna('')

In [None]:
# TF-IDF vectorization with the vocabulary capped at 5000 terms.
# The result is kept as the SciPy sparse matrix that fit_transform returns:
# train_test_split and MultinomialNB both accept sparse input, and a dense
# (n_documents x 5000) array would mostly store zeros.
# NOTE(review): the vectorizer is fit on the full corpus before the
# train/test split, so the vocabulary and IDF weights also see the held-out
# tweets — consider fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(tweets_data['cleaned_text'])

In [None]:
# Fit a label encoder on the `target` column, then use it to turn the labels
# into the integer codes the classifier is trained on.
label_encoder = LabelEncoder().fit(tweets_data['target'])
y = label_encoder.transform(tweets_data['target'])

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tune MultinomialNB's smoothing parameter with a randomized search:
# 10 sampled candidates, 5-fold cross-validation, scored by accuracy.
nb_tfidf = MultinomialNB()
# scipy's uniform(loc, scale) covers [loc, loc + scale], so alpha is drawn
# from [0.1, 1.1].
nb_params = {'alpha': uniform(loc=0.1, scale=1.0)}
random_search_nb = RandomizedSearchCV(
    estimator=nb_tfidf,
    param_distributions=nb_params,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search_nb.fit(X_train, y_train)
# Estimator with the best cross-validated score found by the search.
best_nb = random_search_nb.best_estimator_

In [None]:
# Persist the tuned classifier together with the fitted TF-IDF vectorizer and
# label encoder. Files are written relative to the current working directory.
joblib.dump(value=best_nb, filename='best_nb_model.pkl')
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=label_encoder, filename='label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# Score the tuned Naive Bayes model on the held-out split and print the
# per-class precision / recall / F1 report.
y_pred_nb_tfidf = best_nb.predict(X_test)
print("Best Naive Bayes with TF-IDF:")
nb_tfidf_report = classification_report(y_true=y_test, y_pred=y_pred_nb_tfidf)
print(nb_tfidf_report)

Best Naive Bayes with TF-IDF:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1878
           1       0.81      0.55      0.66       396

    accuracy                           0.90      2274
   macro avg       0.86      0.76      0.80      2274
weighted avg       0.89      0.90      0.89      2274

