In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pickle
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Train a Random Forest text classifier on TF-IDF features and persist it.
#
# Assume df1 is already defined and contains 'lematized_texts' and 'label'
#
# Order of operations matters here. The held-out split is made FIRST, on the
# raw texts, and every fitted step (TF-IDF vocabulary/IDF weights, SMOTE) sees
# only the training fold. Fitting the vectorizer or oversampling before the
# split leaks test information into training: SMOTE interpolates between
# neighbouring rows, so synthetic training rows would be built from rows that
# end up in the test set, and the test set would itself contain synthetic
# rows. Both inflate the reported accuracy.

# Split the raw data; stratify so each label keeps its share in both folds
# (important because the labels are imbalanced enough to warrant SMOTE).
texts_train, texts_test, labels_train, y_test = train_test_split(
    df1['lematized_texts'],
    df1['label'],
    test_size=0.2,
    random_state=42,
    stratify=df1['label'],
)

# TF-IDF Vectorization with n-grams (unigrams and bigrams).
# Vocabulary and IDF weights are learned from the training texts only; the
# test texts are merely projected onto that vocabulary.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Full-corpus matrix under the train-fitted vocabulary. Not used for training
# or evaluation below; kept only so the name X_tfidf stays defined in case
# later cells reference it.
X_tfidf = tfidf_vectorizer.transform(df1['lematized_texts'])

# Apply SMOTE for oversampling -- on the training fold only, so the test fold
# stays made of real, unseen samples with the original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, labels_train)
X_train, y_train = X_resampled, y_resampled

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Predicting on the (untouched) test set
y_pred = rf_model.predict(X_test)

# Evaluating the model.
# NOTE(review): the test fold is no longer artificially balanced, so plain
# accuracy can be dominated by the majority class -- consider also reporting
# per-class precision/recall.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Save the model and the TF-IDF vectorizer (both are needed at inference time:
# new text must be transformed with this exact fitted vectorizer).
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
