Sure, I'll break down the process into individual cells without defining functions.

### Cell 1: Import Libraries


In [None]:
import pickle
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session so library chatter does not
# clutter cell output. Note that this also hides deprecation notices.
warnings.filterwarnings('ignore')



### Cell 2: Load Pickle Files


In [None]:
# Read the saved Word2Vec-vectorized matrices back from disk, one pickle file
# per data split (train / cross-validation / test).
# NOTE(review): unpickling can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
with open('X_train_w2v.pkl', 'rb') as train_file:
    X_train_w2v = pickle.load(train_file)

with open('X_cv_w2v.pkl', 'rb') as cv_file:
    X_cv_w2v = pickle.load(cv_file)

with open('X_test_w2v.pkl', 'rb') as test_file:
    X_test_w2v = pickle.load(test_file)



### Cell 3: Apply SMOTE


In [None]:
# Balance the class distribution with SMOTE.
# Assumes y_train, y_cv, and y_test are already defined earlier in the notebook.
smote = SMOTE(random_state=42)

# Training split: oversample so the final model is fitted on balanced classes.
X_train_w2v_resampled, y_train_resampled = smote.fit_resample(X_train_w2v, y_train)

# CV split: this is the data the grid search is fitted on, so it stays resampled.
# NOTE(review): resampling before k-fold splitting lets synthetic points derived
# from one fold end up in another, which inflates the CV scores. Consider moving
# SMOTE inside an imblearn.pipeline.Pipeline so it runs per training fold only.
X_cv_w2v_resampled, y_cv_resampled = smote.fit_resample(X_cv_w2v, y_cv)

# Test split: must NOT be resampled. Scoring on synthetic samples changes the
# class distribution and no longer measures performance on real data.
# The "_resampled" names are kept only so the later cells keep working unchanged.
X_test_w2v_resampled, y_test_resampled = X_test_w2v, y_test



### Cell 4: Perform Grid Search CV


In [None]:
# Search space: 3 values for each of 3 hyperparameters -> 27 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.2],
)

# Three shuffled, stratified folds; the fixed seed keeps the splits repeatable.
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Template estimator that the grid search configures for every candidate.
base_model = XGBClassifier(use_label_encoder=False, verbosity=0)

grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,  # use all available cores
    verbose=3,
)

# Exhaustive search, fitted on the resampled cross-validation split.
grid_search.fit(X_cv_w2v_resampled, y_cv_resampled)

# Keep the winning combination and the mean accuracy of every candidate;
# both are consumed by later cells.
cv_scores = grid_search.cv_results_['mean_test_score']
best_params = grid_search.best_params_
print("Best Parameters:", best_params)



### Cell 5: Train Model


In [None]:
# Pick out the tuned hyperparameters found by the grid search.
tuned_settings = {
    key: best_params[key]
    for key in ('n_estimators', 'max_depth', 'learning_rate')
}

# Build the final classifier with those settings and fit it on the resampled
# training split.
xgboost = XGBClassifier(use_label_encoder=False, verbosity=0, **tuned_settings)
xgboost.fit(X_train_w2v_resampled, y_train_resampled)

print("Training completed.")



### Cell 6: Test and Evaluate Model


In [None]:
# Predict on the test split and compute the evaluation metrics.
y_pred = xgboost.predict(X_test_w2v_resampled)

accuracy = accuracy_score(y_test_resampled, y_pred)
cm = confusion_matrix(y_test_resampled, y_pred)  # plotted in the next cell
# Dict form of the report, kept for programmatic access to per-class metrics.
report = classification_report(y_test_resampled, y_pred, output_dict=True)

print("Accuracy:", accuracy)
# Print the label on its own line: the text report begins with a padded header
# row, so prefixing it on the same line shifts the header out of alignment
# with the columns below it.
print("Classification Report:")
print(classification_report(y_test_resampled, y_pred))



### Cell 7: Display Results


In [None]:
# --- Confusion matrix heatmap -------------------------------------------------
# confusion_matrix() orders its rows/columns by the sorted union of the labels
# seen in y_true and y_pred. Build the axis labels the same way so the DataFrame
# shape always matches `cm`, even when a class appears in only one of the two.
class_labels = np.union1d(y_test_resampled, y_pred)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
sns.heatmap(df_cm, annot=True, fmt="d")
plt.title("Confusion Matrix - Word2Vec")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# --- Cross-validation error per grid-search candidate -------------------------
# Error is 1 - mean CV accuracy (a misclassification rate, not a squared error).
cv_errors = 1 - np.asarray(cv_scores)
candidate_idx = np.arange(len(cv_errors))

plt.figure(figsize=(8, 8))
plt.plot(candidate_idx, cv_errors, linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
# Tag every point with "(candidate index, error)".
for idx, err in enumerate(cv_errors):
    plt.annotate(f'({idx}, {err:.3f})', (idx, err), textcoords='data')
plt.title('CV Error vs Hyperparameter Values')
plt.xlabel('Hyperparameter Index')
plt.ylabel('CV Error')
plt.show()



This should cover the entire process step-by-step in individual cells, from importing libraries to displaying results.