<a href="https://colab.research.google.com/github/Starzenpro/Masilo/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the SMS Spam Collection dataset
# The dataset is tab-separated and has no header row
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Split data into training and testing sets (replace with your desired split)
from sklearn.model_selection import train_test_split

train_messages, test_messages, train_labels, test_labels = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

# Convert labels to numeric (0 for ham, 1 for spam)
train_labels_numeric = train_labels.map({'ham': 0, 'spam': 1})
test_labels_numeric = test_labels.map({'ham': 0, 'spam': 1})

In [None]:
# Create and train the model
model = Pipeline([
    ('vectorizer', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('classifier', MultinomialNB())     # Naive Bayes classifier for text data
])
model.fit(train_messages, train_labels_numeric)

In [None]:
# Define prediction function
def predict_message(message):
    # Get probability prediction for the input message
    proba = model.predict_proba([message])[0]
    # Probability that the message is spam (class 1)
    prob_spam = proba[1]
    # Determine label based on probability threshold (0.5)
    label = 'spam' if prob_spam >= 0.5 else 'ham'
    return [prob_spam, label]


In [None]:
# Test the function
print(predict_message("Hello, how are you doing today?"))
print(predict_message("WINNER!! Claim your free iPhone now!"))

In [None]:
# Load a sample dataset (replace with your actual data loading)
data = {'message': ["Hello, how are you?", "WINNER! Claim your prize!", "Meeting at 3 PM.", "Free money!", "Good morning."],
        'label': ["ham", "spam", "ham", "spam", "ham"]}
df = pd.DataFrame(data)

In [None]:
# Split data into training and testing sets (replace with your desired split)
from sklearn.model_selection import train_test_split

train_messages, test_messages, train_labels, test_labels = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create and train the model
model = Pipeline([
    ('vectorizer', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('classifier', MultinomialNB())     # Naive Bayes classifier for text data
])
model.fit(train_messages, train_labels_numeric)

# Make predictions on the test set
predictions = model.predict(test_messages)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels_numeric, predictions)
precision = precision_score(test_labels_numeric, predictions)
recall = recall_score(test_labels_numeric, predictions)
f1 = f1_score(test_labels_numeric, predictions)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
f1 = f1_score(test_labels_numeric, predictions)

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip

In [None]:
import pandas as pd

# Load the dataset
# The dataset is tab-separated and has no header row
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Display the first few rows
display(df.head())

In [None]:
# Define prediction function
def predict_message(message):
    # Get probability prediction for the input message
    proba = model.predict_proba([message])[0]
    # Probability that the message is spam (class 1)
    prob_spam = proba[1]
    # Determine label based on probability threshold (0.5)
    label = 'spam' if prob_spam >= 0.5 else 'ham'
    return [prob_spam, label]

# Test the predict_message function with custom messages

# Example 1: A likely ham message
message1 = "Just wanted to confirm our meeting for tomorrow at 10 AM."
prediction1 = predict_message(message1)
print(f"Message: '{message1}'")
print(f"Prediction: Probability of spam = {prediction1[0]:.4f}, Label = {prediction1[1]}")

print("-" * 30)

# Example 2: A likely spam message
message2 = "URGENT! You have won a prize! Click here to claim now!"
prediction2 = predict_message(message2)
print(f"Message: '{message2}'")
print(f"Prediction: Probability of spam = {prediction2[0]:.4f}, Label = {prediction2[1]}")

print("-" * 30)

# Example 3: Another custom message
message3 = "Can you pick up some groceries on your way home?"
prediction3 = predict_message(message3)
print(f"Message: '{message3}'")
print(f"Prediction: Probability of spam = {prediction3[0]:.4f}, Label = {prediction3[1]}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Generate the confusion matrix
cm = confusion_matrix(test_labels_numeric, predictions)

# Display the confusion matrix using seaborn and matplotlib
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Task
Improve the performance of the spam classification model.

## Data preprocessing

### Subtask:
Explore techniques like text cleaning (removing punctuation, stop words, etc.), stemming, and lemmatization to improve the quality of the text data.


**Reasoning**:
Import necessary libraries for text preprocessing and define a function to clean the text data, then apply the function to the training and testing messages.



In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources if not already downloaded
try:
    stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')
    stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()


# Define a text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stopwords])
    # Apply stemming or lemmatization (choose one)
    # text = ' '.join([stemmer.stem(word) for word in text.split()]) # Stemming
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization
    return text

# Apply the cleaning function to the training and testing messages
train_messages_cleaned = train_messages.apply(clean_text)
test_messages_cleaned = test_messages.apply(clean_text)

# Display the first few cleaned messages from training set
print("Original training messages:")
print(train_messages.head())
print("\nCleaned training messages:")
print(train_messages_cleaned.head())

**Reasoning**:
The text cleaning step is complete. The next step is to update the model training and evaluation to use the cleaned text data.



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create and train the model with cleaned data
model_cleaned = Pipeline([
    ('vectorizer', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('classifier', MultinomialNB())     # Naive Bayes classifier for text data
])
model_cleaned.fit(train_messages_cleaned, train_labels_numeric)

# Make predictions on the cleaned test set
predictions_cleaned = model_cleaned.predict(test_messages_cleaned)

# Calculate evaluation metrics for the model trained on cleaned data
accuracy_cleaned = accuracy_score(test_labels_numeric, predictions_cleaned)
precision_cleaned = precision_score(test_labels_numeric, predictions_cleaned)
recall_cleaned = recall_score(test_labels_numeric, predictions_cleaned)
f1_cleaned = f1_score(test_labels_numeric, predictions_cleaned)

# Print the results
print("Metrics with cleaned data:")
print(f"Accuracy: {accuracy_cleaned:.2f}")
print(f"Precision: {precision_cleaned:.2f}")
print(f"Recall: {recall_cleaned:.2f}")
print(f"F1 Score: {f1_cleaned:.2f}")

## Feature engineering

### Subtask:
Investigate using different text vectorization methods (e.g., CountVectorizer, or more advanced techniques like word embeddings) or incorporating other relevant features.


**Reasoning**:
The subtask requires creating a new pipeline with CountVectorizer and MultinomialNB, training it on cleaned data, making predictions, and evaluating the performance. This can be done in a single code block by following the steps outlined in the instructions.



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a new Pipeline with CountVectorizer and MultinomialNB
model_countvectorizer = Pipeline([
    ('vectorizer', CountVectorizer()),  # Convert text to CountVectorizer features
    ('classifier', MultinomialNB())     # Naive Bayes classifier for text data
])

# Train the model on the cleaned training messages and numeric labels
model_countvectorizer.fit(train_messages_cleaned, train_labels_numeric)

# Make predictions on the cleaned test messages
predictions_countvectorizer = model_countvectorizer.predict(test_messages_cleaned)

# Calculate evaluation metrics for the model trained with CountVectorizer
accuracy_countvectorizer = accuracy_score(test_labels_numeric, predictions_countvectorizer)
precision_countvectorizer = precision_score(test_labels_numeric, predictions_countvectorizer)
recall_countvectorizer = recall_score(test_labels_numeric, predictions_countvectorizer)
f1_countvectorizer = f1_score(test_labels_numeric, predictions_countvectorizer)

# Print the results
print("Metrics with CountVectorizer:")
print(f"Accuracy: {accuracy_countvectorizer:.2f}")
print(f"Precision: {precision_countvectorizer:.2f}")
print(f"Recall: {recall_countvectorizer:.2f}")
print(f"F1 Score: {f1_countvectorizer:.2f}")

## Model selection

### Subtask:
Experiment with different classification algorithms (e.g., Support Vector Machines, Logistic Regression, or deep learning models) to see if they perform better than Naive Bayes for this dataset.


**Reasoning**:
Import the necessary classification algorithms and create and train the new pipelines using CountVectorizer. Then make predictions and calculate the evaluation metrics.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create Logistic Regression pipeline with CountVectorizer
logreg_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Create SVC pipeline with CountVectorizer
svc_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC(probability=True)) # probability=True is needed for predict_proba later if needed
])

# Train the Logistic Regression pipeline
logreg_pipeline.fit(train_messages_cleaned, train_labels_numeric)

# Train the SVC pipeline
svc_pipeline.fit(train_messages_cleaned, train_labels_numeric)

# Make predictions with Logistic Regression
logreg_predictions = logreg_pipeline.predict(test_messages_cleaned)

# Make predictions with SVC
svc_predictions = svc_pipeline.predict(test_messages_cleaned)

# Calculate and print metrics for Logistic Regression
logreg_accuracy = accuracy_score(test_labels_numeric, logreg_predictions)
logreg_precision = precision_score(test_labels_numeric, logreg_predictions)
logreg_recall = recall_score(test_labels_numeric, logreg_predictions)
logreg_f1 = f1_score(test_labels_numeric, logreg_predictions)

print("Metrics for Logistic Regression with CountVectorizer:")
print(f"Accuracy: {logreg_accuracy:.2f}")
print(f"Precision: {logreg_precision:.2f}")
print(f"Recall: {logreg_recall:.2f}")
print(f"F1 Score: {logreg_f1:.2f}")
print("-" * 30)

# Calculate and print metrics for SVC
svc_accuracy = accuracy_score(test_labels_numeric, svc_predictions)
svc_precision = precision_score(test_labels_numeric, svc_predictions)
svc_recall = recall_score(test_labels_numeric, svc_predictions)
svc_f1 = f1_score(test_labels_numeric, svc_predictions)

print("Metrics for SVC with CountVectorizer:")
print(f"Accuracy: {svc_accuracy:.2f}")
print(f"Precision: {svc_precision:.2f}")
print(f"Recall: {svc_recall:.2f}")
print(f"F1 Score: {svc_f1:.2f}")

## Hyperparameter tuning

### Subtask:
Optimize the hyperparameters of the chosen model and the vectorizer using techniques like Grid Search or Random Search to find the best configuration.


**Reasoning**:
Import necessary modules and define the parameter grid for GridSearchCV.



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Define a pipeline for Logistic Regression with CountVectorizer
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000)) # Use 'liblinear' for smaller datasets and add max_iter
])

# Define the parameter grid
param_grid = {
    'vectorizer__max_features': [1000, 2000, 3000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.1, 1, 10]
}

**Reasoning**:
Instantiate and fit GridSearchCV to find the best parameters and evaluate the best model on the test set.



In [None]:
# Instantiate GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'], refit='f1')

# Fit GridSearchCV to the cleaned training data
grid_search.fit(train_messages_cleaned, train_labels_numeric)

# Print the best parameters and best score
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)
print("\nBest cross-validation F1 score:")
print(f"{grid_search.best_score_:.2f}")

# Evaluate the best model on the cleaned test data
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(test_messages_cleaned)

# Calculate and print evaluation metrics on the test set
test_accuracy = accuracy_score(test_labels_numeric, test_predictions)
test_precision = precision_score(test_labels_numeric, test_predictions)
test_recall = recall_score(test_labels_numeric, test_predictions)
test_f1 = f1_score(test_labels_numeric, test_predictions)

print("\nMetrics on the cleaned test set with the best model:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1 Score: {test_f1:.2f}")

## Cross-validation

### Subtask:
Implement cross-validation to get a more robust estimate of the model's performance and ensure it generalizes well to unseen data.


**Reasoning**:
Implement cross-validation using the best model found in the previous step to get a more robust performance estimate.



In [None]:
from sklearn.model_selection import cross_val_score

# Use cross_val_score with the best model and cleaned training data
cv_scores = cross_val_score(best_model, train_messages_cleaned, train_labels_numeric, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])

# Print the mean and standard deviation of the cross-validation scores
print("Cross-validation scores:")
print(f"Mean Accuracy: {cv_scores['test_accuracy'].mean():.2f} (+/- {cv_scores['test_accuracy'].std():.2f})")
print(f"Mean Precision: {cv_scores['test_precision'].mean():.2f} (+/- {cv_scores['test_precision'].std():.2f})")
print(f"Mean Recall: {cv_scores['test_recall'].mean():.2f} (+/- {cv_scores['test_recall'].std():.2f})")
print(f"Mean F1 Score: {cv_scores['test_f1'].mean():.2f} (+/- {cv_scores['test_f1'].std():.2f})")

**Reasoning**:
The previous command failed because `cross_val_score`'s `scoring` parameter expects a single string or a callable, not a list of strings. I need to use `cross_validate` instead to get multiple scoring metrics.



In [None]:
from sklearn.model_selection import cross_validate

# Use cross_validate with the best model and cleaned training data
scoring = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(best_model, train_messages_cleaned, train_labels_numeric, cv=5, scoring=scoring)

# Print the mean and standard deviation of the cross-validation scores
print("Cross-validation results:")
for metric in scoring:
    mean_score = cv_results[f'test_{metric}'].mean()
    std_score = cv_results[f'test_{metric}'].std()
    print(f"Mean {metric.capitalize()}: {mean_score:.2f} (+/- {std_score:.2f})")

## Error analysis

### Subtask:
Analyze the instances where the model makes incorrect predictions (false positives and false negatives) to gain insights into the model's weaknesses and identify areas for improvement.


**Reasoning**:
Get predictions from the best model, create a DataFrame with original and predicted labels, and then filter for false positives and false negatives to analyze misclassifications.



In [None]:
# 1. Get the predicted labels from the best_model on the test_messages_cleaned
test_predictions = best_model.predict(test_messages_cleaned)

# 2. Create a DataFrame containing the original test_messages, test_labels, and the test_predictions
results_df = pd.DataFrame({
    'original_message': test_messages,
    'true_label': test_labels,
    'predicted_label': test_predictions
})

# Convert predicted_label back to 'ham'/'spam' for easier comparison with true_label
results_df['predicted_label_str'] = results_df['predicted_label'].map({0: 'ham', 1: 'spam'})

# 3. Filter this DataFrame to identify false positives (where the true label is 'ham' but the prediction is 'spam')
false_positives = results_df[(results_df['true_label'] == 'ham') & (results_df['predicted_label_str'] == 'spam')]

# 4. Filter the DataFrame to identify false negatives (where the true label is 'spam' but the prediction is 'ham')
false_negatives = results_df[(results_df['true_label'] == 'spam') & (results_df['predicted_label_str'] == 'ham')]

# 5. Display the false positives and false negatives DataFrames
print("False Positives:")
display(false_positives)

print("\nFalse Negatives:")
display(false_negatives)

## Summary:

### Data Analysis Key Findings

*   Text cleaning techniques (lowercase conversion, punctuation removal, stop word removal, and lemmatization) were applied, resulting in a model trained on cleaned data achieving an accuracy of 0.97, precision of 1.00, recall of 0.78, and F1 score of 0.88.
*   Using `CountVectorizer` for text vectorization improved the model's performance compared to `TfidfVectorizer`, with metrics on cleaned data showing an accuracy of 0.99, precision of 0.97, recall of 0.92, and F1 score of 0.94.
*   Exploring different classification algorithms showed that both Logistic Regression and SVC achieved high accuracy (0.98) and precision (1.00) when used with `CountVectorizer`. Logistic Regression slightly outperformed SVC with a recall of 0.89 and an F1 score of 0.94.
*   Hyperparameter tuning using `GridSearchCV` with Logistic Regression and `CountVectorizer` identified the best parameters as `{'classifier__C': 10, 'vectorizer__max_features': 3000, 'vectorizer__ngram_range': (1, 2)}`. The model with these parameters achieved a cross-validation F1 score of 0.91 and test set metrics of Accuracy: 0.99, Precision: 0.99, Recall: 0.90, F1 Score: 0.94.
*   Cross-validation results using the best model showed consistent performance across 5 folds, with mean scores and standard deviations reported for accuracy, precision, recall, and F1.
*   Error analysis revealed a low number of false positives (only one instance) but a notable number of false negatives, indicating the model struggles to identify certain types of spam messages.

### Insights or Next Steps

*   Focus on techniques to improve the model's ability to detect diverse spam patterns to reduce false negatives, potentially through advanced feature engineering or exploring more sophisticated models.
*   Investigate the specific characteristics of the messages misclassified as false negatives to identify common themes or features that the current model is missing.


## Data preprocessing

### Subtask:
Explore techniques like text cleaning (removing punctuation, stop words, etc.), stemming, and lemmatization to improve the quality of the text data.

**Reasoning**:
Import necessary libraries for text preprocessing and define a function to clean the text data, then apply the function to the training and testing messages.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources if not already downloaded
try:
    stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')
    stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()


# Define a text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stopwords])
    # Apply stemming or lemmatization (choose one)
    # text = ' '.join([stemmer.stem(word) for word in text.split()]) # Stemming
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization
    return text

# Apply the cleaning function to the training and testing messages
train_messages_cleaned = train_messages.apply(clean_text)
test_messages_cleaned = test_messages.apply(clean_text)

# Display the first few cleaned messages from training set
print("Original training messages:")
print(train_messages.head())
print("\nCleaned training messages:")
print(train_messages_cleaned.head())

**Reasoning**:
The subtask requires creating a new pipeline with CountVectorizer and MultinomialNB, training it on cleaned data, making predictions, and evaluating the performance. This can be done in a single code block by following the steps outlined in the instructions.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a new Pipeline with CountVectorizer and MultinomialNB
model_countvectorizer = Pipeline([
    ('vectorizer', CountVectorizer()),  # Convert text to CountVectorizer features
    ('classifier', MultinomialNB())     # Naive Bayes classifier for text data
])

# Train the model on the cleaned training messages and numeric labels
model_countvectorizer.fit(train_messages_cleaned, train_labels_numeric)

# Make predictions on the cleaned test messages
predictions_countvectorizer = model_countvectorizer.predict(test_messages_cleaned)

# Calculate evaluation metrics for the model trained with CountVectorizer
accuracy_countvectorizer = accuracy_score(test_labels_numeric, predictions_countvectorizer)
precision_countvectorizer = precision_score(test_labels_numeric, predictions_countvectorizer)
recall_countvectorizer = recall_score(test_labels_numeric, predictions_countvectorizer)
f1_countvectorizer = f1_score(test_labels_numeric, predictions_countvectorizer)

# Print the results
print("Metrics with CountVectorizer:")
print(f"Accuracy: {accuracy_countvectorizer:.2f}")
print(f"Precision: {precision_countvectorizer:.2f}")
print(f"Recall: {recall_countvectorizer:.2f}")
print(f"F1 Score: {f1_countvectorizer:.2f}")

**Reasoning**:
Import the necessary classification algorithms and create and train the new pipelines using CountVectorizer. Then make predictions and calculate the evaluation metrics.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create Logistic Regression pipeline with CountVectorizer
logreg_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# Create SVC pipeline with CountVectorizer
svc_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC(probability=True)) # probability=True is needed for predict_proba later if needed
])

# Train the Logistic Regression pipeline
logreg_pipeline.fit(train_messages_cleaned, train_labels_numeric)

# Train the SVC pipeline
svc_pipeline.fit(train_messages_cleaned, train_labels_numeric)

# Make predictions with Logistic Regression
logreg_predictions = logreg_pipeline.predict(test_messages_cleaned)

# Make predictions with SVC
svc_predictions = svc_pipeline.predict(test_messages_cleaned)

# Calculate and print metrics for Logistic Regression
logreg_accuracy = accuracy_score(test_labels_numeric, logreg_predictions)
logreg_precision = precision_score(test_labels_numeric, logreg_predictions)
logreg_recall = recall_score(test_labels_numeric, logreg_predictions)
logreg_f1 = f1_score(test_labels_numeric, logreg_predictions)

print("Metrics for Logistic Regression with CountVectorizer:")
print(f"Accuracy: {logreg_accuracy:.2f}")
print(f"Precision: {logreg_precision:.2f}")
print(f"Recall: {logreg_recall:.2f}")
print(f"F1 Score: {logreg_f1:.2f}")
print("-" * 30)

# Calculate and print metrics for SVC
svc_accuracy = accuracy_score(test_labels_numeric, svc_predictions)
svc_precision = precision_score(test_labels_numeric, svc_predictions)
svc_recall = recall_score(test_labels_numeric, svc_predictions)
svc_f1 = f1_score(test_labels_numeric, svc_predictions)

print("Metrics for SVC with CountVectorizer:")
print(f"Accuracy: {svc_accuracy:.2f}")
print(f"Precision: {svc_precision:.2f}")
print(f"Recall: {svc_recall:.2f}")
print(f"F1 Score: {svc_f1:.2f}")

## Hyperparameter tuning

### Subtask:
Optimize the hyperparameters of the chosen model and the vectorizer using techniques like Grid Search or Random Search to find the best configuration.

**Reasoning**:
Import necessary modules and define the parameter grid for GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Define a pipeline for Logistic Regression with CountVectorizer
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000)) # Use 'liblinear' for smaller datasets and add max_iter
])

# Define the parameter grid
param_grid = {
    'vectorizer__max_features': [1000, 2000, 3000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.1, 1, 10]
}

**Reasoning**:
Instantiate and fit GridSearchCV to find the best parameters and evaluate the best model on the test set.

In [None]:
# Instantiate GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'], refit='f1')

# Fit GridSearchCV to the cleaned training data
grid_search.fit(train_messages_cleaned, train_labels_numeric)

# Print the best parameters and best score
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)
print("\nBest cross-validation F1 score:")
print(f"{grid_search.best_score_:.2f}")

# Evaluate the best model on the cleaned test data
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(test_messages_cleaned)

# Calculate and print evaluation metrics on the test set
test_accuracy = accuracy_score(test_labels_numeric, test_predictions)
test_precision = precision_score(test_labels_numeric, test_predictions)
test_recall = recall_score(test_labels_numeric, test_predictions)
test_f1 = f1_score(test_labels_numeric, test_predictions)

print("\nMetrics on the cleaned test set with the best model:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1 Score: {test_f1:.2f}")

## Cross-validation

### Subtask:
Implement cross-validation to get a more robust estimate of the model's performance and ensure it generalizes well to unseen data.

**Reasoning**:
Implement cross-validation using the best model found in the previous step to get a more robust performance estimate.

In [None]:
from sklearn.model_selection import cross_validate

# Use cross_validate with the best model and cleaned training data
scoring = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(best_model, train_messages_cleaned, train_labels_numeric, cv=5, scoring=scoring)

# Print the mean and standard deviation of the cross-validation scores
print("Cross-validation results:")
for metric in scoring:
    mean_score = cv_results[f'test_{metric}'].mean()
    std_score = cv_results[f'test_{metric}'].std()
    print(f"Mean {metric.capitalize()}: {mean_score:.2f} (+/- {std_score:.2f})")