In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Fixed seed so the train/test split (and therefore every metric printed below)
# is identical on each Restart & Run All.
SEED = 42

# Load the CSV and keep only the first two columns, renamed to label/text.
df = pd.read_csv("Training Data.csv")
df = df.iloc[:, :2]
df.columns = ["label", "text"]

# CountVectorizer raises on NaN documents, so drop rows that have no text.
df = df.dropna(subset=["text"]).copy()

# Convert labels to binary (assuming 'ham' and 'spam').
# Series.map() turns any value missing from the mapping into NaN; fail loudly here
# instead of letting NaN labels surface later as an obscure error during fitting.
label_map = {'ham': 0, 'spam': 1}
mapped_labels = df['label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = df.loc[mapped_labels.isna(), 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values {unexpected!r}; expected only {sorted(label_map)}"
    )
df['label'] = mapped_labels.astype(int)

# Split dataset. Stratify on the label so the minority (spam) class keeps the same
# proportion in both partitions; seed the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'],
)

# Convert text data to bag-of-words representation.
# The vocabulary is learned from the training split only, so no test-set tokens leak in.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Apply class weights to penalize false negatives (increase weight for spam class)
class_weights = {0: 1, 1: 2}  # Spam (1) gets a higher weight

model = LogisticRegression(class_weight=class_weights, max_iter=2000)
model.fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)

# Evaluation (precision/recall/F1 are computed for the positive class, spam = 1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Compute and print confusion matrix (rows = true class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9843
Precision: 0.9485
Recall: 0.9109
F1-Score: 0.9293
Confusion Matrix:
[[786   5]
 [  9  92]]


In [None]:
import joblib

# Save the trained model together with the fitted vectorizer.
# The classifier only accepts bag-of-words features built from this exact
# vocabulary, so the saved model cannot score raw text without the vectorizer.
# NOTE: joblib files are pickles; only load them from trusted sources.
joblib.dump(model, "Model.pkl")
joblib.dump(vectorizer, "Vectorizer.pkl")

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.
