# Loading Datasets

In [7]:
import pandas as pd

train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
val_df = pd.read_csv('/content/validation.csv')

## Preparing dataset for training

In [8]:
# Define features (X) and labels (y) for train, test, and validation
X_train = train_df['preprocessed_message']
y_train = train_df['label']
X_test = test_df['preprocessed_message']
y_test = test_df['label']
X_val = val_df['preprocessed_message']
y_val = val_df['label']

In [10]:
# Vectorizing X-datasets
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# Replace NaN values with empty strings
X_train = X_train.fillna('')
X_test = X_test.fillna('')
X_val = X_val.fillna('')

# Now apply CountVectorizer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
X_val_vectorized = vectorizer.transform(X_val)


## Model-1 Naive Bayes Classifier

In [None]:
# Train the Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precison_score

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9895287958115183
Confusion Matrix:
[[851   5]
 [  7 283]]


In [16]:
# Calculate Precison
precison = precison_score(y_test, y_pred)
print("Precison:", precison)

Precison: 0.98262836548762


## Model 2-- SVM Classifire

In [None]:
from sklearn.svm import SVC

# Train the SVM classifier
classifier_svm = SVC(kernel='rbf')
classifier_svm.fit(X_train, y_train)

# Predict labels for the test set
y_pred_svm = classifier_svm.predict(X_test)

# Calculate accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Classifier Accuracy:", accuracy_svm)

# Confusion matrix
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
print("SVM Classifier Confusion Matrix:")
print(conf_matrix_svm)


SVM Classifier Accuracy: 0.9790575916230366
SVM Classifier Confusion Matrix:
[[853   3]
 [ 21 269]]


In [19]:
# Calculate Precison
precison = precison_score(y_test, y_pred)
print("Precison:", precison)

Precison: 0.9882472923644838


## Modlel--3 Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the Decision Tree classifier
classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(X_train, y_train)

# Predict labels for the test set
y_pred_dt = classifier_dt.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Classifier Accuracy:", accuracy_dt)

# Confusion matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
print("Decision Tree Classifier Confusion Matrix:")
print(conf_matrix_dt)


Decision Tree Classifier Accuracy: 0.962478184991274
Decision Tree Classifier Confusion Matrix:
[[836  20]
 [ 23 267]]


In [18]:
# Calculate Precison
precison = precison_score(y_test, y_pred)
print("Precison:", precison)

# Model Selection
- Among the three models, Naive Bayes gives the best accuracy and has the fewest false positive predictions. So, it's usually the best choice when you want to get accurate results and avoid false alarms.

- However, if your main goal is to minimize false positives (Type I errors), then Support Vector Machine (SVM) is a better option. SVM works well when you want to reduce false positives, even if it means having slightly lower accuracy.

- Ultimately, the choice between Naive Bayes and SVM depends on what you care about most. If you want the highest accuracy, go for Naive Bayes. But if minimizing false positives is more important, SVM is the better choice.