# Text Classification - SPAM DETECTION

In [None]:
#Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Loading the dataset into a dataframe
raw_data = pd.read_csv("/Users/kanikasinghal/Downloads/Task_1.csv")

In [None]:
raw_data.head(10)

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
#checking for null values
pd.isna(raw_data).sum()

labels    0
text      0
dtype: int64

In [None]:
# removing duplicates
cleaned_data = raw_data.drop_duplicates()

# Display the cleaned DataFrame
cleaned_data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Split the data into features (X) and target (y)
X = raw_data['text']
y = raw_data['labels']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')  # We use English stop words as a base

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Create confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model Accuracy: 0.9722
Confusion Matrix:
[[2891    2]
 [  91  360]]


In [None]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Use the best model to make predictions on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_tfidf)

# Calculate accuracy of the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy_best:.4f}")

# Create confusion matrix for the best model
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix (Best Model):")
print(conf_matrix_best)



Best parameters: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
Best cross-validation score: 0.9906521085705897
Best Model Accuracy: 0.9916
Confusion Matrix (Best Model):
[[2886    7]
 [  21  430]]


