# The aim is to employ Logistic Regression and Term Frequency-Inverse Document Frequency (TF-IDF) for spam classification, comparing accuracies across datasets to identify the most effective preprocessing technique.

## Perform the following tasks using built-in Python libraries:

#### - Perform classification of the data sets (Data 1 (Raw Data), Data 2 (Data with Lowercase), ..., Data n) using Logistic Regression.
#### - Use the TF-IDF vectorizer for feature extraction.
#### - Use 80% of data for training and 20% of data for testing.
#### - Check the accuracy of the model for each dataset. 
#### - Write a conclusion stating for which data set (Data 1 (Raw Data), Data 2 (Data with Lowercase), ..., Data n) Logistic Regression provides the best accuracy.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
!pip install deep_translator

In [None]:
# Fetch NLTK's stop-word lists; stopwords.words('english') in the stop-word
# removal step reads them.
nltk.download('stopwords')

In [None]:
# Fetch the Punkt tokenizer models, presumably the data word_tokenize needs in
# the tokenization step.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead -- confirm
# against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Fetch the WordNet corpus, presumably the data WordNetLemmatizer relies on in
# the lemmatization step.
nltk.download('wordnet')

In [None]:
# Load the dataset from spam.csv, decoding the file as ISO-8859-1
# (presumably because it is not valid UTF-8 -- confirm against the data file).
data = pd.read_csv("spam.csv", encoding="ISO-8859-1")
data.head()  # preview the first rows

In [None]:
# Discard the three "Unnamed" columns so only the remaining columns are kept.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = data.drop(unused_columns, axis=1)
data.head()

In [None]:
# Wrap the loaded data in the DataFrame `df`; every preprocessing step below
# adds its output to `df` as a new column.
df = pd.DataFrame(data)
df.head()

### 1. Lower Casing

In [None]:
# Task 1: Lowercasing
# Store a lower-cased copy of every message from column 'v2'.
df['lowercased_v2'] = df['v2'].apply(str.lower)
df.head()

### 2. Tokenization

In [None]:
# Task 2: Tokenization
# Run NLTK's word_tokenize over each lower-cased message and keep the result
# in column 'tokens'.
df['tokens'] = df['lowercased_v2'].apply(word_tokenize)
df.head()

### 3. Punctuation Mark Removal

In [None]:
# Task 3: Punctuation Mark Removal
# Drop every token that consists solely of punctuation characters and join the
# remaining tokens with single spaces. Joining with a space (rather than the
# empty string) keeps word boundaries intact, so the vectorizer still sees
# individual words instead of one fused run of characters per message.
punctuation_chars = set(string.punctuation)
df['cleaned_v2'] = df['tokens'].apply(
    lambda tokens: ' '.join(
        token for token in tokens
        # Character-wise test: also removes multi-character punctuation
        # tokens such as '...' that a whole-token lookup would miss.
        if not all(char in punctuation_chars for char in token)
    )
)
df.head()

### 4. Stop Word Removal

In [None]:
# Task 4: Stop Word Removal
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in `stop_words`, joined by single spaces."""
    kept = [token for token in tokens if token not in stop_words]
    return ' '.join(kept)


df['filtered_v2'] = df['tokens'].apply(_drop_stop_words)
df.head()

### 5. Stemming

In [None]:
# Task 5: Stemming
# Apply the Porter stemmer to every token and rebuild one space-separated
# string per message.
stemmer = PorterStemmer()
df['stemmed_v2'] = df['tokens'].apply(
    lambda tokens: ' '.join(map(stemmer.stem, tokens))
)
df.head()

### 6. Lemmatization

In [None]:
# Task 6: Lemmatization
# Apply the WordNet lemmatizer (default arguments) to every token and rebuild
# one space-separated string per message.
lemmatizer = WordNetLemmatizer()
df['lemmatized_v2'] = df['tokens'].apply(
    lambda tokens: ' '.join(map(lemmatizer.lemmatize, tokens))
)
df.head()

### 7. Translation

In [None]:
# Task 7: Translation
# Translate every lower-cased message to Spanish (target='es'), letting the
# translator detect the source language (source='auto'). A new GoogleTranslator
# object is constructed for each row.
# NOTE(review): `GoogleTranslator` is not imported anywhere in the visible
# source, so this cell raises NameError as written. The package installed
# earlier is deep_translator, so presumably
# `from deep_translator import GoogleTranslator` is missing -- confirm and add.
# translator = google_translator()
df['translated_v2'] = df['lowercased_v2'].apply(lambda x: GoogleTranslator(source='auto', target='es').translate(x))  # Translate to Spanish
df.head()

### 8. Emoji to Text

In [None]:
# Task 8: Emoji to text
# Pass every original message from 'v2' through emoji.demojize and store the
# result in column 'emoji_to_v2'.
# NOTE(review): the `emoji` module is neither imported nor installed anywhere
# in the visible source, so this cell raises NameError as written -- confirm
# the intended package and add the import.
df['emoji_to_v2'] = df['v2'].apply(lambda x: emoji.demojize(x))
df.head()

## Perform the above tasks:

In [None]:
# Preview the first rows of `df`, now including the columns added by the
# preprocessing steps above.
df.head()

In [None]:
# Train and evaluate a TF-IDF + Logistic Regression classifier on each
# preprocessing variant of the messages, then report which variant scores best
# on each metric.
data_columns = ['v2', 'lowercased_v2', 'tokens', 'cleaned_v2', 'filtered_v2', 'stemmed_v2', 'lemmatized_v2']
accuracies = []
precisions = []
recalls = []
f1s = []


def _as_document(value):
    """Return *value* as a single text document for the vectorizer.

    Token lists are joined with single spaces so the vectorizer receives the
    tokens themselves rather than the Python repr of the list; any other value
    is converted with str().
    """
    if isinstance(value, list):
        return ' '.join(value)
    return str(value)


# The labels are the same for every variant, so look them up once.
y = df['v1']
for column in data_columns:
    X = df[column].apply(_as_document)
    # Training: 80/20 split. The fixed random_state makes the split
    # reproducible and identical across variants.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Fit the vocabulary and IDF weights on the training part only, then reuse
    # them to transform the held-out part.
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    # Prediction
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    # 'spam' is treated as the positive class for precision, recall and F1.
    precision = precision_score(y_test, y_pred, pos_label='spam')
    recall = recall_score(y_test, y_pred, pos_label='spam')
    f1 = f1_score(y_test, y_pred, pos_label='spam')
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"\nResults for {column}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

# Conclusion: for each metric, name the variant with the highest score.
# list.index returns the first occurrence, so ties go to the earliest column.
metric_scores = {
    'accuracy': accuracies,
    'precision': precisions,
    'recall': recalls,
    'F1 score': f1s,
}
print()  # blank line between the per-variant results and the conclusion
for metric_name, scores in metric_scores.items():
    best_index = scores.index(max(scores))
    best_dataset = data_columns[best_index]
    print(f'Logistic Regression provides the best {metric_name} with {best_dataset} having {metric_name} of {scores[best_index]}.')