In [41]:
pip install translate emoji



In [50]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
#given dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

train, test = train_test_split(data, test_size=0.2)

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train['v2'])
y_train = train['v1']

X_test = vectorizer.transform(test['v2'])
y_test = test['v1']

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy on given dataset:", accuracy)

Accuracy on given dataset: 0.9739910313901345


In [52]:
#puncutation
data = pd.read_csv('spam.csv', encoding='latin-1')

data['v2_lower'] = data['v2'].str.lower()
data['v2_Tokens'] = data['v2_lower'].apply(nltk.word_tokenize)
data['v2_No_Punctuation'] = data['v2_Tokens'].apply(lambda x: ' '.join([word for word in x if word.isalnum()]))

stop_words = set(stopwords.words('english'))
data['v2_No_Stopwords'] = data['v2_No_Punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

train, test = train_test_split(data, test_size=0.2)

if len(train['v1'].unique()) < 2:
    print("Training data contains only one class. Model training cannot proceed.")
else:

    vectorizer = TfidfVectorizer()

    Xnp_train = vectorizer.fit_transform(train['v2_No_Punctuation'])
    ynp_train = train['v1']

    Xnp_test = vectorizer.transform(test['v2_No_Punctuation'])
    ynp_test = test['v1']
    model = LogisticRegression()

    model.fit(Xnp_train, ynp_train)

    ynp_pred = model.predict(Xnp_test)
    accuracynp = metrics.accuracy_score(ynp_test, ynp_pred)
    print("Accuracy on no punctuation dataset:", accuracynp)

Accuracy on no punctuation dataset: 0.9704035874439462


In [53]:
#stemmed dataset
stemmer = PorterStemmer()
data['v2_Stemmed'] = data['v2_No_Stopwords'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

train, test = train_test_split(data, test_size=0.2)

if len(train['v1'].unique()) < 2:
    print("Training data contains only one class. Model training cannot proceed.")
else:
    vectorizer = TfidfVectorizer()
    Xs_train = vectorizer.fit_transform(train['v2_Stemmed'])
    ys_train = train['v1']

    Xs_test = vectorizer.transform(test['v2_Stemmed'])
    ys_test = test['v1']

    model = LogisticRegression()

    model.fit(Xs_train, ys_train)

    ys_pred = model.predict(Xs_test)

    accuracys = metrics.accuracy_score(ys_test, ys_pred)
    print("Accuracy on stemmed dataset:", accuracys)


Accuracy on stemmed dataset: 0.9587443946188341


In [54]:
#lemetaized dataset
lemmatizer = WordNetLemmatizer()
data['v2_Lemmatized'] = data['v2_No_Stopwords'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

train, test = train_test_split(data, test_size=0.2)

if len(train['v1'].unique()) < 2:
    print("Training data contains only one class. Model training cannot proceed.")
else:
    vectorizer = TfidfVectorizer()
    Xl_train = vectorizer.fit_transform(train['v2_Lemmatized'])
    yl_train = train['v1']

    Xl_test = vectorizer.transform(test['v2_Lemmatized'])
    yl_test = test['v1']

    model = LogisticRegression()

    model.fit(Xl_train, yl_train)

    yl_pred = model.predict(Xl_test)

    accuracyl = metrics.accuracy_score(yl_test, yl_pred)
    print("Accuracy on lemmatized dataset:", accuracyl)


Accuracy on lemmatized dataset: 0.9542600896860987


In [56]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv('spam.csv', encoding='latin-1')

train, test = train_test_split(data, test_size=0.2, random_state=42)

# Function to preprocess text data
def preprocess_data(df, column, preprocessing_function):
    df['processed_text'] = df[column].apply(preprocessing_function)
    return df

def train_logistic_regression(X_train, y_train, X_test, y_test):
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predict(X_test_tfidf)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    return accuracy

# Raw Data
accuracy_raw = train_logistic_regression(train['v2'], train['v1'], test['v2'], test['v1'])
print("Accuracy on Raw Data:", accuracy_raw)

# Lowercase Data
train = preprocess_data(train, 'v2', lambda x: x.lower())
test = preprocess_data(test, 'v2', lambda x: x.lower())
accuracy_lowercase = train_logistic_regression(train['processed_text'], train['v1'], test['processed_text'], test['v1'])
print("Accuracy on Lowercase Data:", accuracy_lowercase)

# No Punctuation Data
train = preprocess_data(train, 'v2', lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.isalnum()]))
test = preprocess_data(test, 'v2', lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.isalnum()]))
accuracy_no_punctuation = train_logistic_regression(train['processed_text'], train['v1'], test['processed_text'], test['v1'])
print("Accuracy on No Punctuation Data:", accuracy_no_punctuation)

# Stemmed Data
stemmer = PorterStemmer()
train = preprocess_data(train, 'v2', lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
test = preprocess_data(test, 'v2', lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
accuracy_stemmed = train_logistic_regression(train['processed_text'], train['v1'], test['processed_text'], test['v1'])
print("Accuracy on Stemmed Data:", accuracy_stemmed)

# Lemmatized Data
lemmatizer = WordNetLemmatizer()
train = preprocess_data(train, 'v2', lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
test = preprocess_data(test, 'v2', lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
accuracy_lemmatized = train_logistic_regression(train['processed_text'], train['v1'], test['processed_text'], test['v1'])
print("Accuracy on Lemmatized Data:", accuracy_lemmatized)

# Conclusion
accuracies = {
    "Raw Data": accuracy_raw,
    "Lowercase Data": accuracy_lowercase,
    "No Punctuation Data": accuracy_no_punctuation,
    "Stemmed Data": accuracy_stemmed,
    "Lemmatized Data": accuracy_lemmatized
}

best_preprocessing = max(accuracies, key=accuracies.get)
print(f"\nLogistic Regression provides the best accuracy with {best_preprocessing} preprocessing.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy on Raw Data: 0.9659192825112107
Accuracy on Lowercase Data: 0.9659192825112107
Accuracy on No Punctuation Data: 0.9632286995515695
Accuracy on Stemmed Data: 0.9650224215246637
Accuracy on Lemmatized Data: 0.967713004484305

Logistic Regression provides the best accuracy with Lemmatized Data preprocessing.
