In [2]:
import pandas as pd

csv_file = "spam.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file, encoding='ISO-8859-1')

df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df = df.dropna()
df = df.rename(columns={'v1': 'labels', 'v2': 'text'})

label_encoding = {'ham': 0, 'spam': 1}
df['labels'] = df['labels'].map(label_encoding)

df.head()


Unnamed: 0,labels,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#nltk.download('stopwords')

def clean_text(text):
    # Remove HTML tags using regular expressions
    clean_text = re.sub('<.*?>', '', text)

    # Remove digits and special symbols
    clean_text = re.sub(r'\d+', '', clean_text)  # Remove digits
    clean_text = re.sub(r'[^\w\s]', '', clean_text)  # Remove special symbols
    
    # Remove punctuation
    clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))

    # Convert text to lowercase
    clean_text = clean_text.lower()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = clean_text.split()
    clean_words = [word for word in words if word not in stop_words]
    clean_text = ' '.join(clean_words)

    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in clean_words]
    clean_text = ' '.join(stemmed_words)

    return clean_text

In [4]:
df["features"] = df["text"].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,labels,text,features
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [5]:
df.to_csv("spam_new.csv", index=False)