# Importing libraries

In [48]:
import re
import csv
import nltk
import pandas
import sklearn
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split, learning_curve
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords

# Function for loading data from filepath

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` with pandas.

    Returns the parsed DataFrame, or ``None`` (after printing a message)
    when the file does not exist. Any other error raised by
    ``pandas.read_csv`` propagates to the caller.
    """
    frame = None
    try:
        frame = pandas.read_csv(file_path)
    except FileNotFoundError:
        # Best-effort: report the missing file and fall through to return None.
        print(f"The file at path '{file_path}' was not found.")
    return frame
    
# Load the email dataset; `texts` is None if "emails.csv" is not found
# (load_data prints a message instead of raising in that case).
texts = load_data("emails.csv")

# Loading data from filepath

In [50]:
# Read "emails.csv" into `texts` (None if the file is missing).
texts = load_data("emails.csv")
# Bare expression: shows the loaded frame as the cell output.
texts

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


# Function for preprocessing data

In [None]:
# Downloading NLTK stopwords (the corpus read by stopwords.words("english")
# inside preprocess_data)
nltk.download("stopwords")

def preprocess_data(data):
    """Clean the ``text`` column of ``data`` and return ``data``.

    The column is overwritten on the object passed in (no copy is made).
    Each entry is processed in this order:

    1. every character that is not an ASCII letter, a digit or whitespace
       is deleted;
    2. the result is lowercased;
    3. it is split on whitespace, tokens found in NLTK's English stopword
       list are dropped, and the remaining tokens are joined with single
       spaces.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')

    def strip_and_lower(raw):
        return disallowed.sub('', raw).lower()

    data['text'] = data['text'].apply(strip_and_lower)

    # Set membership keeps the per-token stopword check cheap.
    english_stopwords = set(stopwords.words("english"))

    def drop_stopwords(sentence):
        # str.split() with no argument never yields empty tokens.
        kept = [token for token in sentence.split() if token not in english_stopwords]
        return ' '.join(kept)

    data['text'] = data['text'].apply(drop_stopwords)

    return data

# preprocess_data rewrites the 'text' column of the frame it receives and
# returns that same frame, so `texts` still refers to the same object.
texts = preprocess_data(texts)

# Preprocessing data

In [49]:
# Clean the 'text' column (punctuation stripped, lowercased, stopwords removed).
texts = preprocess_data(texts)
# Bare expression: shows the preprocessed frame as the cell output.
texts

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject money get software cds software compat...,1
...,...,...
5723,subject research development charges gpg forwa...,0
5724,subject receipts visit jim thanks invitation v...,0
5725,subject enron case study update wow day super ...,0
5726,subject interest david please call shirley cre...,0


# Function for splitting the data

In [None]:
# Data splitting: 80% to training data, 5% to validation, 15% to test data
def split_data(data, test_size = 0.2, validation_size = 0.25, output_path = './'):
    """Split ``data`` into train/validation/test sets and write each to CSV.

    Parameters
    ----------
    data
        The dataset to split (in this file, the pandas DataFrame of emails).
    test_size
        Fraction of ``data`` held out from training. The held-out rows are
        then divided between the validation and test sets.
    validation_size
        Fraction of the held-out rows that becomes the validation set; the
        rest of the held-out rows form the test set. With the defaults the
        overall proportions are 80% train, 5% validation, 15% test.
    output_path
        Directory that receives ``train.csv``, ``validation.csv`` and
        ``test.csv`` (written without the index column).

    Returns
    -------
    None
        The splits are only written to disk.
    """
    # Hold out `test_size` of the rows; the remainder is the training set.
    train_data, held_out = train_test_split(data, test_size = test_size, random_state = 1)

    # train_test_split returns (remainder, test_size-portion). The portion
    # sized by `validation_size` must be the validation set, so it is the
    # SECOND element; the remainder is the final test set. (Unpacking in the
    # opposite order gives validation 15% / test 5% with the defaults.)
    test_data, validation_data = train_test_split(held_out, test_size = validation_size, random_state = 1)

    train_data.to_csv(f'{output_path}/train.csv', index = False)
    validation_data.to_csv(f'{output_path}/validation.csv', index = False)
    test_data.to_csv(f'{output_path}/test.csv', index = False)

# Splitting the data

In [52]:
# Split with the default sizes and write train.csv, validation.csv and
# test.csv under the default output_path './'.
split_data(texts)