In [None]:
import numpy as np
import pandas as pd

# In this section, we first load the dataset into a pandas dataframe and then perform some basic exploratory data analysis (EDA).

In [None]:
# Load the semicolon-separated dataset and preview its first rows.
df = pd.read_csv('ml_dataset.csv', sep=';')
df.head()

In [None]:
# Labels: one entry per row of the dataset.
df.loc[:, 'Target']

In [None]:
# Raw e-mail content, one message per row.
df.loc[:, 'Message']

In [None]:
# Discard rows that lack a label or a message.
# pandas.read_csv parses empty fields as NaN by default, so comparing against ''
# alone never matches them, and a NaN message would later break the
# string-based header extraction. Treat both NaN and '' as missing.
required_columns = ['Target', 'Message']
is_missing = df[required_columns].isna() | df[required_columns].eq('')
rows_to_drop = df.index[is_missing.any(axis=1)]
df.drop(index=rows_to_drop, inplace=True)

print(df.shape)

# Our text features are in the Message column and their corresponding labels are in the Target column.

In [None]:
# Report the current number of rows in df.
print(f"There are {len(df)} rows in the dataset.")

# Feature extraction

In [None]:
# Extract the first "Subject:" header line from every message.
# The pattern captures from the first occurrence of 'Subject:' up to (but not
# including) the next newline, or up to the end of the message when the header
# is the last line. Messages without the header, or non-string entries,
# yield '' so they can be filtered out afterwards.
subject_list = (
    df['Message']
    .str.extract(r'(Subject:[^\n]*)', expand=False)
    .fillna('')
    .tolist()
)

# Built from a plain list on purpose: the resulting 0..n-1 index is the row
# position of each message in df.
df_subject_list = pd.DataFrame(subject_list, columns=['Subject'])
df_subject_list

In [None]:
# Remove, in place, the rows for which no Subject line was extracted.
has_no_subject = df_subject_list['Subject'] == ''
df_subject_list.drop(index=df_subject_list.index[has_no_subject], inplace=True)

print(df_subject_list.shape)

In [None]:
# Extract the first "Date:" header line from every message.
# The pattern captures from the first occurrence of 'Date:' up to (but not
# including) the next newline, or up to the end of the message when the header
# is the last line. Messages without the header, or non-string entries,
# yield '' so they can be filtered out afterwards.
date_list = (
    df['Message']
    .str.extract(r'(Date:[^\n]*)', expand=False)
    .fillna('')
    .tolist()
)

# Built from a plain list on purpose: the resulting 0..n-1 index is the row
# position of each message in df.
df_date_list = pd.DataFrame(date_list, columns=['Date'])
df_date_list

In [None]:
# Remove, in place, the rows for which no Date line was extracted.
has_no_date = df_date_list['Date'] == ''
df_date_list.drop(index=df_date_list.index[has_no_date], inplace=True)

print(df_date_list.shape)

In [None]:
# Extract the first "From:" header line from every message.
# The pattern captures from the first occurrence of 'From:' up to (but not
# including) the next newline, or up to the end of the message when the header
# is the last line. Messages without the header, or non-string entries,
# yield '' so they can be filtered out afterwards.
from_list = (
    df['Message']
    .str.extract(r'(From:[^\n]*)', expand=False)
    .fillna('')
    .tolist()
)

# Built from a plain list on purpose: the resulting 0..n-1 index is the row
# position of each message in df.
df_from_list = pd.DataFrame(from_list, columns=['From'])

df_from_list

In [None]:
# Remove, in place, the rows for which no From line was extracted.
has_no_sender = df_from_list['From'] == ''
df_from_list.drop(index=df_from_list.index[has_no_sender], inplace=True)

print(df_from_list.shape)

# Analyzing features

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Tally how often each label occurs, print the tally, and draw it as a bar chart.
c = np.asarray(df['Target'])
c_count = Counter()
c_count.update(c)
print(c_count)

bar_labels = list(c_count.keys())
bar_heights = list(c_count.values())
plt.bar(bar_labels, bar_heights)

In [None]:
from collections import Counter
# Frequency of each extracted Subject line.
c = np.asarray(df_subject_list['Subject'])
c_count = Counter()
c_count.update(c)
c_count

In [None]:
from collections import Counter
# Frequency of each extracted Date line.
c = np.asarray(df_date_list['Date'])
c_count = Counter()
c_count.update(c)
c_count

In [None]:
from collections import Counter
# Frequency of each extracted From line.
c = np.asarray(df_from_list['From'])
c_count = Counter()
c_count.update(c)
c_count

# Testing and Training Data preparation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Split the x and y data into four parts: x_train, y_train, x_test, and y_test

In [None]:
# Features are the extracted Subject lines; labels come from the Target column.
x_data = df_subject_list['Subject']

# df_subject_list was built row by row from df['Message'] with a fresh 0..n-1
# index, and rows with an empty Subject were dropped afterwards. Its remaining
# index values are therefore the row positions of the surviving messages in df.
# Pick the labels by those positions so that x and y stay aligned and have the
# same length (taking df['Target'] as a whole would pair subjects with the
# labels of other messages).
y_data = df['Target'].iloc[x_data.index]
y_data.index = x_data.index

# Fraction of the samples used for training; the remainder is used for testing.
# NOTE(review): 0.3 trains on only 30% of the data and the rows are not
# shuffled before splitting -- confirm both are intended.
TRAIN_FRACTION = 0.3

# The split point is derived from the number of usable samples, not from df,
# which may still contain rows without a Subject.
split = int(TRAIN_FRACTION * len(x_data))
x_train = x_data.iloc[:split]
x_test = x_data.iloc[split:]
y_train = y_data.iloc[:split]
y_test = y_data.iloc[split:]

# Convert features to numbers

In [None]:
# Learn the vocabulary from the training subjects, then encode those subjects
# as a sparse matrix of token counts.
count_vector = CountVectorizer()
count_vector.fit(x_train)
extracted_features = count_vector.transform(x_train)
print(extracted_features)

# Training the Model

In [None]:
# Hyper-parameter grid for the SVM, one sub-grid per kernel.
# gamma is only used by the RBF kernel; listing it for the linear kernel as
# well would make the search fit every linear candidate once per gamma value
# with identical results.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Cross-validated grid search; after fitting, `model` predicts and scores with
# the best parameter combination found.
model = GridSearchCV(svm.SVC(), tuned_parameters)
model.fit(extracted_features, y_train)

print("Model Trained Successfully!")

# Computing Accuracy 

In [None]:
# Encode the held-out subjects with the vocabulary learned on the training set,
# then report the model's score on them as a percentage.
test_features = count_vector.transform(x_test)
accuracy_percent = model.score(test_features, y_test) * 100
print("Accuracy of the model is: ", accuracy_percent)