In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score

In [5]:
#Loading the data into the pandas dataframe
dataset = pd.read_csv('mail_data.csv')

In [6]:
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#Replacing the null values with a null string
mail_data = dataset.where((pd.notnull(dataset)), '')

In [9]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Checking the number of rows and columns
mail_data.shape

(5572, 2)

In [11]:
#Label encoding of the Categorical variable

mail_data['Category'] = np.where(mail_data['Category']== 'spam', 0,1) 

In [12]:
mail_data['Category'].value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [13]:
#Separaring the data as Text and Label

In [14]:
X = mail_data['Message']
y = mail_data['Category']

In [15]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int32

In [16]:
#Splitting the data into Training and Test Data

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [19]:
print(X.shape, X_train.shape, y_test.shape)

(5572,) (4457,) (1115,)


In [20]:
#Feature extraction: Transform the test data to feature vector that can be used as input to the Logistic regression

In [25]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase =True)

In [26]:
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [31]:
#Converting y_train and y_test as integers
mail_data['Category'].dtype

dtype('int32')

In [35]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [36]:
#Training the Model using LogisticRegression

In [37]:
model = LogisticRegression()

In [39]:
model.fit(X_train_features, y_train)

In [41]:
#Evaluating the Trained model: Prediction on the trained data using accuracy score
train_pred = model.predict(X_train_features)
train_accuracy = accuracy_score(train_pred, y_train)
print('The accuracy on training data:', train_accuracy)

The accuracy on training data: 0.9681400044873233


In [43]:
#Evaluating the Trained model: Prediction on the trained data using ROC AUC score
train_pred = model.predict(X_train_features)
train_accuracy = roc_auc_score(train_pred, y_train)
print('ROC AUC on training data:', train_accuracy)

ROC AUC on training data: 0.9793617818250904


In [45]:
#Evaluating the Test model: Prediction on the test data using accuracy score
test_pred = model.predict(X_test_features)
test_accuracy = accuracy_score(test_pred, y_test)
print('The accuracy on test data:', test_accuracy)

The accuracy on test data: 0.9704035874439462


In [46]:
#Evaluating the Test model: Prediction on the test data using ROC AUC score
test_pred = model.predict(X_test_features)
test_accuracy = roc_auc_score(test_pred, y_test)
print('ROC AUC on test data:', test_accuracy)

ROC AUC on test data: 0.9796736391563977


In [None]:
#i didnt build the predictive system considering the the "Spam" to "Ham" ratio shows the dataset to be imbalanced and accuracy_score might really not do justice to the accuracy of the dataset and also in its Prediction
#this dataset is still subject to reactification, prolly i'll try undersampling the majority to get a more balanced dataset. Thanks