# In [None]:
# importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data collection & preprocessing

# Load the raw mail dataset. The code below reads a 'Category' column
# (compared against 'spam' / 'ham') and a 'Message' column (the mail text).
raw_mail_data = pd.read_csv('/content/mail_data.csv')

# Replace missing values with an empty string. fillna returns a new frame,
# so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.fillna('')

# Label encoding: spam -> 0, ham -> 1.
# Series.map builds a fresh integer column in one pass. Writing ints into the
# existing text column through .loc leaves it as object dtype at best, and is
# rejected when pandas stores the column with its dedicated string dtype.
label_codes = {'spam': 0, 'ham': 1}
encoded_category = mail_data['Category'].map(label_codes)

# Labels outside the mapping (including blanks that came from missing values)
# map to NaN; fail here with a clear message instead of at a later int cast.
unknown_labels = encoded_category.isna()
if unknown_labels.any():
    raise ValueError(
        "Unexpected values in 'Category': {}".format(
            sorted(mail_data.loc[unknown_labels, 'Category'].astype(str).unique())
        )
    )
mail_data['Category'] = encoded_category.astype('int')

# Separate the data into texts (X) and labels (Y)
X = mail_data['Message']
Y = mail_data['Category']

# Hold out 10% of the samples as the test set; the fixed random_state keeps
# the split the same from run to run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=1,
)

# Cast both label series to integers
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

# Feature extraction: TF-IDF over lowercased text, English stop words removed
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit the vectorizer on the training texts only, then apply the same fitted
# vectorizer to the test texts
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Training the model

# Create a logistic regression classifier with default settings and fit it on
# the training features; fit() hands back the estimator itself
model = LogisticRegression().fit(X_train_features, Y_train)

# Evaluate the trained model on the data it was fitted on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on the training data : ', accuracy_on_training_data)

# Evaluate the trained model on the held-out test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on the test data : ', accuracy_on_test_data)

# One sample message to classify, wrapped in a list because the vectorizer
# is given a collection of texts
input_mail = ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"]

# Convert the text to a feature vector with the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Predict the label for the single sample
prediction = model.predict(input_data_features)

# Label 1 is reported as ham; any other label is reported as spam
print("Ham Mail" if prediction[0] == 1 else "Spam Mail")