# Spam Mail Detection
Classify emails as spam or ham using ML.

In [50]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("/content/mail_data.csv")

# Handle missing values
data = df.where((pd.notnull(df)), '')

# Convert labels: spam -> 0, ham/not spam -> 1
data.loc[data['Category'] == 'spam', 'Category'] = 0
data.loc[data['Category'].isin(['ham']), 'Category'] = 1

# Features (text) and labels (target)
x = data['Message']
y = data['Category'].astype(int)



In [51]:
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [52]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=3)

In [53]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [54]:
print(x_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

In [55]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [56]:
model.fit(x_train_features, y_train)

In [58]:
prediction = model.predict(x_train_features)
accuracy = accuracy_score(y_train, prediction)

In [59]:
print("Acc on training data", accuracy)

Acc on training data 0.9676912721561588


In [60]:
prediction_test = model.predict(x_test_features)
accuracy_test = accuracy_score(y_test, prediction_test)

In [61]:
print("Acc on test data", accuracy_test)

Acc on test data 0.9668161434977578


In [66]:
# Multiple inputs for testing
inputs = [
    "Well people in my college are pretty chill",    # likely ham
    "Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!",  # spam
    "Are you coming to the class tomorrow?",         # ham
    "Lowest prices guaranteed! Buy cheap meds online!!!",  # spam
    "Don’t forget our meeting at 5 pm today.",       # ham
    "URGENT! Your account has been suspended. Verify immediately.", # spam
]

# Transform inputs using the same TF-IDF vectorizer
input_data_features = feature_extraction.transform(inputs)

# Predict
predictions = model.predict(input_data_features)

# Print results
for text, label in zip(inputs, predictions):
    print(f"Message: {text}")
    if label == 1:
        print("Prediction: Ham mail \n")
    else:
        print("Prediction: Spam mail \n")


Message: Well people in my college are pretty chill
Prediction: Ham mail 

Message: Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!
Prediction: Spam mail 

Message: Are you coming to the class tomorrow?
Prediction: Ham mail 

Message: Lowest prices guaranteed! Buy cheap meds online!!!
Prediction: Ham mail 

Message: Don’t forget our meeting at 5 pm today.
Prediction: Ham mail 

Message: URGENT! Your account has been suspended. Verify immediately.
Prediction: Ham mail 

