In [2]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk

# Load the dataset
from sklearn.preprocessing import LabelEncoder

# Read the CSV as latin-1, discard the three 'Unnamed' columns and give
# v1/v2 descriptive names -- one chain, no in-place mutation.
dataset = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'target', 'v2': 'text'})
)

# Replace the string labels with integer codes. LabelEncoder numbers the
# classes in sorted order, i.e. ham -> 0 and spam -> 1 (assuming those are
# the only two labels in the file).
encoder = LabelEncoder()
dataset['target'] = encoder.fit_transform(dataset['target'])

# Number of fully duplicated rows. This is not the last expression of the
# cell, so the value is computed but not displayed.
dataset.duplicated().sum()

# Keep only the first occurrence of every duplicated row.
dataset = dataset.drop_duplicates(keep='first')

#Data preprocessing
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words("english"))


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only purely alphanumeric tokens (this drops punctuation and
         any token containing a special character);
      4. drop English stop words;
      5. Porter-stem every remaining token.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``""`` when no token
        survives the filters.
    """
    from nltk.stem.porter import PorterStemmer

    # The stop-word list is loop-invariant: fetch it once (cached across
    # calls) as a set instead of rebuilding the list for every token.
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() can never be a piece of punctuation, so
    # a separate punctuation check would be redundant here.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(stemmer.stem(tok) for tok in kept)


# Store the preprocessed form of every message in its own column
dataset['transformed_text'] = dataset['text'].apply(transform_text)


# 4. Model Building
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF vectorizer, vocabulary capped at 3000 features
tfidf = TfidfVectorizer(max_features=3000)

# X: dense TF-IDF matrix of the messages, y: encoded ham/spam labels.
# NOTE(review): the vectorizer is fitted on every row *before* the split
# below, so the held-out rows also contribute to the TF-IDF statistics.
X = tfidf.fit_transform(dataset['transformed_text']).toarray()
y = dataset['target'].to_numpy()

# Splitting into training set and test set: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# RandomForestClassifier with 50 trees; fit() returns the estimator itself
rfc = RandomForestClassifier(n_estimators=50, random_state=2).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rfc = rfc.predict(X_test)

In [3]:
# Driver code: interactively classify a user-chosen number of messages.

def colored(r, g, b, text):
    """Wrap ``text`` in a 24-bit ANSI foreground colour.

    ``r``, ``g`` and ``b`` are the colour channels (0-255). The returned
    string ends with the SGR reset sequence (ESC[0m) so that whatever is
    printed next falls back to the terminal/notebook default colour instead
    of being forced to white.
    """
    return "\033[38;2;{};{};{}m{} \033[0m".format(r, g, b, text)


# A non-numeric answer would otherwise abort the cell with a ValueError.
try:
    n_cases = int(input("No.of test cases: "))
except ValueError:
    print("Number of test cases must be a whole number.")
    n_cases = 0

# The loop variable is deliberately not `_` / `__`: IPython uses those names
# for its output history.
for _case in range(n_cases):
    print("------Email/SMS Spam Classifier-------")
    print()
    input_sms = input("  Enter the message: ")

    # 1. preprocess
    transformed_sms = transform_text(input_sms)
    # 2. vectorize
    vector_input = tfidf.transform([transformed_sms])
    # 3. predict (label 1 is compared against as the spam class)
    result = rfc.predict(vector_input)[0]
    # 4. display: red for spam, green otherwise
    if result == 1:
        print(colored(255, 0, 0, "   Spam"))
    else:
        print(colored(0, 255, 0, "  Not Spam"))
    print("---------------------------------------")

No.of test cases: 2
------Email/SMS Spam Classifier-------

  Enter the message: hii
[38;2;0;255;0m  Not Spam [38;2;255;255;255m
---------------------------------------
------Email/SMS Spam Classifier-------

  Enter the message: congratulations you won 1000 call on this number to get your prize.
[38;2;1;0;0m   Spam [38;2;255;255;255m
---------------------------------------
