In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [None]:
df = pd.read_csv("/content/final_dataset1.csv")
df

Unnamed: 0,label,Hindi
0,ham,विषय: एनरॉन मेथनॉल; मीटर # : 988291\r\nयह उस न...
1,ham,विषय: 9 जनवरी 2001 के लिए एचपीएल नामांकन\r\n(स...
2,ham,"विषय: नियॉन रिट्रीट\r\nहो हो हो, हम वर्ष के उस..."
3,spam,"विषय: फ़ोटोशॉप, विंडोज़, कार्यालय। सस्ता । मुख..."
4,ham,विषय: पुन: भारतीय स्प्रिंग्स\r\nयह डील टेको पी...
...,...,...
11555,ham,मेरे बारे में आपकी राय? 1. अधिक 2. जड़ा 3. कुस...
11556,ham,"नवीनतम 8 पर, जी अभी भी वहाँ है यदि आप कुछ बारू..."
11557,ham,प्रभा..मुझे क्षमा करें..सचमुच..हृदय से मुझे क्...
11558,ham,लोल ठीक है आपने माफ कर दिया :)


# Pre-Processing

In [None]:
lbl = LabelEncoder()
y = lbl.fit_transform(df['label'])

In [None]:
x = df['Hindi']

In [None]:
def hindi_tokenizer(text):
    tokens = nltk.word_tokenize(text,language='hindi',preserve_line=True)

    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))

    # remove punctuation from each word
    words = [re_punc.sub('',w) for w in tokens]
    return words

In [None]:
tfidf = TfidfVectorizer(tokenizer=hindi_tokenizer)
x_vect = tfidf.fit_transform(x)



# Choose one of the sampling methods and classification model that gives best results.

## Sampling
#### SMOTE Method

In [None]:
smote = SMOTE(random_state=0)

In [None]:
x_vect.shape,y.shape

((11560, 210345), (11560,))

In [None]:
x_sm, y_sm = smote.fit_resample(x_vect,y)

In [None]:
x_sm.shape, y_sm.shape

((16138, 210345), (16138,))

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_sm,y_sm,test_size=0.3,random_state=24)

In [None]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((11296, 210345), (11296,), (4842, 210345), (4842,))

# SVM
### SMOTE

In [None]:
svm = SVC(kernel='linear',random_state=0)

In [None]:
svm.fit(x_train,y_train)

In [None]:
y_pred = svm.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred)

0.9855441038682907

In [None]:
f1_score(y_test,y_pred)

0.9853896103896104

In [None]:
import pickle

In [None]:
pickle_out=open("svm.pkl","wb")
pickle.dump(svm,pickle_out)
pickle_out.close()

In [None]:
import streamlit as st

def predict_emails(text):
    txt_vect = tfidf.transform([text])
    prediction = svm.predict(txt_vect)
    return f"The Mail {text} is: {lbl.inverse_transform(prediction)[0]}."

def main():
    st.title("Email Spam/Ham Classification")
    st.write("Enter an email below to classify if it's spam or ham!")

    html_temp = """<div style = "background-color:#25246 ; padding:10px">
    <h2 style = "color:white; text-align:center;"> Spam Email Classification </h2>
    </div>
    """

    st.markdown(html_temp, unsafe_allow_html= True) # To render the html code as html

    # Getting the input from the user
    input_text = st.text_input("Enter the message")

    spam_html = """
    <div style = background-color:#F4D03F; padding:10px >
    <h2 style = "color:white; text-align:center;"> This Email is Spam </h2>
    </div>
    """

    ham_html = """
    <div style = background-color:#F4D03F; padding:10px >
    <h2 style = "color:white; text-align:center;"> This Email is Ham </h2>
    </div>
    """

    if st.button("Click to predict"):
        output = predict_emails([[text]])


    st.success("The probability {}".format(output))



# Sampling
### ADASYN Preprocessing

In [None]:
ada = ADASYN(random_state=2,n_jobs=1)

In [None]:
x_vect.shape,y.shape

((11560, 210345), (11560,))

In [None]:
x_ada, y_ada = ada.fit_resample(x_vect,y)



In [None]:
x_ada.shape, y_ada.shape

((16214, 210345), (16214,))

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_ada,y_ada,test_size=0.3,random_state=24)

In [None]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((11349, 210345), (11349,), (4865, 210345), (4865,))

# SVM

### ADASYN

In [None]:
svm = SVC(kernel='linear',random_state=0)

In [None]:
svm.fit(x_train,y_train)

In [None]:
y_pred = svm.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred)

0.982322713257965

In [None]:
f1_score(y_test,y_pred)

0.9825699229833805

# Model Evaluation


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2424
           1       0.97      0.99      0.98      2441

    accuracy                           0.98      4865
   macro avg       0.98      0.98      0.98      4865
weighted avg       0.98      0.98      0.98      4865

