In [64]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/NEW_PROJECT/combined_data.csv")

print("Shape of Dataset: ",df.shape)
print("Null values: \n",df.isnull().sum())
print(df.isnull().any())
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape of Dataset:  (83448, 2)
Null values: 
 label    0
text     0
dtype: int64
label    False
text     False
dtype: bool


Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [65]:
#Distribution of labels (spam or not)

Distribution_label = df['label'].value_counts(normalize = True)*100
print("Percentage of Distributed label: \n",Distribution_label)

Percentage of Distributed label: 
 label
1    52.619595
0    47.380405
Name: proportion, dtype: float64


In [66]:
import re

def feature_extraction(txt):
  feature = {}
  feature['URL_Extraction'] = len(re.findall(r'https\S+|www\S+',txt))
  feature['NUM_Extraction'] = len(re.findall(r'\d+',txt))
  feature['Char_len_ex'] = len(txt)
  feature['Special_Char'] = len(re.findall(r'[^a-zA-Z0-9\s]',txt))
  feature['Word_count'] = len(txt.split())
  return feature

feature_new = df['text'].apply(feature_extraction).apply(pd.Series)

df = pd.concat([df,feature_new],axis=1)

In [67]:
df['URL_Extraction'].sum()
df['NUM_Extraction'].sum()
df['Char_len_ex'].sum()
df['Special_Char'].sum()

np.int64(2991005)

In [68]:

df.groupby('label')['Word_count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,39538.0,365.057944,984.32475,1.0,106.0,200.0,378.0,101984.0
1,43910.0,208.754634,338.269557,1.0,61.0,122.0,249.0,25393.0


In [69]:
df['text'] = df['text'].astype(str)
df['text'] = df['text'].apply(lambda x : x.lower())
df.head()

Unnamed: 0,label,text,URL_Extraction,NUM_Extraction,Char_len_ex,Special_Char,Word_count
0,1,ounce feather bowl hummingbird opec moment ala...,0,0,148,0,20
1,1,wulvob get your medircations online qnb ikud v...,0,0,808,1,103
2,0,computer connection from cnn com wednesday es...,0,0,2235,1,337
3,1,university degree obtain a prosperous future m...,0,0,592,1,76
4,0,thanks for all your answers guys i know i shou...,0,0,1362,32,222


In [70]:
import re
def handle_msg(txt):
  text = txt.lower()
  text = re.sub(r'http\S+|www\S+','URL',text)
  text = re.sub(r'\d+','NUM',text)
  text = re.sub(r'[^a-z0-9\s/\:]',' ',text)
  text = re.sub(r'\s+',' ',text).strip()

  return text

df['text'] = df['text'].apply(handle_msg)

In [71]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words('english'))


def remove_st(txt):
  text = txt.split()
  return ' '.join(word for word in text if word not in stop_words)

df['text'] = df['text'].apply(remove_st)

df['text'].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text
0,ounce feather bowl hummingbird opec moment ala...
1,wulvob get medircations online qnb ikud viagra...
2,computer connection cnn com wednesday escapenu...
3,university degree obtain prosperous future mon...
4,thanks answers guys know checked rsync manual ...


In [72]:
df.describe()

Unnamed: 0,label,URL_Extraction,NUM_Extraction,Char_len_ex,Special_Char,Word_count
count,83448.0,83448.0,83448.0,83448.0,83448.0,83448.0
mean,0.526196,0.003607,5.354005,1662.952725,35.84274,282.811775
std,0.499316,0.18558,29.231291,4178.578068,214.308859,724.818152
min,0.0,0.0,0.0,1.0,0.0,1.0
25%,0.0,0.0,0.0,449.0,1.0,80.0
50%,1.0,0.0,0.0,879.0,5.0,152.0
75%,1.0,0.0,2.0,1861.0,22.0,312.0
max,1.0,23.0,2950.0,598705.0,12741.0,101984.0


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(
    max_features = 5000,
    min_df = 2,
    stop_words = 'english'
)

x_idf = tf_idf.fit_transform(df['text'])



In [74]:
from scipy.sparse import hstack

features = df[['URL_Extraction','NUM_Extraction','Char_len_ex','Special_Char','Word_count']]

X = hstack([features.values,x_idf])
Y = df['label']

In [75]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42,stratify = Y)

In [76]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter = 5000,
    solver = 'liblinear',
    class_weight = 'balanced'
)

model.fit(x_train,y_train)

y_pred = model.predict(x_test)

In [77]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print("Accuracy : ",accuracy_score(y_test,y_pred))
print("Classification report: \n",classification_report(y_test,y_pred))
print("Confusion Matrix: ",confusion_matrix(y_test,y_pred))

Accuracy :  0.9756740563211503
Classification report: 
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      7908
           1       0.97      0.98      0.98      8782

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690

Confusion Matrix:  [[7666  242]
 [ 164 8618]]


In [78]:
SCAM_WORDS = {
    "urgent","verify","click","win","free","offer","congratulations",
    "account","blocked","suspended","limited","act","now",
    "otp","password","login","bank","kyc","refund","reward",
    "claim","security","alert","update","expire"
}


user_input = input("Enter a message: ")

def show_reasons(txt):
  because = []
  if re.search(r'http\S+|www\S+',txt):
    because.append("Email / Message contain URL!")

  if re.search(r'\d|\d+',txt):
    because.append("Contains OTP / Numeric Values!")

  if len(re.findall(r'[^a-zA-Z0-9\s]',txt)) > 20 :
    because.append("Special Characters appearce!")

  if len(txt)> 200:
    because.append("Usually long message!")

  if any(word in SCAM_WORDS for word in txt.lower().split()):
    because.append("Uses scam trigger word!")

  return because

reason = show_reasons(user_input)

from scipy.sparse import csr_matrix
def func(txt):
  feat_ex = feature_extraction(txt)
  clean_txt = handle_msg(txt)
  idf = tf_idf.transform([clean_txt])
  feat_array = np.array(list(feat_ex.values())).reshape(1,-1)
  feat_array = csr_matrix(feat_array)
  final_features = hstack([feat_array,idf])
  prediction = model.predict(final_features)
  return prediction[0]


label_smart = {0 : 'Not spam',
               1 : 'Spam'}

result = func(user_input)
predict = label_smart[result]
print(predict)
print('\n')

for reasons in reason:
  print(reasons)

print("Thank You for visiting our Email Fraud detection website!\n")
print("Have a Spamless day!")


Enter a message: er
Spam


Thank You for visiting our Email Fraud detection website!

Have a Spamless day!


In [79]:
import joblib

model = joblib.dump(model,"model.joblib")
tfidf = joblib.dump(tf_idf,"tfidf.joblib")
