<a href="https://colab.research.google.com/github/Venchislav/Data-Science/blob/main/SpamDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the spam/ham CSV with the ISO-8859-1 encoding and preview the first rows.
csv_path = '/content/sample_data/spam.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Count how many rows carry each distinct value of column 'v1'.
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# Inspect the dtype of every column.
df.dtypes

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object

In [None]:
# I don't consider the 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns important here, so drop them:

In [None]:
# Drop the three 'Unnamed' columns. errors='ignore' keeps this cell safe to
# re-run once the columns are already gone (a plain drop would raise KeyError).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Check which columns remain after the drop.
df.columns

Index(['v1', 'v2'], dtype='object')

In [None]:
# Give the two remaining columns descriptive names.
readable_names = {"v1": "type", "v2": "text"}
df = df.rename(columns=readable_names)

In [None]:
# Confirm the new column names.
df.columns

Index(['type', 'text'], dtype='object')

In [None]:
# Preview the text of the first few rows labelled 'spam'.
is_spam = df['type'] == 'spam'
df.loc[is_spam, 'text'].head()

2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: text, dtype: object

In [None]:
# Encode the label as an integer: 1 where the value is 'spam', 0 for anything else.
df['type'] = df['type'].eq('spam').astype('int64')

In [None]:
# Preview the frame after the label encoding.
df.head()

Unnamed: 0,type,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
# Fetch the NLTK resources used by the tokenizer and the stopword filter.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt', so both are requested to keep the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Repeats the 'stopwords' download already requested in the import cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split (and
# every metric computed from it) reproducible; stratifying on the label keeps
# the spam/ham ratio the same in both parts of this imbalanced dataset.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
    stratify=df['type'],
)

# **TEXT STEMMING AND PREPARATIONS**

In [None]:
# Walk through each preprocessing step on a single example message.
sentence_ex = df.iloc[1]['text']
tokens = word_tokenize(sentence_ex, language='english')

# A token is punctuation when every one of its characters is a punctuation mark.
# (`token not in string.punctuation` is a substring test, so multi-character
# tokens such as '...' were never removed.)
tokens_no_punctuation = [
    i for i in tokens if not all(ch in string.punctuation for ch in i)
]

# Stored under its own name: rebinding `stopwords` would replace the NLTK corpus
# reader with a plain collection and break every later `stopwords.words(...)` call,
# including a re-run of this cell.
english_stopwords = set(stopwords.words('english'))
# The stopword list is lowercase, so compare on the lowercased token.
tokens_no_sw_punct = [
    i for i in tokens_no_punctuation if i.lower() not in english_stopwords
]

snowball = SnowballStemmer(language='english')
stemmed_tokens = [snowball.stem(i) for i in tokens_no_sw_punct]


print(f'Input: {sentence_ex}\n{"-"*20}')
print(f'Tokens: {tokens}\n{"-"*20}')
print(f'Tokens no punctuation: {tokens_no_punctuation}\n{"-"*20}')
print(f'Tokens no punctuation and stopwords: {tokens_no_sw_punct}\n{"-"*20}')
print(f'Stemmed Tokens: {stemmed_tokens}\n{"-"*20}')

Input: Ok lar... Joking wif u oni...
--------------------
Tokens: ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']
--------------------
Tokens no punctuation: ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']
--------------------
Tokens no punctuation and stopwords: ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...']
--------------------
Stemmed Tokens: ['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...']
--------------------


In [None]:
snowball = SnowballStemmer(language='english')
# Looked up through the `nltk.corpus` package rather than the bare `stopwords`
# name, so this cell still works if that name has been rebound elsewhere.
# A set gives constant-time membership tests inside the tokenizer.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_sentence(sentence: str, remove_sw: bool = True) -> list:
  """Split a message into stemmed word tokens.

  Steps: NLTK word tokenization, removal of punctuation-only tokens,
  optional removal of English stopwords, Snowball stemming.

  Args:
    sentence: Raw message text.
    remove_sw: When True (default), English stopwords are dropped.

  Returns:
    List of stemmed tokens in their original order.
  """
  tokens = word_tokenize(sentence, language='english')
  # Drop tokens made up entirely of punctuation characters. A plain
  # `token not in string.punctuation` is a substring test and lets
  # multi-character tokens such as '...' through.
  tokens = [i for i in tokens if not all(ch in string.punctuation for ch in i)]
  if remove_sw:
    # The stopword list is lowercase, so compare on the lowercased token.
    tokens = [i for i in tokens if i.lower() not in english_stopwords]
  tokens = [snowball.stem(i) for i in tokens]
  return tokens

In [None]:
# Try the tokenizer on the message at row position 2.
example_text = df['text'].iloc[2]
tokenize_sentence(example_text)

['free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005',
 'text',
 'fa',
 '87121',
 'receiv',
 'entri',
 'question',
 'std',
 'txt',
 'rate',
 't',
 'c',
 "'s",
 'appli',
 '08452810075over18',
 "'s"]

In [None]:
# TF-IDF features built on the custom tokenizer. The function is passed directly
# (its default already removes stopwords): unlike a lambda it can be pickled.
# token_pattern=None silences scikit-learn's warning that the default pattern
# is unused when a tokenizer is supplied.
vectorizer = TfidfVectorizer(tokenizer=tokenize_sentence, token_pattern=None)

features = vectorizer.fit_transform(train['text'])



# **TRAIN TIME**

In [None]:
# Decision tree classifier with a fixed seed.
model = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the tree on the TF-IDF features of the training split.
model.fit(features, train['type'])

In [None]:
# Predict the label for the training row at position 6.
model.predict(features[6])

array([1])

In [None]:
# Raw text of the training message at position 6.
train['text'].iloc[6]

'This is the 2nd time we have tried 2 contact u. U have won the 750 Pound prize. 2 claim is easy, call 08712101358 NOW! Only 10p per min. BT-national-rate'

In [None]:
# Clearly a prize scam, and the model labelled it 1 (spam).

In [None]:
# End-to-end pipeline: raw text -> TF-IDF features -> classifier.
# The tokenizer function is passed directly instead of through a lambda so the
# pipeline can be pickled; token_pattern=None silences scikit-learn's warning
# that the default pattern is unused when a tokenizer is supplied.
model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=tokenize_sentence, token_pattern=None)),
    ("model", model)
])

In [None]:
# Fit the vectorizer and the classifier together on the raw training text.
model_pipeline.fit(train['text'], train['type'])



In [None]:
# Sanity check on a short greeting.
model_pipeline.predict(['Hello'])

array([0])

In [None]:
# Sanity check on a prize-style message.
model_pipeline.predict(['You won 500$!!! You are the winner!!'])

array([1])

In [None]:
# Precision of the pipeline on the held-out test split.
test_predictions = model_pipeline.predict(test["text"])
precision_score(y_true=test["type"], y_pred=test_predictions)


0.8645833333333334

In [None]:
# Recall of the pipeline on the held-out test split.
test_predictions = model_pipeline.predict(test["text"])
recall_score(y_true=test["type"], y_pred=test_predictions)

0.7830188679245284

Not the best, but not the worst🎪