In [150]:
import pandas as pd 
import numpy as np 
import lightgbm as lgb 
import matplotlib.pyplot as plt 
import os 
import sys 
from path import Path 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [151]:
# Read the raw SMS spam CSV, decoding it as latin-1.
df = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)

In [152]:
# Peek at five randomly chosen rows.
df.sample(n=5)

In [153]:
# (rows, columns) of the loaded frame.
df.shape

1. Data Cleaning






In [154]:
# Column dtypes and non-null counts.
df.info()

In [155]:
# Drop the three "Unnamed" columns.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df.drop(columns=unused_columns, inplace=True)
df.sample(n=5)

In [156]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
df.rename(columns=column_names, inplace=True)
df.sample(n=5)

In [157]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the target labels into integer codes.
encoder = LabelEncoder()

In [158]:
# Fit the encoder on the label column and overwrite it with the encoded values.
encoded_target = encoder.fit_transform(df['target'])
df['target'] = encoded_target
df.head()

In [159]:
# Count missing values per column.
df.isna().sum()

In [160]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [161]:
# Remove duplicate rows, keeping the first occurrence. The index is rebuilt so
# row labels stay contiguous (0..n-1) and agree with positional order in any
# frame later built from this one by position.
df = df.drop_duplicates(keep='first').reset_index(drop=True)
# Show both checks together: a bare expression in the middle of a cell is
# evaluated but never displayed.
df.duplicated().sum(), df.shape

2. EDA

In [162]:
# First rows of the frame after cleaning.
df.head()

In [163]:
# Class balance: number of messages per encoded target value.
df['target'].value_counts()

In [164]:
import matplotlib.pyplot as plt
# Derive the slice labels from the counted classes instead of hard-coding their
# order, so each slice is always labelled with the class it actually represents.
target_counts = df['target'].value_counts()
plt.pie(
    target_counts,
    labels=encoder.inverse_transform(target_counts.index),
    autopct="%0.2f",
)
plt.show()

In [165]:
import nltk

In [166]:
!pip install nltk

In [167]:
# Fetch every NLTK resource this notebook relies on: the tokenizer models used by
# word_tokenize/sent_tokenize and the stop-word list used by transform_text.
# Recent NLTK releases load the tokenizer from 'punkt_tab'; older ones use 'punkt'.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

In [168]:
# Length of each message in characters.
df['num_characters'] = df['text'].map(len)
df.head()

In [169]:
# Token count per message, using NLTK's word tokenizer.
df['num_words'] = df['text'].map(lambda message: len(nltk.word_tokenize(message)))
df.head()

In [170]:
# Sentence count per message, using NLTK's sentence tokenizer.
df['num_sentences'] = df['text'].map(lambda message: len(nltk.sent_tokenize(message)))
df.head()

In [171]:
# Summary statistics for the three length features across all messages.
length_features = ['num_characters', 'num_words', 'num_sentences']
df[length_features].describe()

In [172]:
# ham (encoded target 0): summary of the length features
df.loc[df['target'] == 0, ['num_characters', 'num_words', 'num_sentences']].describe()

In [173]:
# spam (encoded target 1): summary of the length features
df.loc[df['target'] == 1, ['num_characters', 'num_words', 'num_sentences']].describe()

In [174]:
import seaborn as sns
# Overlay the character-count distributions of the two classes (target 1 in red).
plt.figure(figsize=(12, 6))
chars_target_0 = df.loc[df['target'] == 0, 'num_characters']
chars_target_1 = df.loc[df['target'] == 1, 'num_characters']
sns.histplot(chars_target_0)
sns.histplot(chars_target_1, color='red')

In [175]:
# Overlay the word-count distributions of the two classes (target 1 in red).
plt.figure(figsize=(12, 6))
words_target_0 = df.loc[df['target'] == 0, 'num_words']
words_target_1 = df.loc[df['target'] == 1, 'num_words']
sns.histplot(words_target_0)
sns.histplot(words_target_1, color='red')

In [176]:
# Pairwise plots of the frame's columns, coloured by target.
sns.pairplot(df,hue='target')

In [177]:
# Correlate only the numeric columns: the frame still holds the raw text column,
# and DataFrame.corr() on non-numeric data raises in current pandas releases.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

3. Data Preprocessing

In [178]:
from nltk.corpus import stopwords

In [179]:
import string

In [180]:
# The punctuation characters defined by the string module.
string.punctuation

In [181]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used by transform_text below.
ps = PorterStemmer()
# Quick demo of stemming a single word.
ps.stem('loving')

In [182]:

_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def transform_text(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, keep only purely
    alphanumeric tokens, drop English stop words, then Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if nothing survives).
    """
    # The stop-word list is loaded once and kept as a set; re-reading it for
    # every token made each call needlessly slow.
    stop_words = _english_stop_words()
    # Alphanumeric tokens can never be punctuation, so no separate punctuation
    # filter is needed after the isalnum() check.
    return " ".join(
        ps.stem(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )

In [183]:
# Look the message up by position: drop_duplicates keeps the original index
# labels, so a label-based lookup may hit a removed label or a different row.
df['text'].iloc[10]

In [184]:
from nltk.corpus import stopwords
# Try the preprocessing function on one hand-written message.
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [185]:
# Preprocess every message and keep the result next to the raw text.
df['transformed_text'] = df['text'].map(transform_text)

In [186]:
# First rows, now including the transformed_text column.
df.head()

In [187]:
from wordcloud import WordCloud
# Word-cloud renderer: 500x500 canvas, white background, minimum font size 10.
wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white',
)

In [188]:
# Join every transformed message with target 1 and render it as a word cloud.
spam_text = df.loc[df['target'] == 1, 'transformed_text'].str.cat(sep=" ")
spam_wc = wc.generate(spam_text)

In [189]:
# Display the word cloud built from the target-1 messages.
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [190]:
# Join every transformed message with target 0 and render it as a word cloud.
ham_text = df.loc[df['target'] == 0, 'transformed_text'].str.cat(sep=" ")
ham_wc = wc.generate(ham_text)

In [191]:
# Display the word cloud built from the target-0 messages.
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [192]:
# Preview the frame before building the per-class word lists.
df.head()

In [193]:
# Flatten all transformed target-1 messages into one list of words.
spam_corpus = [
    word
    for msg in df.loc[df['target'] == 1, 'transformed_text'].tolist()
    for word in msg.split()
]

In [194]:
# Total number of words collected from the target-1 messages.
len(spam_corpus)

In [195]:
from collections import Counter
# Bar chart of the 30 most frequent words in the spam corpus. The counts are
# computed once, and x/y are passed by keyword because current seaborn releases
# no longer accept them positionally.
top_spam_words = pd.DataFrame(
    Counter(spam_corpus).most_common(30), columns=['word', 'count']
)
sns.barplot(x='word', y='count', data=top_spam_words)
plt.xticks(rotation='vertical')
plt.show()

In [196]:
# Flatten all transformed target-0 messages into one list of words.
ham_corpus = [
    word
    for msg in df.loc[df['target'] == 0, 'transformed_text'].tolist()
    for word in msg.split()
]

In [197]:
# Total number of words collected from the target-0 messages.
len(ham_corpus)

In [198]:
from collections import Counter
# Bar chart of the 30 most frequent words in the ham corpus. The counts are
# computed once, and x/y are passed by keyword because current seaborn releases
# no longer accept them positionally.
top_ham_words = pd.DataFrame(
    Counter(ham_corpus).most_common(30), columns=['word', 'count']
)
sns.barplot(x='word', y='count', data=top_ham_words)
plt.xticks(rotation='vertical')
plt.show()

In [199]:
# Text Vectorization
# using Bag of Words
# NOTE(review): this cell only previews the frame; no vectorizer is created here.
df.head()

## 4. Model Creation

In [200]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Bag-of-words vectorizer.
# NOTE(review): `cv` is not used in any of the visible cells; confirm it is still needed.
cv = CountVectorizer()
# TF-IDF vectorizer limited to max_features=3000.
tfidf = TfidfVectorizer(max_features=3000)

In [201]:
# Fit the TF-IDF vectorizer on the transformed messages and vectorize them.
transformed_messages = df['transformed_text'].to_numpy()
fitted_tf = tfidf.fit_transform(transformed_messages)

In [202]:
# Convert the TF-IDF result to a dense array.
X = fitted_tf.toarray()

In [203]:
# Encoded labels as a plain NumPy array.
y = df['target'].to_numpy()

In [204]:
# Wrap the dense matrix in a DataFrame (default integer column labels and index).
X_df = pd.DataFrame(X)

In [205]:
# Append the hand-crafted length features to the TF-IDF frame.
# X_df has a fresh 0..n-1 index while df keeps whatever labels survived
# drop_duplicates, so assigning the Series directly would align on labels and
# shift or NaN-fill rows. Copy the raw values so rows are matched by position.
for x in ["num_characters", "num_words", "num_sentences"]:
    X_df[x] = df[x].to_numpy()

In [206]:
# Encoded labels as a pandas Series.
y_df = df['target']

In [207]:
# Hold out a test split. The seed makes the split (and every metric derived from
# it) reproducible across runs; stratifying keeps the class ratio of the
# imbalanced target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, stratify=y_df, random_state=42
)

In [208]:
# Shapes of the train/test features and labels.
X_train.shape, y_train.shape , X_test.shape,y_test.shape

In [209]:
# Wrap both splits in LightGBM datasets; the held-out split is used for validation.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_test, label=y_test)

In [210]:
# LightGBM training configuration: binary objective scored with AUC.
parameters = dict(
    objective='binary',
    metric='auc',
    is_unbalance='true',
    boosting='gbdt',
    num_leaves=63,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=20,
    learning_rate=0.01,
    verbose=-1,
)

In [211]:
# Train with early stopping on the validation set. Early stopping is requested
# through the callback API because the `early_stopping_rounds` keyword of
# lgb.train is not accepted by current LightGBM releases.
# NOTE(review): the same split is used for early stopping and for the reported
# validation AUC, so that score is optimistic.
model_lgbm = lgb.train(
    parameters,
    train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

In [212]:
# Score both splits and report the ROC AUC of each.
y_train_pred = model_lgbm.predict(X_train)
y_valid_pred = model_lgbm.predict(X_test)

train_auc = roc_auc_score(y_train, y_train_pred)
valid_auc = roc_auc_score(y_test, y_valid_pred)
print("AUC Train: {:.4f}\nAUC Valid: {:.4f}".format(train_auc, valid_auc))

In [213]:
# Build the features for one hand-picked message exactly as the training rows
# were built.
testing_text  =  "England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ì¼1.20 POBOXox36504W45WQ 16+"
# The vectorizer was fitted on transform_text output, so the message must go
# through the same preprocessing before being vectorized.
testing_vector = tfidf.transform([transform_text(testing_text)])
# The counts are taken from the raw message itself (previously the word and
# sentence counts were computed on a leftover loop variable `x`).
no_characters = len(testing_text)
no_words = len(nltk.word_tokenize(testing_text))
no_sentences = len(nltk.sent_tokenize(testing_text))

In [214]:
# NOTE(review): in the visible cells `x` is only assigned by the feature-column
# loop, so this displays the last column name from that loop, not the test
# message. Looks like a leftover debugging cell; confirm and remove.
x

In [215]:
# NOTE(review): this re-reads the raw CSV into `df`, replacing the cleaned and
# feature-augmented frame built above. None of the later visible cells read `df`;
# confirm whether this reload is intentional.
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv",encoding = 'latin-1')

In [216]:
# Build the single-row feature frame in the same column order used for training:
# TF-IDF columns first, then characters, words, sentences. The row is passed to
# the model as a bare array, so column order (not column name) decides which
# value feeds which feature.
testing_text_df = pd.DataFrame(testing_vector.toarray())
testing_text_df['num_characters'] = no_characters
testing_text_df['num_words'] = no_words
testing_text_df['num_sentences'] = no_sentences

In [217]:
def show_output(pred):
    """Print the class name that corresponds to a prediction.

    Values below 0.5 are reported as ``encoder.classes_[0]``; every other
    value is reported as ``encoder.classes_[1]``.
    """
    class_index = 0 if pred < 0.5 else 1
    print(f"message detected as { encoder.classes_[class_index]}")


In [218]:
# Score the single-row feature array and report the detected class.
prediction = model_lgbm.predict(testing_text_df.to_numpy())[0]
show_output(prediction)

In [219]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the classifier from the same pretrained checkpoint.
checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [220]:
import torch 

In [221]:
# Tokenize the sample straight into batched PyTorch tensors (no manual
# tensor/unsqueeze step) and run the classifier without autograd tracking,
# since this is inference only.
dict_samp = tokenizer(" England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ì¼1.20 POBOXox36504W45WQ 16+", return_tensors="pt")
with torch.no_grad():
    outputs = model(
        input_ids=dict_samp['input_ids'],
        attention_mask=dict_samp['attention_mask'],
    )

In [222]:
# Report the class whose logit is largest.
# NOTE(review): assumes the checkpoint's label indices match encoder.classes_ order; confirm.
predicted_label = outputs.logits.argmax().numpy()
show_output(predicted_label)

In [223]:
import pickle
# Persist the fitted vectorizer and the trained model. Context managers make
# sure each file is flushed and closed once its dump completes.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_lgbm, model_file)