In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style first, then matplotlib's "fivethirtyeight"
# style sheet. The order is kept as-is: the later call may override settings
# made by the earlier one.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Spam Classification with AutoML

# 1. Exploratory Data Analysis (EDA)

In [None]:
# Import and instantiate CountVectorizer with NON-default settings:
#   - ngram_range=(1, 2): extract both unigrams and bigrams
#   - stop_words='english': drop scikit-learn's built-in English stop words
# NOTE(review): `vect` is not referenced by any later cell visible in this notebook.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')

In [None]:
# Load the SMS spam CSV from its absolute Kaggle input path (latin-1 encoded).
sms = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
)
# Discard every column that contains at least one missing value; exactly two
# columns must remain, otherwise the rename below raises a length mismatch.
sms = sms.dropna(axis=1, how="any")
sms.columns = ['label', 'message']
sms.head()

In [None]:
# Per-class summary statistics of the messages.
# Jupyter only renders the LAST expression of a cell, so a bare
# `sms.groupby('label').describe()` in the middle of the cell was computed and
# silently discarded. display() (a builtin inside IPython kernels) shows it.
display(sms.groupby('label').describe())

# Convert the text label to a numerical variable: ham -> 0, spam -> 1.
# Any label other than 'ham'/'spam' would map to NaN.
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
sms.head()

In [None]:
# Character count of every message, stored next to the text.
sms['message_len'] = sms['message'].map(len)

# Overlay the ham and spam length distributions on one 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))

ham_lengths = sms.loc[sms['label'] == 'ham', 'message_len']
spam_lengths = sms.loc[sms['label'] == 'spam', 'message_len']

# Ham uses 35 bins; spam keeps the plotting default, exactly as before.
ham_lengths.plot(kind='hist', bins=35, color='blue', alpha=0.6,
                 label='Ham messages', ax=ax)
spam_lengths.plot(kind='hist', color='red', alpha=0.6,
                  label='Spam messages', ax=ax)

ax.legend()
ax.set_xlabel("Message Length")

# 2. Building and evaluating an AutoML model

In [None]:
# Re-read the raw CSV for the modelling part and keep only the columns 'v1'
# and 'v2' (presumably the label and the message text -- confirm against the file).
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
).loc[:, ['v1', 'v2']]
df

In [None]:
import string
from nltk.corpus import stopwords

# NLTK's English stop-word list extended with the extra tokens listed below;
# passed to the pycaret NLP setup as `custom_stopwords`.
STOPWORDS = [
    *stopwords.words('english'),
    'u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure',
]

In [None]:
!pip install pycaret

In [None]:
# Star import: brings the pycaret.nlp functions used below (setup, create_model,
# assign_model, evaluate_model) into the notebook namespace.
# NOTE(review): a later cell star-imports pycaret.classification, which rebinds
# `setup`; cell execution order therefore matters.
from pycaret.nlp import *

# Initialise the NLP experiment on `df`, timing the call with %time.
# session_id presumably fixes the random seed -- confirm in the pycaret docs.
%time nlp = setup(data = df, target ='v2', custom_stopwords = STOPWORDS, session_id = 21)
# The target passed above is the text column 'v2'; `df` only holds 'v1' and 'v2',
# so there is no 'label_num' column involved here.

In [None]:
# Fit a topic model with pycaret's model ID 'lda' (presumably Latent Dirichlet
# Allocation -- confirm in the pycaret docs) with multi_core enabled; timed.
%time m1 = create_model(model = 'lda', multi_core = True)

In [None]:
# Apply the fitted topic model to the data and keep the resulting frame.
# A later cell drops 'Dominant_Topic' and 'Perc_Dominant_Topic' from it, so
# those columns are presumably added by this call -- verify on the output.
%time lda_data = assign_model(m1)

In [None]:
# Preview the first rows of the topic-annotated frame.
lda_data.head()

In [None]:
# Inspect the fitted topic model with pycaret's evaluation helper.
# NOTE(review): presumably renders an interactive widget, so the output is not
# reproducible from code alone -- confirm.
evaluate_model(m1)

In [None]:
# Remove the raw text and the dominant-topic summary columns before
# classification (presumably leaving the label 'v1' plus the per-topic weight
# columns -- check the preview below).
# Reassignment with errors='ignore' keeps the cell re-runnable: the previous
# in-place drop raised KeyError on a second execution because the columns were
# already gone.
lda_data = lda_data.drop(
    columns=['v2', 'Dominant_Topic', 'Perc_Dominant_Topic'],
    errors='ignore',
)
lda_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a final test set; the fixed random_state makes
# the split repeatable.
Train, Test = train_test_split(
    lda_data,
    test_size=0.1,
    random_state=1,
)

In [None]:
# Star import of the classification API. From here on, names such as `setup`
# refer to pycaret.classification rather than the pycaret.nlp versions
# imported in an earlier cell.
from pycaret.classification import *
# Initialise the classification experiment on the training split with 'v1' as
# the target; timed with %time.
# train_size=0.9 presumably keeps 10% of `Train` as pycaret's internal hold-out
# set -- confirm in the pycaret docs.
%time setup2 = setup(data = Train, target = 'v1', session_id = 5, train_size = 0.9)

In [None]:
# Compare the candidate classifiers, sort them by accuracy and keep the three
# selected via n_select=3 (passed to blend_models as a list later); timed.
%time best_3 = compare_models(sort = 'Accuracy', n_select = 3)

In [None]:
# Combine the three selected models into one blended estimator, using
# method='soft' and 5 folds.
blended = blend_models(
    estimator_list=best_3,
    fold=5,
    method='soft',
)

In [None]:
# Score the blended model without passing explicit data; presumably this uses
# the hold-out portion reserved by the classification setup -- confirm.
pred_holdout = predict_model(blended)

In [None]:
# Finalize the blended model (presumably refits it on all data given to setup,
# hold-out included -- confirm in the pycaret docs).
final_model = finalize_model(blended)

In [None]:
# Predict on the 10% test split produced by train_test_split above.
# A later cell reads a 'Label' column from the returned frame.
Predictions = predict_model(final_model, data = Test)
Predictions

In [None]:
# Side-by-side view of the true label ('v1') and the predicted label ('Label').
Result = Predictions.loc[:, ['v1', 'Label']]
Result

In [None]:
# Accuracy on the test split: the fraction of rows where the predicted label
# equals the true label. The mean of the boolean comparison is vectorised
# (no Python-level loop via builtin sum) and gives NaN rather than raising
# ZeroDivisionError if `Result` is empty.
model_score = (Result['v1'] == Result['Label']).mean()
model_score