In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

## 1.) Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## 2.) Load the dataset 

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
df_train = pd.read_csv(
    "/kaggle/input/spam-sms-classification/TrainDataset.csv"
)
df_test = pd.read_csv(
    "/kaggle/input/spam-sms-classification/TestDataset.csv"
)

In [None]:
# Keep handles on the frames exactly as loaded; their shapes are inspected
# later, after df_train / df_test have been rebound to other objects.
train, test = df_train, df_test

In [None]:
# Preview the first rows of the training frame as loaded (columns not yet renamed).
df_train.head()

In [None]:
# Replace the v1 / v2 column names with descriptive ones.
# rename() returns new frames, so `train` / `test` keep the original names.
df_train = df_train.rename({'v1': 'class_label', 'v2': 'message'}, axis='columns')
df_test = df_test.rename({'v2': 'message'}, axis='columns')

In [None]:
# Preview the training frame after the column rename.
df_train.head()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Preview the first rows of the test frame after the column rename.
df_test.head()

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Hold on to the target column; it is dropped from df_train before the
# train and test messages are combined for preprocessing.
class_label = df_train.class_label

In [None]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

## 3.) Exploratory Data Analysis

In [None]:
# Summary statistics of the remaining columns, computed separately for each class label.
df_train.groupby('class_label').describe()

In [None]:
# Add a 'length' column holding len() of every message.
df_train['length'] = df_train['message'].map(len)
df_train.head()

In [None]:
# One histogram of message length per class label.
df_train.hist(
    column='length',
    by='class_label',
    bins=50,
    figsize=(11, 5),
)
plt.show()

### Distribution of Target Variable

In [None]:
# Number of messages per class label.
df_train['class_label'].value_counts()

In [None]:
# Share of messages per class label (fractions summing to 1).
df_train['class_label'].value_counts(normalize=True)

In [None]:
# Two panels side by side: class share as a pie chart (left) and
# absolute class counts as a bar chart (right).
f, ax = plt.subplots(1, 2, figsize=(12, 4))
df_train.class_label.value_counts().plot.pie(explode=[0, 0.12], autopct='%1.3f%%', ax=ax[0])
# The column must be passed by keyword (recent seaborn releases reject it
# positionally), and the right-hand axes is targeted explicitly instead of
# relying on whichever axes happens to be current.
sns.countplot(x='class_label', data=df_train, ax=ax[1])
plt.show()

#### So, the dataset is imbalanced with respect to the target variable.

## 4.) Preprocessing the data

In [None]:
# Remove the target and the helper 'length' column in place so that only the
# columns shared with the test frame remain.
df_train.drop(columns=['class_label', 'length'], inplace=True)

In [None]:
## joined df_train is combination of train and test
# The training rows come first, followed by the test rows, so the two parts
# can be separated again by position after vectorisation.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index gives the combined frame a unique 0..n-1 index.
df_train = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Pull the message text out of the combined frame as a Series.
sms = df_train['message']
sms.head()

### Using *regular expressions* to replace email addresses, URLs, phone numbers and money symbols

In [None]:
# Replace email address with 'emailaddress'
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would make this a silent no-op.
# NOTE(review): the pattern is anchored with ^...$, so it only fires when the
# whole message is an email address — confirm that is intended.
final_sms = sms.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress', regex=True)

In [None]:
# Replace urls with 'webaddress'
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default.
# NOTE(review): the pattern is anchored with ^...$ and only covers http://, so
# it only fires when the whole message is such a URL — confirm that is intended.
final_sms = final_sms.str.replace(
    r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress', regex=True
)

In [None]:
# Replace money symbol with 'money-symbol'
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so the literal text '£|\$' would be searched for.
final_sms = final_sms.str.replace(r'£|\$', 'money-symbol', regex=True)

In [None]:
# Replace 10 digit phone number with 'phone-number'
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default.
# NOTE(review): the pattern is anchored with ^...$, so it only fires when the
# whole message is a phone number — confirm that is intended.
final_sms = final_sms.str.replace(
    r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone-number', regex=True
)

In [None]:
# Replace normal number with 'number'
# Matches integers and decimals (digits, optionally followed by '.digits').
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default.
final_sms = final_sms.str.replace(r'\d+(\.\d+)?', 'number', regex=True)

In [None]:
# remove punctuation
# Every character that is neither a word character nor whitespace becomes a space.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default.
final_sms = final_sms.str.replace(r'[^\w\d\s]', ' ', regex=True)

In [None]:
# collapse every run of whitespace between terms into a single space
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default.
final_sms = final_sms.str.replace(r'\s+', ' ', regex=True)

In [None]:
# remove leading and trailing whitespace
# str.strip() actually removes it; the previous regex substitution replaced the
# leading/trailing whitespace with a single space instead of deleting it.
final_sms = final_sms.str.strip()

In [None]:
# change words to lower case
# (the stop-word filter applied afterwards compares tokens case-sensitively)
final_sms = final_sms.str.lower()

In [None]:
# Display the messages after the regex clean-up and lower-casing.
final_sms

#### NLTK 

In [None]:
import nltk

In [None]:
# Drop English stop words from every message.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def _without_stop_words(text):
    """Return `text` rebuilt from its whitespace-separated tokens that are not in `stop_words`."""
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)


final_sms = final_sms.apply(_without_stop_words)

In [None]:
# Reduce every word to its stem with the Porter stemmer.
import nltk
ps = nltk.PorterStemmer()


def _stem_tokens(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem."""
    stems = [ps.stem(token) for token in text.split()]
    return ' '.join(stems)


final_sms = final_sms.apply(_stem_tokens)

In [None]:
# Display the messages after stop-word removal and stemming.
final_sms

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# creating a bag-of-words: count how often each token occurs over all messages.
# The iteration variable is deliberately not named `sms`: that name holds the
# Series of raw messages defined earlier, and overwriting it with a single
# string would break re-running the cleaning cells.
all_words = nltk.FreqDist(
    token
    for message in final_sms
    for token in word_tokenize(message)
)

In [None]:
# print the number of distinct tokens counted in the frequency distribution
distinct_token_count = len(all_words)
print('Number of words: {}'.format(distinct_token_count))

In [None]:
# print the 10 most frequent tokens together with their counts
top_ten = all_words.most_common(10)
print('10 most common words: {}'.format(top_ten))

In [None]:
# Keep only the token (not its count) of the 1200 most frequent entries.
temp = [token for token, _count in all_words.most_common(1200)]

In [None]:
# Display the 1200 most common words collected above.
# NOTE(review): this list is only displayed; the TfidfVectorizer below is built
# without it, so these words are not what restricts the feature set.
temp

#### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectoriser on the cleaned messages (train and test rows together)
# and expand the sparse result into a dense DataFrame, one column per term.
tfidf_model = TfidfVectorizer()
tfidf_vec = tfidf_model.fit_transform(final_sms)
dense_matrix = tfidf_vec.toarray()
tfidf_data = pd.DataFrame(dense_matrix)
tfidf_data.head()

### Separating train and test rows

In [None]:
# Shape of the training file as loaded; its row count marks where the training rows end.
train.shape

In [None]:
# Shape of the test file as loaded.
test.shape

In [None]:
# The test messages were added after the training messages before
# vectorisation, so every TF-IDF row past the first len(train) rows belongs to
# the test file. Deriving the split point from `train` avoids a hard-coded row
# count that silently breaks when the dataset changes.
df_test = tfidf_data.iloc[len(train):]

In [None]:
# The first len(train) TF-IDF rows are the training messages. The row count is
# taken from the loaded training file instead of being hard-coded, and .copy()
# makes this an independent frame so a column can be added to it safely.
df_train = tfidf_data.iloc[:len(train)].copy()

In [None]:
# Re-attach the target saved earlier; pandas aligns the Series to the frame by index label.
df_train['class_label'] = class_label

In [None]:
# (rows, columns) of the training feature frame including the label column.
df_train.shape

In [None]:
# Display the TF-IDF training frame with its label column.
df_train

## 5.) Model Building

In [None]:
# Feature matrix: every TF-IDF column. Target: the saved class labels.
X = df_train.drop(columns=['class_label'])
Y = class_label

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=.2,
    random_state=42,
)

In [None]:
# Row counts of the training split and of the validation split, in that order.
for split in (X_train, X_test):
    print(len(split))

### Import Evaluation metric

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 depth-limited trees; class_weight='balanced' is set
# because the target classes are imbalanced.
rfc_mod = RandomForestClassifier(
    n_estimators=50,
    max_depth=12,
    random_state=101,
    class_weight='balanced',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Train the random forest on the training split.
rfc_mod.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out validation split.
y_pred_rfc = rfc_mod.predict(X_test)

In [None]:
# Display the predicted validation labels.
y_pred_rfc 

In [None]:
# Validation metrics for the random forest.
# f1_score takes (y_true, y_pred); with average="weighted" the per-class scores
# are weighted by the true-label support, so swapping the arguments changes the
# reported value.
print("F1 Score :", f1_score(y_test, y_pred_rfc, average="weighted"))
print('Report:\n', classification_report(y_test, y_pred_rfc))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_rfc))

In [None]:
# Predict labels for the TF-IDF rows of the test file.
rfc_predicted = rfc_mod.predict(df_test)

In [None]:
# Write the random-forest test predictions to a one-column CSV (no index column).
test_predicted = pd.DataFrame({'class_label': rfc_predicted})
test_predicted.to_csv('test_predicted_rfc.csv', index=False)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Shallow decision tree (depth 5, at least 5 samples to split a node).
# Only the estimator is constructed here; it is fitted in the next cell, so
# fitting it here as well would train the same model twice.
dectre_mod = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=5)

In [None]:
# Train the decision tree on the training split.
dectre_mod.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out validation split.
y_pred_dectre = dectre_mod.predict(X_test)

In [None]:
# Display the predicted validation labels.
y_pred_dectre

In [None]:
# Validation metrics for the decision tree.
# f1_score takes (y_true, y_pred); with average="weighted" the per-class scores
# are weighted by the true-label support, so swapping the arguments changes the
# reported value.
print("F1 Score :", f1_score(y_test, y_pred_dectre, average="weighted"))
print('Report:\n', classification_report(y_test, y_pred_dectre))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_dectre))

In [None]:
# Predict labels for the TF-IDF rows of the test file.
dectre_predicted = dectre_mod.predict(df_test)

In [None]:
# Write the decision-tree test predictions to a one-column CSV (no index column).
test_predicted = pd.DataFrame({'class_label': dectre_predicted})
test_predicted.to_csv('test_predicted_DecisionTree.csv', index=False)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings and a fixed seed, trained on the training split.
logreg_mod = LogisticRegression(random_state=42)
logreg_mod.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out validation split.
y_pred_logreg = logreg_mod.predict(X_test)

In [None]:
# Display the predicted validation labels.
y_pred_logreg

In [None]:
# Validation metrics for the logistic regression.
# f1_score takes (y_true, y_pred); with average="weighted" the per-class scores
# are weighted by the true-label support, so swapping the arguments changes the
# reported value.
print("F1 Score :", f1_score(y_test, y_pred_logreg, average="weighted"))
print('Report:\n', classification_report(y_test, y_pred_logreg))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_logreg))

In [None]:
# Predict labels for the TF-IDF rows of the test file.
predicted_test = logreg_mod.predict(df_test)

In [None]:
# Write the logistic-regression test predictions to a one-column CSV (no index column).
test_predicted = pd.DataFrame({'class_label': predicted_test})
test_predicted.to_csv('test_predicted_logisticRgrsn.csv', index=False)