## Importing Pandas

In [1]:
import pandas as pd

## Loading Dataset

In [2]:
df = pd.read_csv("spam.csv", encoding='latin1')
df.head(5)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['spam'] = df['type'].map({"spam":1, "ham":0}).astype(int)
df.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Tokenisation

In [4]:
df['text'][1]#before

'Ok lar... Joking wif u oni...'

In [5]:
df["text"] = df["text"].apply(lambda x: x.split())

In [6]:
df['text'][1]#after

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

## Stemming

In [7]:
df['text'][1]#before

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [8]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [9]:
df["text"] = df["text"].apply(lambda x: [porter.stem(word) for word in x])

In [10]:
df['text'][1]#after

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

## Lemmitization

In [11]:
df['text'][29]#before

['ahhh.',
 'work.',
 'i',
 'vagu',
 'rememb',
 'that!',
 'what',
 'doe',
 'it',
 'feel',
 'like?',
 'lol']

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
df["text"] = df["text"].apply(lambda x : [lemmatizer.lemmatize(word, pos="a") for word in x])

In [14]:
df['text'][29]#after

['ahhh.',
 'work.',
 'i',
 'vagu',
 'rememb',
 'that!',
 'what',
 'doe',
 'it',
 'feel',
 'like?',
 'lol']

## Stopword Removal

In [15]:
df['text'][39]#before

['hello!',
 'how',
 'you',
 'and',
 'how',
 'did',
 'saturday',
 'go?',
 'i',
 'was',
 'just',
 'text',
 'to',
 'see',
 'if',
 "you'd",
 'decid',
 'to',
 'do',
 'anyth',
 'tomo.',
 'not',
 'that',
 "i'm",
 'tri',
 'to',
 'invit',
 'myself',
 'or',
 'anything!']

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [17]:
df["text"] = df["text"].apply(lambda x : [word for word in x if not word in stop_words]).apply(' '.join)
df['text'][39]#after

"hello! saturday go? text see decid anyth tomo. i'm tri invit anything!"

In [18]:
df.head(10)

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0
5,spam,freemsg hey darl 3 week word back! i'd like fu...,1
6,ham,even brother like speak me. treat like aid pat...,0
7,ham,per request mell mell (oru minnaminungint nuru...,0
8,spam,winner!! valu network custom select receivea â...,1
9,spam,mobil 11 month more? u r entitl updat late col...,1


## Transforming Text Data into TF-IDF Vectors

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [20]:
tfidf = TfidfVectorizer()
y = df.spam.values
x = tfidf.fit_transform(df["text"])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1, test_size = 0.2, shuffle = False)

## Classification using Linear SVC

In [21]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [22]:
model = LinearSVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc_model = accuracy_score(y_pred, y_test)*100
print("accuracy :", acc_model)

accuracy : 87.5


## Classification using Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc_model = accuracy_score(y_pred, y_test)*100
print("accuracy :", acc_model)

accuracy : 87.5
