# 1. Import Modul

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,accuracy_score

# 2. Ambil Data

In [2]:
file = "spam.csv"

In [3]:
import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

In [7]:
data = pd.read_csv(file)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


# 3. Preprocessing

In [12]:
x = data[95:96]
print(x.values)
x =np.stack(x.values[0])
x

[['spam'
  'Your free ringtone is waiting to be collected. Simply text the password \\MIX\\" to 85069 to verify. Get Usher and Britney. FML'
  ' PO Box 5249' ' MK17 92H. 450Ppw 16"' nan]]


array(['spam',
       'Your free ringtone is waiting to be collected. Simply text the password \\MIX\\" to 85069 to verify. Get Usher and Britney. FML',
       ' PO Box 5249', ' MK17 92H. 450Ppw 16"', 'nan'], dtype='<U125')

In [13]:
y = "".join(x[1:])
y

'Your free ringtone is waiting to be collected. Simply text the password \\MIX\\" to 85069 to verify. Get Usher and Britney. FML PO Box 5249 MK17 92H. 450Ppw 16"nan'

In [14]:
# # Gabungkan text yang kepisah dalam kolom lain
data = data.fillna("")
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  5572 non-null   object
 3   Unnamed: 3  5572 non-null   object
 4   Unnamed: 4  5572 non-null   object
dtypes: object(5)
memory usage: 217.8+ KB
None


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
# menyatukan data-data yang kepisah
data_arr = data.iloc[:,1:].values
data_arr = list(map(lambda x: x[0]+x[1]+x[2]+x[3],data_arr))
data_arr[95]

'Your free ringtone is waiting to be collected. Simply text the password \\MIX\\" to 85069 to verify. Get Usher and Britney. FML PO Box 5249 MK17 92H. 450Ppw 16"'

In [16]:
data = data.filter(items=["v1","v2"])
data["v2"]=data_arr
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
features = data.v2.values
target = data.v1.values
print(features[:3])
print(target[:3])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
['ham' 'ham' 'spam']


In [18]:
vectorizer = CountVectorizer()
features_vectorizer = vectorizer.fit_transform(features)
print(features_vectorizer[3,:])

  (0, 2818)	1
  (0, 6683)	2
  (0, 7079)	1
  (0, 2839)	1
  (0, 3955)	1
  (0, 1055)	1
  (0, 7695)	1


# 4. Split Data Latih dan Uji

In [21]:
x_train,x_test,y_train,y_test = train_test_split(features_vectorizer,target,test_size=0.2)
print(x_train.shape)
print(x_test.shape)

(4457, 8717)
(1115, 8717)


# 5. Latih dan Prediksi Model

In [22]:
clf = LinearSVC()
model = clf.fit(x_train,y_train)

In [23]:
y_pred = model.predict(x_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [24]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(model.score(x_test,y_test))

[[963   2]
 [ 17 133]]
0.9829596412556054
0.9829596412556054
