In [1]:
import numpy as np
import pandas as pd

In [3]:
file = 'sms_spam.csv'

# Reading with pandas' default UTF-8 codec fails on this file. Catch the error
# and show it, so the notebook still runs top to bottom on a fresh kernel; the
# file is read again with an explicit encoding in a later cell.
try:
    df = pd.read_csv(file)
except UnicodeDecodeError as err:
    print(f"UnicodeDecodeError: {err}")

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 606-607: invalid continuation byte

In [4]:
# Work around the UnicodeDecodeError: let chardet guess the file's actual
# encoding from its first 100000 raw bytes.
import chardet

with open(file, mode='rb') as raw_fh:
    head_bytes = raw_fh.read(100000)
result = chardet.detect(head_bytes)
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

In [5]:
# Load the CSV again, this time passing the encoding chardet reported
df = pd.read_csv(
    filepath_or_buffer=file,
    encoding='Windows-1252',
)
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [7]:
# Shape of the raw frame as (rows, columns), before any cleaning is applied
df.shape

(5572, 5)

## Data Cleaning

In [8]:
# Per-column non-null counts and dtypes, to spot columns that are mostly empty
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [9]:
# Columns Unnamed: 2, 3 and 4 are almost entirely null (50, 12 and 6 non-null
# values out of 5572 rows), so they are dropped.
# The result is reassigned instead of using inplace=True, matching how the
# duplicate-removal cell further down rebinds df.

df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [10]:
# Peek at 5 random rows; the fixed random_state makes the same rows appear on
# every run, so the displayed output is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,v1,v2
817,ham,Also are you bringing galileo or dobby
864,ham,Dude ive been seeing a lotta corvettes lately
2844,ham,Will be out of class in a few hours. Sorry
2631,spam,"URGENT! Your mobile No 077xxx WON a å£2,000 Bo..."
3610,ham,No. I.ll meet you in the library


In [11]:
# The original column names (v1, v2) are not self-explanatory, so rename them:
# v1 holds the ham/spam label, v2 holds the message body.
# Reassign instead of renaming in place, matching how the duplicate-removal
# cell further down rebinds df.

df = df.rename(columns={'v1': 'target', 'v2': 'text'})

# Fixed random_state so the sampled rows are the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,target,text
1783,ham,No dear i do have free messages without any re...
2938,ham,Are you ok. What happen to behave like this
3247,ham,Also track down any lighters you can find
3405,ham,\HEY DAS COOL... IKNOW ALL 2 WELLDA PERIL OF S...
4609,ham,Just glad to be talking to you.


In [12]:
# In this dataset 'ham' marks a legitimate message and 'spam' an unwanted one,
# so ham is encoded as 0 and spam as 1.
# LabelEncoder numbers the classes in sorted order, which yields exactly that mapping.

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [13]:
# Fit the encoder on the string labels and overwrite the column with the integer codes
df['target'] = encoder.fit_transform(df['target'])

In [14]:
# Confirm the target column now holds integer codes instead of 'ham'/'spam'
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Count the missing entries in each column
df.isna().sum()

target    0
text      0
dtype: int64

In [16]:
# Count rows that exactly repeat an earlier row (same target and same text)
df.duplicated().sum()

403

In [17]:
# Keep only the first occurrence of each repeated row, then re-count the
# duplicates to confirm none are left
df = df[~df.duplicated(keep='first')]
df.duplicated().sum()

0

In [18]:
# Shape of the frame after dropping the extra columns and the duplicate rows
df.shape

(5169, 2)

In [19]:
# Persist the cleaned frame to disk.
# NOTE(review): with to_csv's defaults the DataFrame index is written as an
# unnamed first column, and that index has gaps after de-duplication. Consider
# index=False if downstream readers do not need it — confirm with consumers first.
df.to_csv('cleaned_sms_spam.csv')