In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [22]:
df = pd.read_csv("SMSSpamCollection",  names=["label", "text"], delimiter="\t")
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [23]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
#stopwords.words('english')
import string
#string.punctuation
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [24]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
       if i.isalnum():
           y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))

    return ' '.join(y)

In [25]:
df['text'].apply(transform_text)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u pound prize 2 claim e...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object

In [28]:
df['text'] = df['text'].apply(transform_text)

In [29]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

In [30]:
df

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though
...,...,...
5567,1,2nd time tri 2 contact u pound prize 2 claim e...
5568,0,ü b go esplanad fr home
5569,0,piti mood suggest
5570,0,guy bitch act like interest buy someth el next...


In [31]:
df['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [32]:
y = pd.DataFrame(df["label"])
X = pd.DataFrame(df["text"])

In [33]:
X_train, X_combine, y_train, y_combine = train_test_split(
    X, y, test_size=0.30)
X_test, X_val, y_test, y_val = train_test_split(
    X_combine, y_combine, test_size=0.5)

In [34]:
train = pd.concat([y_train, X_train], axis=1)
test =pd.concat([y_test,X_test],axis=1)
validation=pd.concat([y_val,X_val],axis=1)

In [35]:
train.to_csv("train.csv")
test.to_csv("test.csv")
validation.to_csv("validation.csv")