In [23]:
import pandas as pd

df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [24]:
#drop the columns that are not needed
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# remove whitespace from the column names
df['v1'] = df['v1'].str.strip()
df['v2'] = df['v2'].str.strip()


In [31]:
df.head()

import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

def remove_non_english_words(text):
    return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words)

df['v2'] = df['v2'].apply(remove_non_english_words)


[nltk_data] Downloading package words to
[nltk_data]     /Users/winnaingkyaw/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [26]:
## Calculate word length and add column length

df['length'] = df['v2'].apply(len)
df.head()

Unnamed: 0,v1,v2,length
0,ham,Go until point crazy Available only in n great...,83
1,ham,lar u,5
2,spam,Free entry in a to win FA Cup final May Text F...,95
3,ham,U dun say so early U c already then say,39
4,ham,I don t think he goes to he around here though,46


In [27]:
## label encode the target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['v1'] = le.fit_transform(df['v1'])
df.head()

Unnamed: 0,v1,v2,length
0,0,Go until point crazy Available only in n great...,83
1,0,lar u,5
2,1,Free entry in a to win FA Cup final May Text F...,95
3,0,U dun say so early U c already then say,39
4,0,I don t think he goes to he around here though,46


In [36]:
# bag of words for v2
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['v2'])
print(vectorizer.get_feature_names_out())

df['matrix'] = list(X.toarray())
df['words'] = [list(filter(lambda x: x[1] > 0, zip(vectorizer.get_feature_names_out(), row))) for row in X.toarray()]
df['word_counts'] = X.toarray().sum(axis=1)
df.head()

['aa' 'abbey' 'abdomen' ... 'zebra' 'zed' 'zoom']


Unnamed: 0,v1,v2,length,matrix,words,word_counts
0,0,Go until point crazy Available only in n great...,83,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(available, 1), (buffet, 1), (cine, 1), (craz...",10
1,0,lar u,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(lar, 1)]",1
2,1,Free entry in a to win FA Cup final May Text F...,95,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(apply, 1), (cup, 1), (entry, 2), (fa, 2), (f...",13
3,0,U dun say so early U c already then say,39,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(dun, 1), (early, 1), (say, 2)]",4
4,0,I don t think he goes to he around here though,46,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(don, 1), (goes, 1), (think, 1)]",3


In [None]:
## Top 5 and Bottom 5 words
word_counts = X.toarray().sum(axis=0)
words_df = pd.DataFrame({'words': vectorizer.get_feature_names_out(), 'count': word_counts})
words_df = words_df.sort_values('count', ascending=False)

top_5_words = words_df.head(5)
bottom_5_words = words_df.tail(5)

print(top_5_words)
print(bottom_5_words)

     words  count
3314    ur    385
1634  just    371
1227  free    284
1666  know    261
1737  like    245
     words  count
1600  jade      1
1598  jack      1
1597   iyo      1
1596  iter      1
3520  zoom      1


In [38]:
## Top 5 and Bottom 5 rows with the most words

df = df.sort_values('word_counts', ascending=False)
top_5_rows = df.head(5)
bottom_5_rows = df.tail(5)

print(top_5_rows)
print(bottom_5_rows)

      v1                                                 v2  length  \
1084   0  For me the love should start with attraction i...     841   
2847   0  Sad story of a Man Last week was my b day My W...     420   
2157   0  Sad story of a Man Last week was my b day My W...     420   
1862   0  The last thing i ever to do was hurt you And i...     672   
3015   0  is fast approaching So Wish u a very Happy New...     309   

                                                 matrix  \
1084  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2847  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2157  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1862  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3015  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                                  words  word_counts  
1084  [(attraction, 1), (beautiful, 2), (breath, 1),...           48  
2847  [(apartment, 1), (bedroom, 1), (boss, 1), (cab...           41  
2157 

In [51]:
### Final df
final_df = df[['v1', 'v2', 'length', 'word_counts', 'words', 'matrix']]


## normalize the length and word_counts columns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
final_df[['length', 'word_counts']] = scaler.fit_transform(final_df[['length', 'word_counts']])

final_df.head() 

Unnamed: 0,v1,v2,length,word_counts,words,matrix
1084,0,For me the love should start with attraction i...,1.0,1.0,"('attraction', 1), ('beautiful', 2), ('breath'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2847,0,Sad story of a Man Last week was my b day My W...,0.499405,0.854167,"('apartment', 1), ('bedroom', 1), ('boss', 1),...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2157,0,Sad story of a Man Last week was my b day My W...,0.499405,0.854167,"('apartment', 1), ('bedroom', 1), ('boss', 1),...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1862,0,The last thing i ever to do was hurt you And i...,0.799049,0.854167,"('bad', 1), ('bed', 3), ('choose', 1), ('clean...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3015,0,is fast approaching So Wish u a very Happy New...,0.36742,0.8125,"('afternoons', 1), ('approaching', 1), ('birth...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [54]:
## Split the data into train and test
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

X = final_df[['length', 'word_counts']]
y = final_df['v1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train a model
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=76)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8663677130044843

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       950
           1       0.57      0.41      0.48       165

    accuracy                           0.87      1115
   macro avg       0.73      0.68      0.70      1115
weighted avg       0.85      0.87      0.86      1115

