In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download(["stopwords","wordnet"])
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer as wn
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,cross_val_score

# Data Preprocessing

In [None]:
# Read the SMS corpus as a tab-separated file, naming its two columns
# "label" and "text".
SMS_PATH = "/kaggle/input/sms-spam-collection/SpamCollectionSMS.txt"
data = pd.read_csv(SMS_PATH, sep="\t", names=["label", "text"])

In [None]:
data.head()

In [None]:
data.label.value_counts()

In [None]:
data.isnull().any()

### Cleaning text column

In [None]:
def remove_punc(text):
    """Return `text` with every character found in `string.punctuation` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)
data["clean_text"]=data.text.apply(remove_punc)

In [None]:
data.head()

In [None]:
def tokenize(text):
    """Lower-case `text` and return its word tokens as a list.

    A token is a maximal run of word characters (`\\w+`). This is the same
    as splitting on runs of non-word characters, except that the empty
    strings a split would produce for leading/trailing separators (or for
    an empty input) are not returned.

    The pattern is a raw string: in a normal string literal `"\\W"` is an
    invalid escape sequence and triggers a warning on recent Python versions.
    """
    return re.findall(r"\w+", text.lower())
data.clean_text=data.clean_text.apply(tokenize)

In [None]:
data.head()

In [None]:
# Build the stop-word lookup once. Calling stopwords.words("english") inside
# the comprehension rebuilt the whole list for every single token, and list
# membership is a linear scan; a frozenset gives constant-time lookups.
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def remove_stopwords(text):
    """Return the tokens of `text` that are not English stop words.

    `text` is an iterable of token strings (here: the list produced by
    `tokenize`). Order and duplicates of the remaining tokens are preserved.
    """
    return [word for word in text if word not in ENGLISH_STOPWORDS]
data.clean_text=data.clean_text.apply(remove_stopwords)

In [None]:
data.head()

In [None]:
# for kaggle unziping wordnet file
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
def lematizing(text):
    """Lemmatize every token in `text` and join the results with single spaces.

    `text` is an iterable of token strings; the return value is one string,
    which is the form TfidfVectorizer expects as a document. Each token is
    lemmatized with the lemmatizer's default part-of-speech setting.
    """
    # One lemmatizer per call instead of one per token: the instance does not
    # depend on the word being processed.
    lemmatizer = wn()
    return " ".join(lemmatizer.lemmatize(word) for word in text)
data.clean_text=data.clean_text.apply(lematizing)

In [None]:
data.head()

### Vectorizing text column

In [None]:

# TF-IDF over unigrams and bigrams of the cleaned text.
# NOTE(review): the vectorizer is fitted on the full dataset, before any
# train/test split or cross-validation, so the vocabulary and IDF weights are
# learned from rows that are later used for evaluation.
cv1=TfidfVectorizer(ngram_range=(1,2))
x_count=cv1.fit_transform(data.clean_text)
# (number of messages, number of n-gram features)
print(x_count.shape)
cv1.get_feature_names_out()

In [None]:
# Dense DataFrame view of the TF-IDF matrix, one column per n-gram feature.
# toarray() materializes every entry of the sparse matrix in memory.
df = pd.DataFrame(x_count.toarray(), columns=cv1.get_feature_names_out())
df.head()

# Feature Engineering

In [None]:
# Message length measured on the raw text, not counting space characters.
data["length"]=data.text.apply(lambda x:len(x)-x.count(" "))
data.head()

In [None]:
def punctuations(data):
    """Return the percentage of punctuation characters in one message.

    Parameters
    ----------
    data : row-like object exposing `.text` (the raw message) and `.length`
        (its character count). Despite the name, this is a single row as
        passed by `DataFrame.apply(..., axis=1)`, not the whole DataFrame;
        the parameter name is kept for compatibility.

    Returns
    -------
    float
        `100 * (punctuation characters in text) / length`, or 0.0 when
        `length` is 0 so that whitespace-only messages do not divide by zero.
    """
    length = data.length
    if length == 0:
        return 0.0
    count = sum(1 for char in data.text if char in string.punctuation)
    return count / length * 100
data["punc_percent"]=data.apply(punctuations,axis=1)

In [None]:
data.head()

# EDA

In [None]:
# Overlaid, normalized histograms of message length for each class.
bins = np.linspace(0, 200, 50)
for label in ("spam", "ham"):
    plt.hist(data.loc[data.label == label, "length"], bins, alpha=.5, density=True, label=label)
plt.title("Message length by class")
plt.xlabel("Length (non-space characters)")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Overlaid, normalized histograms of punctuation percentage for each class.
bins = np.linspace(0, 20, 40)
for label in ("spam", "ham"):
    plt.hist(data.loc[data.label == label, "punc_percent"], bins, alpha=.5, density=True, label=label)
plt.title("Punctuation percentage by class")
plt.xlabel("Punctuation (% of non-space characters)")
plt.ylabel("Density")
plt.legend()
plt.show()

### Box-Cox transformation

In [None]:
# Compare candidate power transforms of punc_percent: for each i the column
# is raised to 1/i (i = 1/2 squares it, i = 1 leaves it unchanged).
# Each figure is titled with its exponent so the plots can be told apart.
for i in [1/2, 1, 2, 3, 4, 5, 6]:
    exponent = 1 / i
    plt.hist(data["punc_percent"] ** exponent, bins=40)
    plt.title(f"punc_percent ** {exponent:.2f}")
    plt.xlabel("Transformed punc_percent")
    plt.ylabel("Count")
    plt.show()

In [None]:
# Cube-root transform of the punctuation percentage.
# The value is recomputed from the raw text each time instead of transforming
# the column in place: the in-place form applied the cube root again on every
# re-run of this cell, silently compounding the transform.
data["punc_percent"] = data.apply(punctuations, axis=1) ** (1/3)

# Model training

In [None]:
# Target: the spam/ham label column.
y=data.label
# Features: the two hand-made columns followed by every TF-IDF column.
# concat(axis=1) aligns on the row index of `data` and `df`.
# NOTE(review): if the TF-IDF vocabulary contains the token "length" or
# "punc_percent", x will have duplicate column names — verify.
x=pd.concat([data[["length","punc_percent"]],df],axis=1)
x

In [None]:
# Random forest with a fixed seed, evaluated by 5-fold cross-validation.
rf=RandomForestClassifier(n_jobs=-1,random_state=1)
# KFold is used without shuffle, so folds are contiguous slices in file order.
k_fold=KFold(n_splits=5)
# Displays one accuracy score per fold.
cross_val_score(rf,x,y,cv=k_fold,scoring="accuracy",n_jobs=-1)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [None]:
# Hold-out split with a fixed seed (matching the forest's random_state) so the
# reported precision/recall/accuracy are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
# fit() returns the estimator itself, so rf_model and rf are the same object.
rf_model = rf.fit(x_train, y_train)

In [None]:
# Pair each importance with its column name and show the 20 largest.
ranked_features = sorted(zip(rf_model.feature_importances_, x.columns), reverse=True)
ranked_features[:20]

In [None]:
# Predict on the held-out rows and score the "spam" class only.
y_pred=rf_model.predict(x_test)
# labels=["spam"] restricts the per-class metrics to spam, so each of the four
# returned values is a length-1 array rather than a scalar.
precision,recall,fscore,support=score(y_test,y_pred,labels=["spam"])

In [None]:
# Each metric is a length-1 array (one entry for the "spam" label). Index it
# explicitly: calling float() directly on an array with ndim > 0 is deprecated
# in NumPy >= 1.25.
float(precision[0]), float(recall[0]), float(fscore[0]), float(support[0])

In [None]:
# Fraction of held-out messages whose predicted label equals the true label.
n_correct = (y_pred == y_test).sum()
print("Accuracy: {}".format(n_correct / len(y_pred)))