In [None]:
# Load Data
# Read the raw CSV (latin-1 encoded), keep only the two columns used by this
# notebook, and give them descriptive names.
import pandas as pd

df = (
    pd.read_csv('../data/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Check and clean missing values
# Count nulls per column *before* dropping anything, so the report describes
# the data as loaded rather than the already-cleaned frame.
missing_counts = df.isnull().sum()

# Drop every row that contains a missing value. Rebinding (instead of
# inplace=True) keeps the step explicit and avoids in-place mutation.
df = df.dropna()

# A notebook cell only renders its final expression, so the counts go last;
# otherwise the check above would be computed and silently discarded.
missing_counts


In [None]:
# Feature Engineering
# Numeric encoding of the label column: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_codes)

# Length of each message, as reported by len().
df['message_length'] = df['message'].map(len)


In [None]:
# Summary Stats
# Only the last expression of a cell is rendered automatically, so the
# overall summary must be shown explicitly or it is computed and thrown away.
# display() is provided by the IPython/Jupyter kernel without an import.
display(df.describe())

# Same summary statistics, broken down per label (rendered as the cell output).
df.groupby('label').describe()


In [None]:
# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of messages per label.
label_ax = sns.countplot(x='label', data=df)
label_ax.set_title("Label Distribution")
plt.show()


In [None]:
# Message Length Boxplot
# Compare the distribution of message_length between the labels.
length_ax = sns.boxplot(x='label', y='message_length', data=df)
length_ax.set_title("Message Length by Label")
plt.show()


In [None]:
# Word Clouds
from wordcloud import WordCloud

# Concatenate all messages of each label into a single text blob.
# .loc with a boolean mask selects rows and column in one step (no chained
# indexing).
spam_words = ' '.join(df.loc[df['label'] == 'spam', 'message'])
ham_words = ' '.join(df.loc[df['label'] == 'ham', 'message'])

# Render each cloud inline with matplotlib. PIL's Image.show() opens an
# external image viewer instead of drawing in the notebook, and shows nothing
# on a headless/remote kernel. `plt` is the matplotlib.pyplot module imported
# in the earlier visualization cell.
for cloud_title, cloud_text in (
    ("Spam Word Cloud", spam_words),
    ("Ham Word Cloud", ham_words),
):
    cloud = WordCloud(background_color='white').generate(cloud_text)
    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # pixel axes carry no meaning for a word cloud
    ax.set_title(cloud_title)
    plt.show()


In [None]:
# Save cleaned data
from pathlib import Path

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first (a fresh checkout may not have ../outputs).
output_path = Path('../outputs/cleaned_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame without the index column.
df.to_csv(output_path, index=False)
