# Basic Text Classification Project

## 1. Import Required Libraries
We will use pandas, numpy, scikit-learn, matplotlib, seaborn, and nltk for data processing, modeling, and visualization.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
import string
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

## 2. Load and Explore Dataset
We will use the SMS Spam Collection Dataset. Let's load the data and explore its structure.

In [None]:
# Download and load the SMS Spam Collection Dataset.
# The file is read as tab-separated with no header row, so the two columns
# are named explicitly: 'label' and 'text'.
dataset_url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
# NOTE(review): read_csv uses its default quote handling here. If any message
# contains an unbalanced double quote, adjacent rows could be merged -- confirm
# the resulting row count against the source file.
df = pd.read_csv(dataset_url, sep='\t', header=None, names=['label', 'text'])

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Report the frame dimensions, then the number of messages per label both as
# text and as a bar chart.
print('Dataset shape:', df.shape)
print('\nClass distribution:')
label_counts = df['label'].value_counts()
print(label_counts)
label_counts.plot(kind='bar', title='Class Distribution')
plt.show()

## 3. Text Cleaning (Lowercase, Remove Punctuation)
We will clean the text by converting it to lowercase, removing punctuation and digits, and trimming leading and trailing whitespace.

In [None]:
# Function to clean text
def clean_text(text):
    """Normalize a message string for later tokenization.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted, every run of digits is deleted, and leading/trailing
    whitespace is trimmed. Interior whitespace is left untouched.
    """
    punctuation_pattern = re.compile(f'[{re.escape(string.punctuation)}]')
    digit_pattern = re.compile(r'\d+')

    lowered = text.lower()
    without_punctuation = punctuation_pattern.sub('', lowered)
    without_digits = digit_pattern.sub('', without_punctuation)
    return without_digits.strip()

# Run the cleaner over every message and store the result in a new column.
df['clean_text'] = df['text'].apply(clean_text)
# Show the original and cleaned text side by side for a quick visual check.
df[['text', 'clean_text']].head()

## 4. Remove Stopwords
We will remove common stopwords from the cleaned text.

In [None]:
# English stopwords from NLTK, held in a set for constant-time membership tests.
# NOTE(review): the text passed to remove_stopwords() below has already had its
# punctuation stripped by clean_text(), so any stopword entry that contains an
# apostrophe can never match a cleaned token -- confirm whether that matters.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` dropped.

    Tokens are obtained by splitting on whitespace; the surviving tokens are
    re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)

# Strip stopwords from the cleaned text and keep the result in its own column.
df['clean_text_nostop'] = df['clean_text'].apply(remove_stopwords)
# Compare the text before and after stopword removal.
df[['clean_text', 'clean_text_nostop']].head()

## 5. Text Vectorization: CountVectorizer
Convert the cleaned text into numerical features using CountVectorizer.

In [None]:
# Learn a vocabulary from the stopword-free text and encode each message as
# token counts.
# NOTE(review): the vocabulary is learned from every row of df, including rows
# that a later train/test split assigns to the test set.
count_vect = CountVectorizer()
X_count = count_vect.fit_transform(df['clean_text_nostop'])
# Rows are messages; columns are vocabulary terms.
print('CountVectorizer shape:', X_count.shape)

## 6. Text Vectorization: TF-IDF
Convert the cleaned text into numerical features using TfidfVectorizer.

In [None]:
# Learn a vocabulary and IDF weights from the stopword-free text and encode
# each message as TF-IDF values.
# NOTE(review): the vocabulary and IDF weights are learned from every row of
# df, including rows that a later train/test split assigns to the test set.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(df['clean_text_nostop'])
# Rows are messages; columns are vocabulary terms.
print('TF-IDF shape:', X_tfidf.shape)

## 7. Train-Test Split
Split the dataset into training and testing sets for model evaluation.

In [None]:
# Target labels, one per message.
y = df['label']

# Split the text itself (not a pre-computed feature matrix) so that the
# vectorizers can be fitted on the training rows only. Fitting them on the
# whole corpus would let test-set vocabulary and IDF statistics leak into the
# training features and inflate the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text_nostop'], y, test_size=0.2, random_state=42, stratify=y
)

# Refit each vectorizer on the training text, then apply the learned
# vocabulary/weights to the test text without refitting.
Xc_train = count_vect.fit_transform(text_train)
Xc_test = count_vect.transform(text_test)
Xt_train = tfidf_vect.fit_transform(text_train)
Xt_test = tfidf_vect.transform(text_test)

# Both feature sets describe the same rows, so they share the same labels.
yc_train = yt_train = y_train
yc_test = yt_test = y_test

print('Train set size:', Xc_train.shape[0])
print('Test set size:', Xc_test.shape[0])

## 8. Model Training and Evaluation: Naive Bayes
Train a Naive Bayes classifier and evaluate using accuracy, confusion matrix, and classification report.

In [None]:
# Fit one multinomial Naive Bayes model per feature representation
# (CountVectorizer first, then TF-IDF) and print accuracy, confusion matrix
# and classification report for each on its test split.
nb_count, nb_tfidf = MultinomialNB(), MultinomialNB()
nb_predictions = []
for heading, classifier, X_fit, y_fit, X_eval, y_eval in (
    ('Naive Bayes (CountVectorizer)', nb_count, Xc_train, yc_train, Xc_test, yc_test),
    ('Naive Bayes (TF-IDF)', nb_tfidf, Xt_train, yt_train, Xt_test, yt_test),
):
    classifier.fit(X_fit, y_fit)
    y_hat = classifier.predict(X_eval)
    nb_predictions.append(y_hat)

    print(heading)
    print('Accuracy:', accuracy_score(y_eval, y_hat))
    print('Confusion Matrix:\n', confusion_matrix(y_eval, y_hat))
    print('Classification Report:\n', classification_report(y_eval, y_hat))

# Keep the per-representation predictions under their usual names.
yc_pred, yt_pred = nb_predictions

## 9. Model Training and Evaluation: Logistic Regression
Train a Logistic Regression classifier and evaluate its performance.

In [None]:
# Fit one Logistic Regression model per feature representation
# (CountVectorizer first, then TF-IDF) and print accuracy, confusion matrix
# and classification report for each on its test split.
lr_count, lr_tfidf = LogisticRegression(max_iter=1000), LogisticRegression(max_iter=1000)
lr_predictions = []
for heading, classifier, X_fit, y_fit, X_eval, y_eval in (
    ('Logistic Regression (CountVectorizer)', lr_count, Xc_train, yc_train, Xc_test, yc_test),
    ('Logistic Regression (TF-IDF)', lr_tfidf, Xt_train, yt_train, Xt_test, yt_test),
):
    classifier.fit(X_fit, y_fit)
    y_hat = classifier.predict(X_eval)
    lr_predictions.append(y_hat)

    print(heading)
    print('Accuracy:', accuracy_score(y_eval, y_hat))
    print('Confusion Matrix:\n', confusion_matrix(y_eval, y_hat))
    print('Classification Report:\n', classification_report(y_eval, y_hat))

# Keep the per-representation predictions under their usual names.
yc_pred_lr, yt_pred_lr = lr_predictions

## 10. Model Training and Evaluation: Support Vector Machine (Optional)
Optionally, train a Support Vector Machine (SVM) classifier and evaluate its performance.

In [None]:
# Fit one linear SVM per feature representation (CountVectorizer first, then
# TF-IDF) and print accuracy, confusion matrix and classification report for
# each on its test split.
svm_count, svm_tfidf = LinearSVC(max_iter=2000), LinearSVC(max_iter=2000)
svm_predictions = []
for heading, classifier, X_fit, y_fit, X_eval, y_eval in (
    ('SVM (CountVectorizer)', svm_count, Xc_train, yc_train, Xc_test, yc_test),
    ('SVM (TF-IDF)', svm_tfidf, Xt_train, yt_train, Xt_test, yt_test),
):
    classifier.fit(X_fit, y_fit)
    y_hat = classifier.predict(X_eval)
    svm_predictions.append(y_hat)

    print(heading)
    print('Accuracy:', accuracy_score(y_eval, y_hat))
    print('Confusion Matrix:\n', confusion_matrix(y_eval, y_hat))
    print('Classification Report:\n', classification_report(y_eval, y_hat))

# Keep the per-representation predictions under their usual names.
yc_pred_svm, yt_pred_svm = svm_predictions

## 11. Compare CountVectorizer vs TF-IDF Results
Compare the test accuracy of the models trained with CountVectorizer and TF-IDF features (the confusion matrices and classification reports are printed in the sections above).

In [None]:
# Build an accuracy table with one row per model family and one column per
# feature representation, then chart it.
fitted_pairs = [
    ('Naive Bayes', nb_count, nb_tfidf),
    ('Logistic Regression', lr_count, lr_tfidf),
    ('SVM', svm_count, svm_tfidf),
]
results = pd.DataFrame({
    'Model': [family for family, _, _ in fitted_pairs],
    'CountVectorizer': [
        accuracy_score(yc_test, count_model.predict(Xc_test))
        for _, count_model, _ in fitted_pairs
    ],
    'TF-IDF': [
        accuracy_score(yt_test, tfidf_model.predict(Xt_test))
        for _, _, tfidf_model in fitted_pairs
    ],
})
# The y-axis is restricted to 0.9-1.0 by the ylim argument below.
results.set_index('Model').plot(kind='bar', ylim=(0.9,1.0), title='Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()
# Display the underlying numbers as the cell output.
results

## 12. Show Top Important Words for Each Class
Display the most important words for each class, ranked by the model's coefficients (linear models) or per-class feature log-probabilities (Naive Bayes).

In [None]:
def show_top_words(model, vectorizer, n=10):
    """Print the ``n`` highest-scoring vocabulary words for each class.

    Scores are taken from ``model.feature_log_prob_`` when the model has it
    (printed as "log prob"), otherwise from ``model.coef_`` (printed as
    "coef"). If the model has neither attribute, only a blank line is printed
    per class.

    Args:
        model: Fitted classifier exposing ``classes_`` and either
            ``feature_log_prob_`` or ``coef_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n: Number of words to print per class, highest score first.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    classes = model.classes_

    if hasattr(model, 'feature_log_prob_'):
        # One row of log-probabilities per class.
        scores, kind = np.asarray(model.feature_log_prob_), 'log prob'
    elif hasattr(model, 'coef_'):
        scores, kind = np.atleast_2d(np.asarray(model.coef_)), 'coef'
        if scores.shape[0] == 1 and len(classes) == 2:
            # Binary linear models store a single weight row that points
            # towards classes[1]. Indexing it once per class would raise
            # IndexError on the second class and mislabel the first, so
            # expand it: the negated weights rank words for classes[0].
            scores = np.vstack([-scores[0], scores[0]])
    else:
        scores, kind = None, None

    for i, class_label in enumerate(classes):
        if scores is not None:
            # Sort descending first and then slice, so n=0 yields no words
            # (a trailing [-n:] slice would return the whole vocabulary).
            top = np.argsort(scores[i])[::-1][:n]
            print(f"Top words for class '{class_label}' ({kind}):", feature_names[top])
        print()

# Print the top words for each fitted model, pairing every model with the
# vectorizer that produced its features.
for heading, fitted_model, fitted_vectorizer in (
    ('Naive Bayes (CountVectorizer):', nb_count, count_vect),
    ('Logistic Regression (CountVectorizer):', lr_count, count_vect),
    ('Naive Bayes (TF-IDF):', nb_tfidf, tfidf_vect),
    ('Logistic Regression (TF-IDF):', lr_tfidf, tfidf_vect),
):
    print(heading)
    show_top_words(fitted_model, fitted_vectorizer)