1. Load the BBC dataset.

In [211]:
# Your code here

import pandas as pd

# Read the BBC news CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='bbc-text.csv')

# Preview the top four rows to confirm the file loaded as expected
df.head(n=4)



Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...


2. Pre-process the data if necessary.

In [212]:
#Your code here

import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import nltk

# Fetch the NLTK resources used below: the stop-word list and WordNet (for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')

# Drop rows that have no text or no category.
# The result is assigned rather than using inplace=True so the step is explicit, and
# .copy() gives an independent frame so the column assignments below cannot raise
# SettingWithCopyWarning.
df = df.dropna(subset=['text', 'category']).copy()

# Lowercase the text, then delete every ASCII punctuation character.
# The translation table is built once and reused for the whole column.
punctuation_table = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].str.lower().str.translate(punctuation_table)

# Remove English stop words (whitespace-split tokens that appear in NLTK's list).
# NOTE(review): punctuation is stripped before this step, so apostrophe forms in the
# stop-word list (e.g. "don't") presumably no longer match tokens such as "dont" --
# confirm whether that matters for this corpus.
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda doc: ' '.join(word for word in doc.split() if word not in stop_words)
)

# Tokenize on whitespace and lemmatize every token; stored as a list per article
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['text'].apply(
    lambda doc: [lemmatizer.lemmatize(word) for word in doc.split()]
)

# Encode the category names as integer labels
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Show the first few preprocessed rows; a bare last expression uses the notebook's
# rich table display instead of the wrapped plain-text output of print()
df.head(4)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarinous/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarinous/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   category                                               text  \
0      tech  tv future hands viewers home theatre systems p...   
1  business  worldcom boss left books alone former worldcom...   
2     sport  tigers wary farrell gamble leicester say rushe...   
3     sport  yeading face newcastle fa cup premiership side...   

                                              tokens  category_encoded  
0  [tv, future, hand, viewer, home, theatre, syst...                 4  
1  [worldcom, bos, left, book, alone, former, wor...                 0  
2  [tiger, wary, farrell, gamble, leicester, say,...                 3  
3  [yeading, face, newcastle, fa, cup, premiershi...                 3  


3. Split the data into training and test.

In [213]:
# Your code here

from sklearn.model_selection import train_test_split

# The labels were already encoded in step 2 (df['category_encoded'] and label_encoder),
# so they are reused here instead of fitting a second, identical LabelEncoder.

# Define features (X) and target (y)
X = df['text']  # cleaned text: lowercased, punctuation and stop words removed
y = df['category_encoded']  # integer-encoded categories

# Split the data into training (80%) and test (20%) sets; the fixed seed keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: every row ended up in exactly one of the two splits
assert len(X_train) + len(X_test) == len(df)

# Display the size of the splits
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

print()

# View the mapping of categories to encoded values.
# LabelEncoder assigns each class its position in classes_, so enumerate() yields the
# same codes as transform(classes_) while printing plain Python ints.
print("Category Encoding Mapping:")
print({name: code for code, name in enumerate(label_encoder.classes_)})


Training set size: 1780
Test set size: 445

Category Encoding Mapping:
{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


4. Create the first topic classifier. Include a vectorizer if necessary.

In [214]:
#Your code here

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Create the TF-IDF vectorizer, keeping at most 5000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Build the feature matrices used by the training and test reports below.
# The vocabulary and IDF weights are learned from the training split only, and the
# test split is transformed with that fitted vectorizer, so no test information
# leaks into training.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create the Logistic Regression model (max_iter raised to 1000 iterations)
model = LogisticRegression(max_iter=1000)


5. Report on training metrics of the first classifier. Use the classification_report function.

In [215]:
# Your code here

from sklearn.metrics import classification_report

# Train the Logistic Regression classifier created in step 4 rather than building a
# second, identical one. fit() returns the estimator itself, so `clf` and `model`
# refer to the same trained classifier.
clf = model.fit(X_train_tfidf, y_train)

# Predict the categories for the training data
y_train_pred = clf.predict(X_train_tfidf)

# Generate the classification report for the training data, labelled with the
# original category names
train_report = classification_report(y_train, y_train_pred, target_names=label_encoder.classes_)

# Print the title and the report separately: passing both to one print() inserts a
# separator space after the newline, which shifts the report's header row by one column
print("Training Classification Report:")
print(train_report)


Training Classification Report:
                precision    recall  f1-score   support

     business       1.00      1.00      1.00       409
entertainment       1.00      1.00      1.00       305
     politics       0.99      0.99      0.99       334
        sport       1.00      1.00      1.00       413
         tech       0.99      1.00      0.99       319

     accuracy                           1.00      1780
    macro avg       1.00      1.00      1.00      1780
 weighted avg       1.00      1.00      1.00      1780



6. Report on test metrics of the first classifier. Use the classification_report function.

In [216]:
#Your code here

from sklearn.metrics import classification_report

# Predict the categories for the held-out test data with the trained classifier
y_test_pred = clf.predict(X_test_tfidf)

# Generate the classification report for the test data, labelled with the original
# category names
test_report = classification_report(y_test, y_test_pred, target_names=label_encoder.classes_)

# Print the title and the report separately: passing both to one print() inserts a
# separator space after the newline, which shifts the report's header row by one column
print("Test Classification Report:")
print(test_report)


Test Classification Report:
                precision    recall  f1-score   support

     business       0.93      0.94      0.94       101
entertainment       1.00      0.94      0.97        81
     politics       0.94      0.98      0.96        83
        sport       0.98      1.00      0.99        98
         tech       0.98      0.96      0.97        82

     accuracy                           0.96       445
    macro avg       0.97      0.96      0.96       445
 weighted avg       0.96      0.96      0.96       445



7. Create the second topic classifier. Include a vectorizer if necessary.

In [217]:
#Your code here 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# TF-IDF features for the second model, using scikit-learn's built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words='english')

# Chain the vectorizer and a linear-kernel SVM into one estimator, so fit/predict
# can be called directly on the text
svm_classifier = make_pipeline(
    vectorizer,
    SVC(kernel='linear'),
)


8. Report on the training metrics of the second classifier.

In [218]:
#Your code here

from sklearn.metrics import classification_report

# Fit the pipeline (TF-IDF vectorizer + linear SVM) on the training text
svm_classifier.fit(X_train, y_train)

# Predict the categories for the training data
y_train_pred_svm = svm_classifier.predict(X_train)

# Generate the classification report for the training data, labelled with the
# original category names
train_report_svm = classification_report(y_train, y_train_pred_svm, target_names=label_encoder.classes_)

# Print the title and the report separately: passing both to one print() inserts a
# separator space after the newline, which shifts the report's header row by one column
print("Training Classification Report for Second Classifier (SVM):")
print(train_report_svm)



Training Classification Report for Second Classifier (SVM):
                precision    recall  f1-score   support

     business       1.00      1.00      1.00       409
entertainment       1.00      1.00      1.00       305
     politics       1.00      1.00      1.00       334
        sport       1.00      1.00      1.00       413
         tech       1.00      1.00      1.00       319

     accuracy                           1.00      1780
    macro avg       1.00      1.00      1.00      1780
 weighted avg       1.00      1.00      1.00      1780



9. Report on the test metrics of the second classifier.

In [219]:
#Your code here

from sklearn.metrics import classification_report

# Predict the categories for the held-out test data with the fitted pipeline
y_test_pred_svm = svm_classifier.predict(X_test)

# Generate the classification report for the test data, labelled with the original
# category names
test_report_svm = classification_report(y_test, y_test_pred_svm, target_names=label_encoder.classes_)

# Print the title and the report separately: passing both to one print() inserts a
# separator space after the newline, which shifts the report's header row by one column
print("Test Classification Report for Second Classifier (SVM):")
print(test_report_svm)


Test Classification Report for Second Classifier (SVM):
                precision    recall  f1-score   support

     business       0.98      0.93      0.95       101
entertainment       1.00      0.98      0.99        81
     politics       0.93      0.99      0.96        83
        sport       0.98      1.00      0.99        98
         tech       0.99      0.99      0.99        82

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



10. How do the two classifiers compare in metrics? Do they overfit?


In [220]:
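# Both classifiers (Logistic Regression and SVM) show mild overfitting:
# training performance is perfect or near-perfect (accuracy rounds to 1.00 for both; the SVM scores 1.00 on every class,
# while Logistic Regression dips to 0.99 on politics and tech), and there is a slight drop in performance on the test set.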

# Classifier 1 (Logistic Regression) achieves 0.96 accuracy on the test data, while Classifier 2 (SVM) performs slightly better with 0.98 accuracy.

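# Despite the drop in accuracy (about 0.04 for Logistic Regression and 0.02 for the SVM), both classifiers maintain strong
# precision, recall, and F1-scores on the test set, so the train/test gap is small and both models still generalize well to new data.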

#Overall, the SVM model outperforms the first classifier with slightly better test accuracy and overall metrics.