# Category classification — focused walkthrough
This notebook demonstrates building a compact, reproducible pipeline to classify article categories using TF–IDF features and a linear classifier. The goal is to show a clear, machine-readable workflow: load the provided CSVs, perform minimal preprocessing (lowercasing, punctuation removal, and stopword filtering), extract TF–IDF features, train a lightweight classifier, and report accuracy and a confusion matrix.

Design notes:
- Use a fixed random seed for reproducibility.
- Keep preprocessing simple so the notebook runs quickly on a laptop.
- Provide both a numeric table and a visual (heatmap) for the confusion matrix.

File used: `data/bbc-text.csv` (expected to be present in the repo), with a `category` label column and a `text` column.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# Read the labelled article corpus from the repo's data directory and
# preview the first rows.
DATA_PATH = 'data/bbc-text.csv'
news = pd.read_csv(DATA_PATH)
news.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
# (rows, columns) of the freshly loaded frame.
news.shape

(2635, 2)

In [5]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that re-running the notebook yields
# the same row order (and therefore the same downstream split and metrics),
# then rebuild a clean 0..n-1 index.
news = shuffle(news, random_state=42).reset_index(drop=True)

In [6]:
# Preview the first rows after shuffling and resetting the index.
news.head()

Unnamed: 0,category,text
0,sport,mcleish ready for criticism rangers manager al...
1,politics,blair hails turkey-eu talks deal tony blair ha...
2,crime,The Mental Illness of Murder
3,politics,brown s poll campaign move denied the governme...
4,crime,"Cassandra Elfers, Middle School Volleyball Coa..."


In [7]:
# Drop rows with missing values BEFORE casting to str: astype(str) turns NaN
# into the literal string 'nan', after which dropna() can no longer detect it.
# The result is assigned back (dropna() is not in-place), and the index is
# rebuilt so it stays contiguous after rows are removed.
news = news.dropna().reset_index(drop=True).astype(str)
news

Unnamed: 0,category,text
0,sport,mcleish ready for criticism rangers manager al...
1,politics,blair hails turkey-eu talks deal tony blair ha...
2,crime,The Mental Illness of Murder
3,politics,brown s poll campaign move denied the governme...
4,crime,"Cassandra Elfers, Middle School Volleyball Coa..."
...,...,...
2630,business,bad weather hits nestle sales a combination of...
2631,politics,best person for top legal job the best perso...
2632,entertainment,rock band u2 break ticket record u2 have smash...
2633,sport,dogged federer claims dubai crown world number...


In [8]:
# Case-fold every article so that tokens differing only in case are counted
# as the same term.
news['text'] = news['text'].map(str.lower)
news.head()

Unnamed: 0,category,text
0,sport,mcleish ready for criticism rangers manager al...
1,politics,blair hails turkey-eu talks deal tony blair ha...
2,crime,the mental illness of murder
3,politics,brown s poll campaign move denied the governme...
4,crime,"cassandra elfers, middle school volleyball coa..."


In [9]:
import string

# Translation table that deletes every character in string.punctuation;
# built once so it is not recomputed for each article.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_removal(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Characters listed in ``string.punctuation`` are deleted outright (not
    replaced by a space), so ``"turkey-eu"`` becomes ``"turkeyeu"``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with punctuation characters stripped.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every article, element by element.
news['text'] = news['text'].map(punctuation_removal)

In [10]:
# Preview the first rows after punctuation removal.
news.head()

Unnamed: 0,category,text
0,sport,mcleish ready for criticism rangers manager al...
1,politics,blair hails turkeyeu talks deal tony blair has...
2,crime,the mental illness of murder
3,politics,brown s poll campaign move denied the governme...
4,crime,cassandra elfers middle school volleyball coac...


In [11]:
# Removing stopwords
import nltk
from nltk.corpus import stopwords

try:
    stop = stopwords.words('english')
except LookupError:
    # The stopword corpus is not installed on this machine (e.g. a fresh
    # environment): fetch it once, then retry.
    nltk.download('stopwords', quiet=True)
    stop = stopwords.words('english')

# Membership tests against a list are linear per token; a frozenset makes
# each lookup constant-time without changing which words are removed.
stop_set = frozenset(stop)

# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# have lost their apostrophes by now and may not match apostrophe forms in
# the stopword list -- confirm whether that matters for this analysis.
news['text'] = news['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

In [12]:
# Preview the first rows after stopword removal.
news.head()

Unnamed: 0,category,text
0,sport,mcleish ready criticism rangers manager alex m...
1,politics,blair hails turkeyeu talks deal tony blair hai...
2,crime,mental illness murder
3,politics,brown poll campaign move denied government den...
4,crime,cassandra elfers middle school volleyball coac...


In [13]:
# Function to plot the confusion matrix (code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html)
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heatmap on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are plotted along the y axis ("True label")
        and columns along the x axis ("Predicted label").
    classes : sequence
        Tick labels, used for both axes in the given order.
    normalize : bool, default False
        If True, each row is divided by its row sum before plotting, so the
        heatmap and the cell annotations both show per-row fractions.
    title : str, default 'Confusion matrix'
        Figure title.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the heatmap.

    Returns
    -------
    None
        The plot is drawn with pyplot; a one-line status message is printed.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the colours and the annotations agree.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; divide by 1 there
        # so the row stays all-zero instead of becoming NaN.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shown with two decimals; raw counts are shown as-is.
        label = format(cm[i, j], '.2f') if normalize else str(cm[i, j])
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [14]:
# Hold out 20% of the articles for evaluation; the fixed random_state keeps
# the split reproducible for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    news['text'],
    news['category'],
    test_size=0.2,
    random_state=42,
)

In [15]:
from sklearn.linear_model import LogisticRegression
# Explicit import instead of `from newspaper import*`: only `Article` is
# referenced (in the optional example below), and a star import would inject
# unrelated names into the notebook namespace.
from newspaper import Article

# Raw text -> token counts -> TF-IDF weights -> logistic regression.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

# Fitting the model
model = pipe.fit(X_train, y_train)

# Optional example (kept commented out): classify the summary of a live
# article. The pipeline vectorises raw text itself, so the summary is passed
# to predict() directly -- no separate vectoriser call is needed.
# NOTE(review): the training text was lowercased and stripped of punctuation
# and stopwords; presumably the summary should get the same preprocessing.
# url="https://timesofindia.indiatimes.com/city/indore/indore-last-2-in-munawar-case-get-bail-after-8-weeks-in-jail/articleshow/81238874.cms"
# article = Article(str(url))
# article.download()
# article.parse()
# article.nlp()
# p=model.predict([article.summary])
# print(p)

# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

# Confusion matrix: numeric table first, then the heatmap. Passing
# labels=model.classes_ fixes the row/column order so it matches the tick
# labels used in the plot.
cm = metrics.confusion_matrix(y_test, prediction, labels=model.classes_)
print(pd.DataFrame(cm, index=model.classes_, columns=model.classes_))
plot_confusion_matrix(cm, classes=model.classes_)

accuracy: 98.1%


In [16]:
import pickle

# Persist the fitted pipeline. The original wrote to the literal filename
# 'final1' instead of the path held in the `final1` variable, and never
# closed the file handle; the `with` block closes it even if dump() fails.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
final1 = 'model.sav'
with open(final1, 'wb') as model_file:
    pickle.dump(model, model_file)