In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [2]:
df1 = pd.read_csv(r"C:\TY IT\Sem - 2\ML\Projects\News Article Sorting\entertainment_data.csv")
df2 = pd.read_csv(r"C:\TY IT\Sem - 2\ML\Projects\News Article Sorting\business_data.csv")
df3 = pd.read_csv(r"C:\TY IT\Sem - 2\ML\Projects\News Article Sorting\education_data.csv")
df4 = pd.read_csv(r"C:\TY IT\Sem - 2\ML\Projects\News Article Sorting\sports_data.csv")
df5 = pd.read_csv(r"C:\TY IT\Sem - 2\ML\Projects\News Article Sorting\technology_data.csv")

In [3]:
# Check if 'category' column exists in each dataframe
print(df1.columns)
print(df2.columns)
print(df3.columns)
print(df4.columns)
print(df5.columns)

Index(['headlines', 'description', 'content', 'url', 'category'], dtype='object')
Index(['headlines', 'description', 'content', 'url', 'category'], dtype='object')
Index(['headlines', 'description', 'content', 'url', 'category'], dtype='object')
Index(['headlines', 'description', 'content', 'url', 'category'], dtype='object')
Index(['headlines', 'description', 'content', 'url', 'category'], dtype='object')


In [10]:
df.head()

Unnamed: 0,headlines,description,content,category
0,CUET UG 2023: 73.33% out of 1.49 lakh appear o...,"During the first shift 20,690 students were sc...",CUET UG 2023: University Grants Commission Chi...,education
1,NEET UG 2024: Why is Physics considered a nigh...,Students can overcome their fear of physics an...,— Saurabh Kumar\nPhysics has been known as a n...,education
2,Selena Gomez confirms relationship with produc...,Selena Gomez confirmed that she has been datin...,Here’s a piece of happy news for Selena Gomez’...,entertainment
3,Antonio Conte set to return after Spurs’ FA Cu...,Conte is recovering after undergoing a gall bl...,Tottenham Hotspur boss Antonio Conte is on the...,sports
4,"IIT Madras awards over 2,700 degrees at 60th c...","A total of 2,573 students graduated from the i...","The Indian Institute of Technology, Madras tod...",education


In [4]:
# Ensure all DataFrames have the 'category' column
for df in [df1, df2, df3, df4, df5]:
    if 'category' not in df.columns:
        raise KeyError("One of the DataFrames does not have the 'category' column")

In [5]:
# joining all the dataframes
dfs = [df1, df2, df3, df4, df5]
df = pd.concat(dfs)

In [6]:
# Shuffling the dataframe
from sklearn.utils import shuffle
df = shuffle(df)

In [7]:
# resetting the index
df.reset_index(inplace=True)

In [8]:
# removing the useless columns like index and url
df.drop(columns=["index", "url"], inplace=True, errors='ignore')

In [9]:
# mapping categories to numerical values
def map_category(category):
    category_map = {
        'sports': 0,
        'business': 1,
        'entertainment': 2,
        'education': 3,
        'technology': 4
    }
    return category_map.get(category, -1)

In [29]:
df['category'] = df['Category'].apply(map_category)
df.drop(columns=['Category'],inplace=True)

KeyError: 'Category'

In [9]:
# joining the columns to create one and then dropping the columns
df['Content'] = df['headlines'] + ' ' + df['description'] + ' ' + df['content']
df.drop(['headlines', 'description', 'content'], axis=1, inplace=True)

In [10]:
# Preprocessing the text data
ps = PorterStemmer()

In [11]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
df['Content'] = df['Content'].apply(stemming)

In [34]:
# Splitting the data
X = df['Content']
Y = df['Category']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [35]:
# Vectorization
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)
X_test = vc.transform(X_test)

In [36]:
# Function to print evaluation metrics
def evaluate_model(model, X_test, Y_test):
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred, average='weighted')
    report = classification_report(Y_test, Y_pred)
    return accuracy, f1, report

In [37]:
# SVM Model
model_svm = SVC()
model_svm.fit(X_train, Y_train)
accuracy_svm, f1_svm, report_svm = evaluate_model(model_svm, X_test, Y_test)

NameError: name 'SVC' is not defined

In [None]:
print(f"Model: SVM")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"F1-Score: {f1_svm:.4f}")
print(f"Classification Report:\n{report_svm}")
print("="*50)

In [None]:
# Multinomial Naive Bayes Model
model_nb = MultinomialNB()
model_nb.fit(X_train, Y_train)
accuracy_nb, f1_nb, report_nb = evaluate_model(model_nb, X_test, Y_test)

In [None]:
print(f"Model: Multinomial Naive Bayes")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"F1-Score: {f1_nb:.4f}")
print(f"Classification Report:\n{report_nb}")
print("="*50)

In [None]:
# Random Forest Model
model_rf = RandomForestClassifier()
model_rf.fit(X_train, Y_train)
accuracy_rf, f1_rf, report_rf = evaluate_model(model_rf, X_test, Y_test)

In [None]:
print(f"Model: Random Forest")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"Classification Report:\n{report_rf}")
print("="*50)

In [None]:
# Function to map numerical values back to categories
def val_to_category(val):
    category_map = {
        0: 'sports',
        1: 'business',
        2: 'entertainment',
        3: 'education',
        4: 'technology'
    }
    return category_map.get(val, -1)

In [None]:
# Function to make predictions
def make_predictions(headlines, description, content, model):
    text = headlines + " " + description + " " + content
    text = stemming(text)
    text = vc.transform([text])
    val = model.predict(text)
    val = val_to_category(int(val[0]))
    print("News category is:", val)

In [None]:
# Example predictions with SVM model
print("Predictions with SVM model:")
make_predictions("kohli got his 50th century", "kohli has scored his 50th century today at stadium", "the run machine kholi hit another milestone my scoring his 50th odi centry today at stadium")
make_predictions("ambani earns 10M in an hour", "Mukesh Ambani earned 10M rupees in a single hour", "Mukesh Ambani, the chairman of Reliance Industries Limited, has earned around 10M rupees every single hour. This shows how much potential there is in the Indian market.")
make_predictions("Change in NCERT syllabus", "", "")  # as you can see model can even predict with title only

In [None]:
# Example predictions with Multinomial Naive Bayes model
print("Predictions with Multinomial Naive Bayes model:")
make_predictions("kohli got his 50th century", "kohli has scored his 50th century today at stadium", "the run machine kholi hit another milestone my scoring his 50th odi centry today at stadium")
make_predictions("ambani earns 10M in an hour", "Mukesh Ambani earned 10M rupees in a single hour", "Mukesh Ambani, the chairman of Reliance Industries Limited, has earned around 10M rupees every single hour. This shows how much potential there is in the Indian market.")
make_predictions("Change in NCERT syllabus", "", "")  # as you can see model can even predict with title only

In [None]:
# Example predictions with Random Forest model
print("Predictions with Random Forest model:")
make_predictions("kohli got his 50th century", "kohli has scored his 50th century today at stadium", "the run machine kholi hit another milestone my scoring his 50th odi centry today at stadium")
make_predictions("ambani earns 10M in an hour", "Mukesh Ambani earned 10M rupees in a single hour", "Mukesh Ambani, the chairman of Reliance Industries Limited, has earned around 10M rupees every single hour. This shows how much potential there is in the Indian market.")
make_predictions("Change in NCERT syllabus", "", "")  # as you can see model can even predict with title only