In [1]:
# @title Imports
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from nltk.tag import pos_tag
from nltk import word_tokenize
import matplotlib.pyplot as plt
# from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
import string
# Download the same six NLTK resources in the same order. The last call stays
# a bare trailing expression so the cell still displays its return value.
for resource_name in (
    'punkt_tab',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'stopwords',
):
    nltk.download(resource_name)
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Mount Google Drive at /content/drive so the dataset stored under
# /content/drive/MyDrive can be read by later cells (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def preprocess_text(text):
    """Normalise one headline into a space-separated string of lemmas.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric or that are NLTK English stopwords are dropped;
    the remaining tokens are lemmatized with ``WordNetLemmatizer`` and joined
    with single spaces.

    Args:
        text: The raw headline. Any value that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell, or ``None``) is
            treated as an empty headline instead of raising.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string or no
        token survives the filtering.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''

    # The stopword set and the lemmatizer do not depend on the input. Build
    # them on the first call and keep them on the function object so that
    # applying this function row by row does not rebuild them for every row.
    cache = preprocess_text.__dict__
    if '_stop_words' not in cache:
        cache['_stop_words'] = frozenset(stopwords.words('english'))
        cache['_lemmatizer'] = WordNetLemmatizer()
    stop_words = cache['_stop_words']
    lemmatizer = cache['_lemmatizer']

    tokens = word_tokenize(text.lower())

    # isalnum() drops punctuation tokens and anything containing a
    # non-alphanumeric character; stopwords are removed before lemmatizing.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [5]:
# Read the raw headlines CSV from the mounted Drive folder and display it.
headlines_csv = '/content/drive/MyDrive/Shruti-Project/india-news-headlines.csv'
df = pd.read_csv(headlines_csv)
df

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic
...,...,...,...
3876552,20230630,city.goa,10 PIs move HC over thwarted seniority
3876553,20230630,city.goa,Govt notifies award in memory of Parrikar for ...
3876554,20230630,city.goa,After youth's death; PWD installs crash barrie...
3876555,20230630,city.goa,Authorities not acting against CRZ violations


In [6]:
# @title Categories Cleaning

# Ordered (label, keyword regexes) rules. Order matters: every rule is matched
# against the column as already rewritten by the rules listed before it, so a
# category claimed by an earlier rule is only re-claimed if its new label
# matches a later rule.
CATEGORY_RULES = [
    ('India', ['india', 'city', 'mumbai', 'goa', 'delhi', 'uttar pradesh']),
    ('entertainment', ['entertainment', 'bollywood', 'hollywood', 'movie', 'music']),
    # 'us' is anchored on word boundaries. As a bare substring it also matches
    # inside 'business', and because this rule runs before the business rule
    # those categories were being relabelled 'world'.
    ('world', ['world', r'\bus\b']),
    ('tech', ['tech', 'technology', 'iit', 'cars', 'auto', 'electronics', 'bikes']),
    ('sports', ['sports', 'cricket', 'tennis', 'hockey', 'commonwealth', 'games', 'gaming', 'olympics']),
    ('politics', ['politics', 'elections', 'election']),
    # 'elections' used to be repeated here; it could never match because the
    # politics rule above has already rewritten every such category.
    ('business', ['business', 'budget', 'tax']),
    ('health', ['food', 'health', 'fitness', 'wellness', 'pregnancy', 'flu', 'virus']),
]


def collapse_categories(categories, rules=CATEGORY_RULES):
    """Collapse fine-grained category strings into a few coarse labels.

    Args:
        categories: pandas Series of category strings (missing values allowed;
            they never match and are left as they are).
        rules: Ordered iterable of ``(label, keywords)`` pairs. ``keywords``
            are regex fragments OR-ed together and searched case-insensitively
            anywhere in the category string.

    Returns:
        A new Series of the same length and index; ``categories`` itself is
        not modified. Values matching no rule are returned unchanged.
    """
    collapsed = categories.copy()
    for label, keywords in rules:
        pattern = '|'.join(keywords)
        hits = collapsed.str.contains(pattern, case=False, na=False)
        collapsed = collapsed.mask(hits, label)
    return collapsed


df['headline_category'] = collapse_categories(df['headline_category'])

In [7]:
# Count the distinct category labels remaining after the keyword-based cleaning.
df['headline_category'].nunique()

384

In [8]:
# Keep only the rows whose category is one of the eight collapsed labels.
# .copy() makes xdf an independent frame: later cells add a column to it and
# overwrite labels in it, and doing that on a bare boolean-filtered selection
# of df triggers pandas' SettingWithCopyWarning.
TARGET_CATEGORIES = ['India', 'health', 'business', 'politics', 'sports', 'tech', 'world', 'entertainment']
xdf = df[df['headline_category'].isin(TARGET_CATEGORIES)].copy()

In [9]:
# Display the filtered frame (pandas truncates the middle rows in the output).
xdf

Unnamed: 0,publish_date,headline_category,headline_text
85,20010102,entertainment,Raju Chacha
126,20010103,entertainment,'Devdas': Jinxed?
272,20010104,India,Dudhwa tiger died of starvation; not poisoning
273,20010104,India,Three in race for chief secy's post
274,20010104,India,Druggists' stir leads to shortage of medicines
...,...,...,...
3876552,20230630,India,10 PIs move HC over thwarted seniority
3876553,20230630,India,Govt notifies award in memory of Parrikar for ...
3876554,20230630,India,After youth's death; PWD installs crash barrie...
3876555,20230630,India,Authorities not acting against CRZ violations


In [10]:
# List the labels present in the filtered frame.
xdf['headline_category'].unique()

array(['entertainment', 'India', 'sports', 'world', 'health', 'tech',
       'politics', 'business'], dtype=object)

In [None]:
# Add the cleaned headline text as a new column. .assign() returns a new
# DataFrame, so xdf is rebound to a fresh frame instead of having a column
# written into the boolean-filtered selection of df it was created from
# (which triggers pandas' SettingWithCopyWarning).
xdf = xdf.assign(processed_text=xdf['headline_text'].apply(preprocess_text))

# Save the result; it can be reloaded with the commented read_csv line below.
xdf.to_csv('processed_data.csv', index=False)
# xdf = pd.read_csv('processed_data.csv')

xdf.head()

In [None]:
# xdf = pd.read_csv('/content/drive/MyDrive/Project_Shruti/processed_data.csv')

# xdf.head()

In [None]:
# Fold 'politics' into the India class and give that class a single spelling.
# The category cleaning step labels India rows 'India' (capital I), while this
# cell and the later `== 'india'` filters use lowercase. Renaming only
# 'politics' therefore left every original 'India' row outside the 'india'
# class. Exact-label replacement is enough here because xdf was filtered down
# to the eight collapsed labels.
# NOTE(review): this assumes the India rows were meant to be part of the
# 'india' class — confirm that was the intent.
xdf = xdf.assign(
    headline_category=xdf['headline_category'].replace(
        {'politics': 'india', 'India': 'india'}
    )
)

In [None]:
# Re-save after the relabelling; this overwrites the earlier processed_data.csv export.
xdf.to_csv('processed_data.csv', index=False)

In [None]:
# Show the (rows, columns) shape of the subset for each of these five labels.
# 'politics', 'entertainment' and 'health' are not inspected here.
for category in ('india', 'business', 'world', 'sports', 'tech'):
    category_rows = xdf[xdf['headline_category'] == category]
    display(category_rows.shape)

In [None]:
def _last_rows(category, n_rows):
    """Return the final ``n_rows`` rows of ``xdf`` labelled ``category``."""
    return xdf[xdf['headline_category'] == category].tail(n_rows)


# One sample per class: 3000 trailing rows each, except business (1273).
# No sample is taken for 'politics', 'entertainment' or 'health'.
idf = _last_rows('india', 3000)
bdf = _last_rows('business', 1273)
wdf = _last_rows('world', 3000)
sdf = _last_rows('sports', 3000)
tdf = _last_rows('tech', 3000)

In [None]:
# Stack the five per-class samples into one frame with a fresh integer index.
class_samples = [idf, bdf, wdf, sdf, tdf]
combined_df = pd.concat(class_samples, ignore_index=True)
combined_df.head()

In [None]:
# List the class labels present in the combined training frame.
combined_df['headline_category'].unique()

In [None]:
# Features: the preprocessed headline text, cast to str.
X = combined_df['processed_text'].astype(str)
# Target: the category label of each row.
y = combined_df['headline_category']

In [None]:
# TF-IDF over unigrams and bigrams, capped at 10000 features, with
# scikit-learn's built-in English stop word list applied.
# NOTE(review): this fit sees every row of combined_df, i.e. it runs before
# any train/test split has been made.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(X)

In [None]:
# Encode the string category labels as integer class codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Display the target label series.
y

In [None]:
# Placeholder; label_map is reassigned in the next cell.
label_map = {}

In [None]:
# Map each integer class code back to its category name. The fitted
# LabelEncoder already holds this mapping (classes_[i] is the label encoded as
# i), so there is no need to build a DataFrame and zip every row to recover
# it. int() keeps the keys as plain Python ints rather than numpy integers.
label_map = {int(code): name for code, name in enumerate(label_encoder.classes_)}
label_map

In [None]:
# Split the raw text first and fit TF-IDF on the training rows only. Fitting
# the vectorizer on the whole corpus before splitting lets the vocabulary and
# IDF weights see the test headlines, which leaks test information into the
# features and inflates the reported scores. The vectorizer object is re-fitted
# here, so the instance pickled later is the train-only fit.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Define base models
base_models = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    # probability=True so the SVM exposes predict_proba.
    ('svm', SVC(kernel='linear', probability=True, random_state=42))
]

In [None]:
# Final estimator that combines the base models' outputs.
meta_model = LogisticRegression()

# Stacking ensemble: base-model predictions are produced with 3-fold CV.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)

In [None]:
# Fit the stacking ensemble on the training split.
stacking_model.fit(X_train, y_train)

In [None]:
# Predict integer class codes for the held-out test split.
y_pred = stacking_model.predict(X_test)

In [None]:
# Overall accuracy on the test split, then per-class precision/recall/F1 with
# the original category names as row labels.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
per_class_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(per_class_report)

In [None]:
# Confusion matrix heatmap: rows are the true classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Serialize the fitted ensemble. The with-block closes the file handle (and
# flushes it to disk) as soon as the dump finishes; the bare open() call left
# it open until garbage collection.
with open("stack_model_2.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)

In [None]:
# Serialize the fitted TF-IDF vectorizer. The with-block closes the file
# handle (and flushes it to disk) as soon as the dump finishes; the bare
# open() call left it open until garbage collection.
with open("vectorizer_2.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)