## News Category Prediction Using a Classification Approach

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from fuzzywuzzy import fuzz
import re

import warnings
warnings.filterwarnings('ignore')



In [2]:
# Read the line-delimited JSON dump (lines=True: one record per line) into a DataFrame.
dataset_path = "News_Category_Dataset_v3.json"
df = pd.read_json(dataset_path, lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(209527, 6)

In [4]:
# Work on a small sample: keep only the first 1000 rows.
# `.copy()` detaches the sample from the full frame, so the columns added in
# later cells are written to an independent DataFrame instead of to a slice of
# the original (the situation pandas reports as SettingWithCopyWarning).

df = df.head(1000).copy()

In [5]:
# Combine each headline and its short description into one text field,
# separated by a single space.
headline_with_space = df['headline'].add(' ')
df['text'] = headline_with_space.add(df['short_description'])

In [6]:
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stop_words

# English pipeline used by preprocess_text to tokenise and lemmatise the text.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Each token
    is replaced by its lower-cased, stripped lemma and is then dropped when it
    is empty, listed in ``stop_words``, or made up only of punctuation
    characters.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    punctuation_chars = set(string.punctuation)
    kept = []
    for token in nlp(text):
        # '-PRON-' is the placeholder lemma older spaCy releases use for
        # pronouns; fall back to the lower-cased surface form in that case.
        if token.lemma_ != '-PRON-':
            word = token.lemma_.lower().strip()
        else:
            word = token.lower_

        # Whitespace-only tokens collapse to '' after strip(); drop them
        # explicitly together with stop words.
        if not word or word in stop_words:
            continue

        # Test character by character: a substring test against the
        # string.punctuation constant would let multi-character punctuation
        # such as '...' or '--' through.
        if all(char in punctuation_chars for char in word):
            continue

        kept.append(word)

    return ' '.join(kept)

# Run the cleaning function over every combined text and store the result
# next to the raw text.
df['processed_text'] = df['text'].map(preprocess_text)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,text,processed_text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...,4 million americans roll sleeves omicron targe...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charge ban life punchi...
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 Of The Funniest Tweets About Cats And Dogs ...,23 funniest tweets cats dogs week sept. 17 23 ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,The Funniest Tweets From Parents This Week (Se...,funniest tweets parents week sept. 17 23 accid...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,Woman Who Called Cops On Black Bird-Watcher Lo...,woman cop black bird watcher lose lawsuit ex e...


In [10]:
# Hold out 20% of the articles for evaluation. The raw-text splits get their
# own names so the TF-IDF matrices below do not overwrite them.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['processed_text'], df['category'], test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary on the training split only and reuse it for the
# test split. lowercase=False because preprocess_text already lower-cases.
vectorizer = TfidfVectorizer(lowercase=False)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Base estimator for the search (an instance, not the class itself).
# GridSearchCV clones it for every candidate, so `model` itself stays unfitted.
model = RandomForestClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 100, 200, 300],
    'random_state': [42],  # single value: fixes the seed for every candidate
}

rf_model = GridSearchCV(model, param_grid, scoring='accuracy')
rf_model.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print it.
print("Best parameters:", rf_model.best_params_)

# Evaluate the refitted best estimator on the held-out split.
best_model = rf_model.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
f1_Score = f1_score(y_test, y_pred, average="weighted")
print(f1_Score)


Accuracy: 0.59
0.5678406515388021


#### Similarity 

In [None]:
# Similarity Calculation
# `feature_matrix` is not created by any earlier cell, so it is built here as
# the TF-IDF matrix of every processed document, using the same vectorizer
# settings as the classification cell.
# NOTE(review): assumed to be the intended feature representation - confirm.
feature_matrix = TfidfVectorizer(lowercase=False).fit_transform(df['processed_text'])

# Pairwise comparison of every document with every other document.
cosine_sim = cosine_similarity(feature_matrix)
euclidean_dist = euclidean_distances(feature_matrix)

In [None]:
# Jaccard Similarity
def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float between 0.0 (no shared members) and 1.0 (identical sets).
        Two empty sets are treated as identical and score 1.0 rather than
        raising ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets are empty: by convention they are identical.
        return 1.0
    return len(set1.intersection(set2)) / union_size

In [None]:
# Tokenise every document once, up front. Calling set() directly on a string
# would give a set of characters, so split on whitespace to compare words.
token_sets = [set(document.split()) for document in df['processed_text']]

# jaccard_sim[i][j] is the word-level Jaccard similarity of documents i and j.
jaccard_sim = [
    [jaccard_similarity(left, right) for right in token_sets]
    for left in token_sets
]

In [None]:
# Pairwise fuzz.ratio scores on the raw texts.
# Note: fuzz.ratio returns a similarity score (0-100, higher means more alike),
# so despite the variable name these values are not distances.
raw_texts = [df['text'].iloc[position] for position in range(len(df))]
levenshtein_dist = [
    [fuzz.ratio(first, second) for second in raw_texts]
    for first in raw_texts
]

In [None]:
# Calculate Jaro-Winkler similarity
# fuzzywuzzy's `fuzz` module has no `jaro_winkler` function, so the metric is
# implemented here in plain Python. Scores are floats in [0, 1].


def _jaro(first, second):
    """Return the Jaro similarity (0.0-1.0) of two strings."""
    if first == second:
        return 1.0
    # Put the pair in a canonical order so the greedy matching below gives the
    # same score for (a, b) and (b, a).
    if (len(first), first) > (len(second), second):
        first, second = second, first
    len_first, len_second = len(first), len(second)
    if len_first == 0:
        return 0.0

    # Characters only count as matching when they are at most `window` apart.
    window = max(len_second // 2 - 1, 0)
    first_matched = [False] * len_first
    second_matched = [False] * len_second
    matches = 0
    for i, char in enumerate(first):
        low = max(0, i - window)
        high = min(len_second, i + window + 1)
        for j in range(low, high):
            if not second_matched[j] and second[j] == char:
                first_matched[i] = True
                second_matched[j] = True
                matches += 1
                break
    if matches == 0:
        return 0.0

    # Count matched characters that appear in a different order in each string.
    out_of_order = 0
    k = 0
    for i in range(len_first):
        if first_matched[i]:
            while not second_matched[k]:
                k += 1
            if first[i] != second[k]:
                out_of_order += 1
            k += 1
    transpositions = out_of_order // 2

    return (
        matches / len_first
        + matches / len_second
        + (matches - transpositions) / matches
    ) / 3


def _jaro_winkler(first, second, prefix_scale=0.1, max_prefix=4):
    """Return the Jaro score boosted for a shared prefix of up to max_prefix chars."""
    jaro = _jaro(first, second)
    prefix = 0
    for char_a, char_b in zip(first[:max_prefix], second[:max_prefix]):
        if char_a != char_b:
            break
        prefix += 1
    return jaro + prefix * prefix_scale * (1.0 - jaro)


texts = df['text'].tolist()
doc_count = len(texts)

# The score is symmetric and 1.0 on the diagonal, so only the upper triangle
# is computed and then mirrored. This is still quadratic in the number of
# documents and slow in pure Python for large samples.
jaro_sim = [[1.0] * doc_count for _ in range(doc_count)]
for i in range(doc_count):
    for j in range(i + 1, doc_count):
        score = _jaro_winkler(texts[i], texts[j])
        jaro_sim[i][j] = score
        jaro_sim[j][i] = score