<a href="https://colab.research.google.com/github/Samyak0204/News-Recommender-System-with-Online-Training/blob/main/news_category_predictor/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
stop_words=set(stopwords.words("english"))
lemmatizer=WordNetLemmatizer()

def clean_text(text):
  if pd.isna(text):
    return ""
  text=re.sub(r"<.*?>"," ",text)
  text=text.lower()
  text=text.translate(str.maketrans("","",string.punctuation))
  tokens=text.split()
  tokens=[lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
  return " ".join(tokens)

In [3]:
mind=pd.read_csv('/content/drive/MyDrive/news recommender/news.tsv',sep='\t',header=None)
mind.columns=["newsid","category","subcategory","title","abstract","url","title_entities","abstract_entities"]
mind.head()

Unnamed: 0,newsid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [4]:
kaggle_df=pd.read_csv('/content/drive/MyDrive/news recommender/result_final.csv')
kaggle_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,link,text,title,date,keywords,summary,title_summary
0,0,0,http://techcrunch.com/2020/09/07/vodafone-idea...,"Vodafone Idea, one of the largest telecom oper...",Indian telecom giant Vodafone Idea rebrands as...,2020-09-07 00:00:00,"['rebrands', 'idea', 'vi', 'giant', 'brand', '...","Vodafone Idea, one of the largest telecom oper...",Indian telecom giant Vodafone Idea rebrands as...
1,1,1,http://techcrunch.com/2020/09/16/facebook-addr...,"At the beginning of the previous decade, Faceb...",Facebook addresses political controversy in In...,2020-09-16 00:00:00,"['opportunities', 'whatsapp', 'controversy', '...",Politicians in the country today heavily rely ...,Facebook addresses political controversy in In...
2,2,2,http://techcrunch.com/2020/09/14/youtube-launc...,"As TikTok’s fate in the U.S. remains murky, Yo...","YouTube launches its TikTok rival, YouTube Sho...",2020-09-14 00:00:00,"['rival', 'video', 'feature', 'access', 'youtu...","As TikTok’s fate in the U.S. remains murky, Yo...","YouTube launches its TikTok rival, YouTube Sho..."
3,3,3,http://techcrunch.com/2020/09/09/groww-an-inve...,Even as more than 150 million people are using...,"Groww, an investment app for millennials in In...",2020-09-09 00:00:00,"['world', 'yc', 'continuity', 'stocks', 'groww...","YC Continuity, the growth-stage investment fun...","Groww, an investment app for millennials in In..."
4,4,4,http://techcrunch.com/2020/09/15/lanzatech-is-...,As part of the continuing global rollout of La...,LanzaTech is developing a small-scale waste bi...,2020-09-15 00:00:00,"['production', 'distributed', 'developing', 's...",As part of the continuing global rollout of La...,LanzaTech is developing a small-scale waste bi...


In [5]:
mind['content']=(mind['title'].fillna("")+" "+mind['abstract'].fillna("")).apply(clean_text)
kaggle_df['content']=(kaggle_df['title'].fillna("")+" "+kaggle_df['text'].fillna("")).apply(clean_text)

In [6]:
X=mind['content']
y=mind['category']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

clf = Pipeline([
    ("tfidf",TfidfVectorizer(max_features=5000)),
    ("logreg",LogisticRegression(max_iter=200))
])

clf.fit(X_train,y_train)

In [7]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

        autos       0.70      0.50      0.58       612
entertainment       0.66      0.22      0.33       153
      finance       0.63      0.51      0.56      1227
 foodanddrink       0.78      0.71      0.74       881
       health       0.72      0.60      0.65       608
         kids       0.00      0.00      0.00        15
    lifestyle       0.56      0.45      0.50       897
       movies       0.70      0.41      0.51       179
        music       0.79      0.48      0.60       249
         news       0.65      0.84      0.74      6140
       sports       0.90      0.94      0.92      6368
       travel       0.54      0.40      0.46       958
           tv       0.55      0.25      0.35       243
        video       0.61      0.29      0.39       928
      weather       0.76      0.67      0.71       848

     accuracy                           0.74     20306
    macro avg       0.64      0.48      0.54     20306
 weighte

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
joblib.dump(clf,"category_classifier.pkl")

['category_classifier.pkl']

In [9]:
kaggle_df['category']=clf.predict(kaggle_df['content'])

In [10]:
combined=pd.concat([mind[["title","abstract","content","category","url"]].rename(columns={"url":"source"}),kaggle_df[["title","text","content","category","link"]].rename(
        columns={"text": "abstract","link":"source"})], ignore_index=True)

In [11]:
combined.to_csv("merged_news_dataset_final.csv", index=False)

print("Merged Dataset")
print(combined.head())

Merged Dataset
                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1    Walmart Slashes Prices on Last-Generation iPads   
2                      50 Worst Habits For Belly Fat   
3  Dispose of unwanted prescription drugs during ...   
4  The Cost of Trump's Aid Freeze in the Trenches...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  Apple's new iPad releases bring big deals on l...   
2  These seemingly harmless habits are holding yo...   
3                                                NaN   
4  Lt. Ivan Molchanets peeked over a parapet of s...   

                                             content   category  \
0  brand queen elizabeth prince charles prince ph...  lifestyle   
1  walmart slash price lastgeneration ipads apple...       news   
2  50 worst habit belly fat seemingly harmless ha...     health   
3  dispose unwanted prescription drug dea t

In [12]:
combined.tail()

Unnamed: 0,title,abstract,content,category,source
103712,Brazil's Guedes finds influence waning as Bols...,BRASILIA (Reuters) - Paulo Guedes has faced hi...,brazil guedes find influence waning bolsonaro ...,news,https://uk.reuters.com/article/uk-brazil-econo...
103713,Bank of England gears up for next stimulus push,LONDON (Reuters) - The Bank of England is expe...,bank england gear next stimulus push london re...,finance,https://www.reuters.com/article/us-britain-boe...
103714,Syria says U.S. sanctions behind acute fuel cr...,AMMAN (Reuters) - Syria is experiencing worsen...,syria say u sanction behind acute fuel crisis ...,news,https://www.reuters.com/article/syria-crisis-f...
103715,"Business & Financial News, U.S & International...",Curiosity high for TV's anything-can-happen vi...,business financial news u international breaki...,news,https://af.reuters.com/article/uk-iran-usa-wor...
103716,METALS-LME copper falls most in seven sessions...,"MELBOURNE, Sept 17 (Reuters) - London copper f...",metalslme copper fall seven session stronger d...,finance,https://in.reuters.com/article/global-metals-i...


In [13]:
from google.colab import files
files.download("merged_news_dataset_final.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
files.download("category_classifier.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>