In [None]:
!tar -xvf/content/20news-bydate.tar.gz


In [30]:
import sklearn.datasets as skd
import pandas as pd
import numpy as py
import nltk
# NLTK resources used further down: the English stop-word list, WordNet for
# the lemmatizer, and the Punkt models behind word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the separate 'punkt_tab'
# resource; without it word_tokenize raises LookupError on a fresh kernel.
nltk.download('punkt_tab')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Read the extracted train and test folders, decoding every file as ISO-8859-1.
# The returned objects expose the documents as `.data` and the labels as
# `.target`; both are used when the DataFrames are built.
news_train = skd.load_files(
    container_path="/content/20news-bydate-train",
    encoding="ISO-8859-1",
)
news_test = skd.load_files(
    container_path="/content/20news-bydate-test",
    encoding="ISO-8859-1",
)


In [None]:
# Peek at the first few raw documents only; displaying the whole list would
# dump every training post into the notebook output.
news_train.data[:3]

In [7]:
# Tokens to discard during cleaning, kept as one list built from three parts:
#   - NLTK's English stop words
#   - every single character in string.printable (one-character tokens)
#   - header/boilerplate words that appear in the posts
stop_words = (
    stopwords.words("english")
    + list(string.printable)
    + ["subject","organization","university","lines","com","ac","edu"]
)

In [8]:
# Single WordNet lemmatizer instance, reused by the train and test cleaning cells.
lemmatizer = WordNetLemmatizer()

In [9]:
# Training split as a two-column frame: the raw post text and its label.
news_train_df = pd.DataFrame(
    {
        "text": news_train.data,
        "category": news_train.target,
    }
)

In [10]:
# Test split as a two-column frame: the raw post text and its label.
news_test_df = pd.DataFrame(
    {
        "text": news_test.data,
        "category": news_test.target,
    }
)

In [11]:
# Build the "clean text" column for the training split:
#   1. replace every run of punctuation, underscores and digits with one space
#      (the same pattern the test split is cleaned with, so both splits go
#      through identical preprocessing),
#   2. tokenise with NLTK and lower-case each token once,
#   3. drop tokens listed in `stop_words`,
#   4. lemmatise what is left and join it back into one string.
_NOISE_RE = re.compile(r'([^\s\w]|_|[0-9])+')
# Set lookup instead of scanning the stop-word list for every single token.
_STOP_SET = frozenset(stop_words)

news_train_df["clean text"] = news_train_df["text"].apply(
    lambda doc: " ".join(
        lemmatizer.lemmatize(token)
        for token in map(str.lower, word_tokenize(_NOISE_RE.sub(" ", str(doc))))
        if token not in _STOP_SET
    )
)

In [16]:
# Build the "clean text" column for the test split:
#   1. replace every run of punctuation, underscores and digits with one space,
#   2. tokenise with NLTK and lower-case each token once,
#   3. drop tokens listed in `stop_words`,
#   4. lemmatise what is left and join it back into one string.
_NOISE_RE = re.compile(r'([^\s\w]|_|[0-9])+')
# Set lookup instead of scanning the stop-word list for every single token.
_STOP_SET = frozenset(stop_words)

news_test_df["clean text"] = news_test_df["text"].apply(
    lambda doc: " ".join(
        lemmatizer.lemmatize(token)
        for token in map(str.lower, word_tokenize(_NOISE_RE.sub(" ", str(doc))))
        if token not in _STOP_SET
    )
)

In [None]:
# Display the training frame without the bulky raw-text column.
# drop() returns a new frame, so news_train_df itself keeps "text".
news_train_df.drop(columns=["text"])

Unnamed: 0,category,clean text
0,2,csd keele uk yearsley change licensed data win...
1,2,arizona state james cassidy ifjxc asuacad bitn...
2,5,davewood bruno c colorado david rex wood error...
3,2,gay selkirk sfu ca ian gay change licensed dat...
4,0,tony lezard tony mantis co uk atheist distribu...
...,...,...
6842,8,bloom inland extraordinary footpeg engineering...
6843,5,stevedav netcom steve davidson moolit olit net...
6844,2,herrod c stanford stephen herrod do window mac...
6845,4,cherkaue ee rochester brian cherkauer iivx upg...


In [None]:
# Display the test frame without the bulky raw-text column.
# drop() returns a new frame, so news_test_df itself keeps "text".
news_test_df.drop(columns=["text"])

## **CountVectorizer**

In [31]:
# Labels for both splits.
y_train = news_train_df['category']
y_test = news_test_df["category"]

# Word-count features: the vocabulary is fitted on the training text only and
# then reused unchanged to transform the test text.
count_vect = CountVectorizer()
X_train_cv = count_vect.fit_transform(news_train_df["clean text"])
X_test_cv = count_vect.transform(news_test_df["clean text"])

# Multinomial Naive Bayes on the counts; fit() returns the estimator itself.
mnb = MultinomialNB().fit(X_train_cv, y_train)
y_pred_cv_mnb = mnb.predict(X_test_cv)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, y_pred_cv_mnb))
print(classification_report(y_test, y_pred_cv_mnb))

0.8020446096654275
              precision    recall  f1-score   support

           0       0.79      0.82      0.80       319
           1       0.65      0.80      0.72       389
           2       0.79      0.04      0.07       394
           3       0.56      0.79      0.65       392
           4       0.80      0.82      0.81       385
           5       0.69      0.82      0.75       395
           6       0.91      0.72      0.80       390
           7       0.87      0.91      0.89       396
           8       0.95      0.94      0.94       398
           9       0.97      0.92      0.94       397
          10       0.93      0.97      0.95       399
          11       0.78      0.96      0.86       396
          12       0.79      0.71      0.75       393
          13       0.89      0.86      0.87       396
          14       0.85      0.91      0.88       394
          15       0.82      0.95      0.88       398
          16       0.72      0.93      0.81       364
        

## **TfidfVectorizer**

In [33]:
# Labels for both splits.
y_train = news_train_df['category']
y_test = news_test_df["category"]

# TF-IDF features: the vectorizer is fitted on the training text only and then
# reused unchanged to transform the test text.
tfidfV = TfidfVectorizer()
X_train_tfidfV = tfidfV.fit_transform(news_train_df["clean text"])
X_test_tfidfV = tfidfV.transform(news_test_df["clean text"])

# Fresh Multinomial Naive Bayes model; note that `mnb` is rebound here and no
# longer refers to the classifier trained on raw counts.
mnb = MultinomialNB().fit(X_train_tfidfV, y_train)
y_pred_tfidfV_mnb = mnb.predict(X_test_tfidfV)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, y_pred_tfidfV_mnb))
print(classification_report(y_test, y_pred_tfidfV_mnb))

0.8118693574083908
              precision    recall  f1-score   support

           0       0.82      0.68      0.74       319
           1       0.78      0.71      0.74       389
           2       0.80      0.69      0.74       394
           3       0.64      0.81      0.72       392
           4       0.86      0.81      0.83       385
           5       0.87      0.80      0.83       395
           6       0.88      0.75      0.81       390
           7       0.87      0.92      0.90       396
           8       0.94      0.96      0.95       398
           9       0.93      0.93      0.93       397
          10       0.89      0.98      0.93       399
          11       0.72      0.97      0.82       396
          12       0.82      0.65      0.73       393
          13       0.92      0.78      0.84       396
          14       0.85      0.93      0.89       394
          15       0.62      0.96      0.75       398
          16       0.65      0.95      0.78       364
        