In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
import re
from nltk import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_json('/content/drive/My Drive/Text Mining/archive/News_Category_Dataset_v3.json', lines = True)
df.head(5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
print("Number of rows (before deletion): ", len(df))

df = df[((df['headline'].str.strip() != "")  | (df['short_description'].str.strip() != "")) & (df['category'].str.strip() != "")]
df.reset_index(drop = True)

print("Number of rows (after deletion): ", len(df))

Number of rows (before deletion):  209527
Number of rows (after deletion):  209522


In [5]:
def tokenize(text):
    text = re.sub("[ ]{2,}", " ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    return " ".join(word_tokenize(text.lower()))

df['headline_tokenized'] = df['headline'].apply(lambda x: tokenize(x))
df['short_description_tokenized'] = df['short_description'].apply(lambda x: tokenize(x))

df['text'] = df['headline_tokenized'] + " " + df['short_description_tokenized']

df = df.reset_index(drop = True)

In [6]:
df.head(5)

Unnamed: 0,link,headline,category,short_description,authors,date,headline_tokenized,short_description_tokenized,text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,over 4 million americans roll up sleeves for o...,health experts said it is too early to predict...,over 4 million americans roll up sleeves for o...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,american airlines flyer charged banned for lif...,he was subdued by passengers and crew when he ...,american airlines flyer charged banned for lif...
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 of the funniest tweets about cats and dogs ...,until you have a dog you don t understand what...,23 of the funniest tweets about cats and dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,the funniest tweets from parents this week sep...,accidentally put grown up toothpaste on my tod...,the funniest tweets from parents this week sep...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,woman who called cops on black bird watcher lo...,amy cooper accused investment firm franklin te...,woman who called cops on black bird watcher lo...


In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['category'])

train_df = train_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

In [8]:
train_data = train_df.iloc[:]['text']
count_vectorizer_unigram = CountVectorizer(max_features = 20000, ngram_range=(1,1))
count_vectorizer_unigram.fit(train_data)
train_vector = count_vectorizer_unigram.transform(train_data)

y_train = train_df.iloc[:]['category']

print(train_vector.shape)

(167617, 20000)


In [9]:
test_data = test_df.iloc[:]['text']
test_vector = count_vectorizer_unigram.transform(test_data)

y_test = test_df.iloc[:]['category']

print(test_vector.shape)

(41905, 20000)


In [10]:
# Naive Bayes using Count Vectors

skf = StratifiedKFold(n_splits=5)

i = 1

f1s_nb = list()

for train_index, test_index in skf.split(train_vector, y_train):
    X_Train_NB_KFold = train_vector[train_index]
    X_Test_NB_KFold = train_vector[test_index]
    Y_Train_NB_KFold = y_train[train_index]
    Y_Test_NB_KFold = y_train[test_index]

    naive_bayes = MultinomialNB()
    naive_bayes.fit(X_Train_NB_KFold, Y_Train_NB_KFold)
    Predictions_NB_KFold = naive_bayes.predict(X_Test_NB_KFold)

    print("Performance metrics at fold ", i  ," are : ")
    print("Accuracy score: ", accuracy_score(Y_Test_NB_KFold, Predictions_NB_KFold))
    print("Precision (weighted): ", precision_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted'))
    print("Recall (weighted): ", recall_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted'))

    f1_nb = f1_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted')
    f1s_nb.append(f1_nb)

    print("F1 Score (weighted): ", f1_nb)
    print()
    i+=1

Performance metrics at fold  1  are : 
Accuracy score:  0.5732907767569503
Precision (weighted):  0.563387536318956
Recall (weighted):  0.5732907767569503
F1 Score (weighted):  0.5483328184394336

Performance metrics at fold  2  are : 
Accuracy score:  0.5698007397685241
Precision (weighted):  0.5616891871650729
Recall (weighted):  0.5698007397685241
F1 Score (weighted):  0.5455330028345223

Performance metrics at fold  3  are : 
Accuracy score:  0.5687438475076813
Precision (weighted):  0.5624462089726713
Recall (weighted):  0.5687438475076813
F1 Score (weighted):  0.5442187216700818

Performance metrics at fold  4  are : 
Accuracy score:  0.5745010888047013
Precision (weighted):  0.5651989384631961
Recall (weighted):  0.5745010888047013
F1 Score (weighted):  0.5494589472175279

Performance metrics at fold  5  are : 
Accuracy score:  0.5725919517942905
Precision (weighted):  0.5639954741633969
Recall (weighted):  0.5725919517942905
F1 Score (weighted):  0.547386938743155



In [11]:
naive_bayes = MultinomialNB()
naive_bayes.fit(train_vector, y_train)

predictions_y =  naive_bayes.predict(test_vector)

print("Accuracy score: ", accuracy_score(y_test, predictions_y))
print("Precision (weighted): ", precision_score(y_test, predictions_y, average = 'weighted'))
print("Recall (weighted): ", recall_score(y_test, predictions_y, average = 'weighted'))
print("F1 score (weighted): ", f1_score(y_test, predictions_y, average = 'weighted'))

Accuracy score:  0.5779739887841546
Precision (weighted):  0.5721136511738948
Recall (weighted):  0.5779739887841546
F1 score (weighted):  0.5588115549581871


In [None]:
# Linear SVM using Count Vectors

skf = StratifiedKFold(n_splits=2)

sc = StandardScaler(with_mean=False)
le = LabelEncoder()

i = 1

f1s_svm = list()

for train_index, test_index in skf.split(train_vector, y_train):
    X_Train_Svm_KFold = train_vector[train_index]
    X_Test_Svm_KFold = train_vector[test_index]

    sc.fit(X_Train_Svm_KFold)
    X_Train_Svm_KFold_std = sc.transform(X_Train_Svm_KFold)
    X_Test_Svm_KFold_std = sc.transform(X_Test_Svm_KFold)

    Y_Train_Svm_KFold = y_train[train_index]
    Y_Test_Svm_KFold = y_train[test_index]

    Y_Train_Svm_KFold = le.fit_transform(Y_Train_Svm_KFold)
    Y_Test_Svm_KFold = le.fit_transform(Y_Test_Svm_KFold)

    linear_svm = svm.SVC(C=1.0, kernel='linear')
    linear_svm.fit(X_Train_Svm_KFold_std, Y_Train_Svm_KFold)
    Predictions_Svm_KFold = linear_svm.predict(X_Test_Svm_KFold_std)

    print("Performance metrics at fold ", i ," are : ")
    print("Accuracy score: ", accuracy_score(Y_Test_Svm_KFold, Predictions_Svm_KFold))
    print("Precision (weighted): ", precision_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted'))
    print("Recall (weighted): ", recall_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted'))

    f1_svm = f1_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted')
    f1s_svm.append(f1_svm)

    print("F1 Score (weighted): ", f1_svm)
    print()
    i+=1

In [12]:
sc = StandardScaler(with_mean=False)
le = LabelEncoder()

sc.fit(train_vector)
train_vector_std = sc.transform(train_vector)
test_vector_std = sc.transform(test_vector)

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.fit_transform(y_test)

linear_svm = svm.SVC(C=1.0, kernel='linear')
linear_svm.fit(train_vector_std, y_train_encoded)
predictions_y = linear_svm.predict(test_vector_std)

print("Accuracy score: ", accuracy_score(y_test_encoded, predictions_y))
print("Precision (weighted): ", precision_score(y_test_encoded, predictions_y, average = 'weighted'))
print("Recall (weighted): ", recall_score(y_test_encoded, predictions_y, average = 'weighted'))
print("F1 score (weighted): ", f1_score(y_test_encoded, predictions_y, average = 'weighted'))


Accuracy score:  0.5102732370838802
Precision (weighted):  0.527620332321137
Recall (weighted):  0.5102732370838802
F1 score (weighted):  0.5157862599709725


In [14]:
train_data = train_df.iloc[:]['text']

tfidf_vectorizer_unigram = TfidfVectorizer(max_features = 20000, ngram_range=(1,1))
tfidf_vectorizer_unigram.fit(train_data)
train_vector = tfidf_vectorizer_unigram.transform(train_data)

y_train = train_df.iloc[:]['category']

print(train_vector.shape)

(167617, 20000)


In [15]:
test_data = test_df.iloc[:]['text']
test_vector = tfidf_vectorizer_unigram.transform(test_data)

y_test = test_df.iloc[:]['category']

print(test_vector.shape)

(41905, 20000)


In [16]:
# Naive Bayes using TFIDF vectors

skf = StratifiedKFold(n_splits=5)

i = 1

f1s_nb = list()

for train_index, test_index in skf.split(train_vector, y_train):
    X_Train_NB_KFold = train_vector[train_index]
    X_Test_NB_KFold = train_vector[test_index]
    Y_Train_NB_KFold = y_train[train_index]
    Y_Test_NB_KFold = y_train[test_index]

    naive_bayes = MultinomialNB()
    naive_bayes.fit(X_Train_NB_KFold, Y_Train_NB_KFold)
    Predictions_NB_KFold = naive_bayes.predict(X_Test_NB_KFold)

    print("Performance metrics at fold ", i  ," are : ")
    print("Accuracy score: ", accuracy_score(Y_Test_NB_KFold, Predictions_NB_KFold))
    print("Precision (weighted): ", precision_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted', labels=np.unique(Predictions_NB_KFold)))
    print("Recall (weighted): ", recall_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted', labels=np.unique(Predictions_NB_KFold)))

    f1_nb = f1_score(Y_Test_NB_KFold, Predictions_NB_KFold, average = 'weighted', labels=np.unique(Predictions_NB_KFold))
    f1s_nb.append(f1_nb)

    print("F1 Score (weighted): ", f1_nb)
    print()
    i+=1

Performance metrics at fold  1  are : 
Accuracy score:  0.4613411287435867
Precision (weighted):  0.6013157204848885
Recall (weighted):  0.4997576501761075
F1 Score (weighted):  0.39448843222077706

Performance metrics at fold  2  are : 
Accuracy score:  0.4616394225032812
Precision (weighted):  0.6036440505795081
Recall (weighted):  0.513453435519724
F1 Score (weighted):  0.4056298971363276

Performance metrics at fold  3  are : 
Accuracy score:  0.45917728126957613
Precision (weighted):  0.613747419655402
Recall (weighted):  0.501972933311593
F1 Score (weighted):  0.3963204006770506

Performance metrics at fold  4  are : 
Accuracy score:  0.4642484264534797
Precision (weighted):  0.6157131581536434
Recall (weighted):  0.5075001630470227
F1 Score (weighted):  0.4010604708220386

Performance metrics at fold  5  are : 
Accuracy score:  0.4640694448587537
Precision (weighted):  0.6070034076187157
Recall (weighted):  0.5026656757892016
F1 Score (weighted):  0.39796111623524344



In [19]:
naive_bayes = MultinomialNB()
naive_bayes.fit(train_vector, y_train)

predictions_y =  naive_bayes.predict(test_vector)

print("Accuracy score: ", accuracy_score(y_test, predictions_y))
print("Precision (weighted): ", precision_score(y_test, predictions_y, average = 'weighted', labels=np.unique(predictions_y)))
print("Recall (weighted): ", recall_score(y_test, predictions_y, average = 'weighted', labels=np.unique(predictions_y)))
print("F1 score (weighted): ", f1_score(y_test, predictions_y, average = 'weighted', labels=np.unique(predictions_y)))

Accuracy score:  0.47736546951437775
Precision (weighted):  0.6094488530280405
Recall (weighted):  0.5132915939648979
F1 score (weighted):  0.41665833939917407


In [None]:
# Linear SVM using TFIDF vectors

skf = StratifiedKFold(n_splits=2)

sc = StandardScaler(with_mean=False)
le = LabelEncoder()

i = 1

f1s_svm = list()

for train_index, test_index in skf.split(train_vector, y_train):
    X_Train_Svm_KFold = train_vector[train_index]
    X_Test_Svm_KFold = train_vector[test_index]

    sc.fit(X_Train_Svm_KFold)
    X_Train_Svm_KFold_std = sc.transform(X_Train_Svm_KFold)
    X_Test_Svm_KFold_std = sc.transform(X_Test_Svm_KFold)

    Y_Train_Svm_KFold = y_train[train_index]
    Y_Test_Svm_KFold = y_train[test_index]

    Y_Train_Svm_KFold = le.fit_transform(Y_Train_Svm_KFold)
    Y_Test_Svm_KFold = le.fit_transform(Y_Test_Svm_KFold)

    linear_svm = svm.SVC(C=1.0, kernel='linear', gamma='auto')
    linear_svm.fit(X_Train_Svm_KFold_std, Y_Train_Svm_KFold)
    Predictions_Svm_KFold = linear_svm.predict(X_Test_Svm_KFold_std)

    print("Performance metrics at fold ", i ," are : ")
    print("Accuracy score: ", accuracy_score(Y_Test_Svm_KFold, Predictions_Svm_KFold))
    print("Precision (weighted): ", precision_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted'))
    print("Recall (weighted): ", recall_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted'))

    f1_svm = f1_score(Y_Test_Svm_KFold, Predictions_Svm_KFold, average = 'weighted')
    f1s_svm.append(f1_svm)

    print("F1 Score (weighted): ", f1_svm)
    print()
    i+=1

In [None]:
sc = StandardScaler(with_mean=False)
le = LabelEncoder()

sc.fit(train_vector)
train_vector_std = sc.transform(train_vector)
test_vector_std = sc.transform(test_vector)

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.fit_transform(y_test)

linear_svm = svm.SVC(C=1.0, kernel='linear')
linear_svm.fit(train_vector_std, y_train_encoded)
predictions_y = linear_svm.predict(test_vector_std)

print("Accuracy score: ", accuracy_score(y_test_encoded, predictions_y))
print("Precision (weighted): ", precision_score(y_test_encoded, predictions_y, average = 'weighted'))
print("Recall (weighted): ", recall_score(y_test_encoded, predictions_y, average = 'weighted'))
print("F1 score (weighted): ", f1_score(y_test_encoded, predictions_y, average = 'weighted'))

In [None]:
# Feature selection using mutual information

selector = SelectKBest(mutual_info_classif, k=5000)

train_vector_mutual_info_filtered = selector.fit_transform(train_vector, y_train)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def find_pos_tag(postag):
    if postag.startswith('J'):
        return wordnet.ADJ
    elif postag.startswith('V'):
        return wordnet.VERB
    elif postag.startswith('N'):
        return wordnet.NOUN
    elif postag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    l = list()
    postags = nltk.pos_tag(text.split(" "))
    for word, tag in postags:
        # print(find_pos_tag(word))
        a = lemmatizer.lemmatize(word, find_pos_tag(tag))
        l.append(a)
    return " ".join(l)

df['text_lemmatized'] = df['text'].apply(lambda x: lemmatize_text(x))