In [176]:
import collections
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [84]:
data = pd.read_json('../data/News_Category_Dataset_v3.json', lines=True)

In [53]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9,Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters,U.S. NEWS,Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe,"American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video",U.S. NEWS,"He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.",Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets-cats-dogs-september-17-23_n_632de332e4b0695c1d81dc02,23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23),COMEDY,"""Until you have a dog you don't understand what could be eaten.""",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parenting-tweets_l_632d7d15e4b0d12b5403e479,The Funniest Tweets From Parents This Week (Sept. 17-23),PARENTING,"""Accidentally put grown-up toothpaste on my toddler’s toothbrush and he screamed like I was cleaning his teeth with a Carolina Reaper dipped in Tabasco sauce.""",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-loses-discrimination-lawsuit-franklin-templeton_n_632c6463e4b09d8701bd227e,Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer,U.S. NEWS,Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.,Nina Golgowski,2022-09-22


In [55]:
df.category.value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [62]:
list_of_headlines = df.loc[df['category']=='CRIME', 'headline'].to_list()

In [67]:
list_of_words = [word for headline in list_of_headlines for word in headline.split()]
words_dict = collections.Counter(list_of_words)
words_dict

Counter({'Memphis': 1,
         'Police:': 1,
         'Arrest': 1,
         'Made': 1,
         'In': 2,
         "Jogger's": 1,
         'Disappearance': 1,
         'Trump': 1,
         'Org.': 1,
         'CFO': 1,
         'To': 3,
         'Plead': 1,
         'Guilty,': 1,
         'Testify': 1,
         'Against': 1,
         'Company': 1,
         'Officials:': 1,
         'NH': 1,
         'Missing': 1,
         'Girl': 1,
         'Case': 1,
         'Shifts': 1,
         'Homicide': 1,
         'Probe': 1,
         'Albuquerque': 2,
         'Police': 2,
         'Share': 1,
         'Photo': 1,
         'Of': 3,
         'Car': 1,
         'Eyed': 1,
         'Slayings': 1,
         'Muslim': 2,
         'Men': 1,
         'Tell': 1,
         'Community': 1,
         'Be': 1,
         "'Vigilant'": 1,
         'Amid': 1,
         'Series': 1,
         'Murders': 1})

In [77]:
df = pd.DataFrame.from_dict(words_dict, orient='index').reset_index().rename(columns={"index": "word", 0: "count"})
df.head()

Unnamed: 0,word,count
0,Memphis,1
1,Police:,1
2,Arrest,1
3,Made,1
4,In,2


In [180]:
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward! a $6')

['Eighty', 'seven', 'miles', 'to', 'go', 'yet', 'Onward', 'a', '6']

In [186]:
def punctioation_fix(sentence):  
    tokenizer = RegexpTokenizer(r'\w+')
    words_list = tokenizer.tokenize(sentence.lower())
    return words_list

In [100]:
def words_stemming(words_list):    
    ps = PorterStemmer()
    stem_words = [ps.stem(word) for word in words_list]
    return stem_words

In [101]:
def reduce_stop_words(words_list):
    text = ' '.join(words_list)
    filtered_sentence = remove_stopwords(text)
    return filtered_sentence.split()

In [181]:
def adjust_headline(sentence):    
    words_list = punctioation_fix(sentence)
    
    words_list = [word for word in words_list if len(word) > 1]        
    
    words_list = words_stemming(words_list)
    
    words_list = reduce_stop_words(words_list)
    
    return ' '.join(words_list)

In [182]:
def create_vocabulary(data, label_column, data_column):
    list_of_headlines = data[data_column].apply(adjust_headline).tolist()
    set_of_words = set([word for headline in list_of_headlines for word in headline.split()])

    columns = data[label_column].unique().tolist()
    columns =  [col.lower().replace('&', 'n').replace(' ', '_') for col in columns]

    columns_dict = {i: col for i, col in enumerate(columns)}
    columns_dict['index'] = 'word'
    df = pd.DataFrame(index=set_of_words , columns=np.arange(len(columns))).reset_index().rename(columns=columns_dict).fillna(0)
    
    return df, columns

In [None]:
def update_category_values(df):
    for cat in data['category'].unique():
        list_of_headlines = data.loc[data['category']==cat, 'headline'].to_list()
        list_of_words = [word for headline in list_of_headlines for word in adjust_headline(headline).split()]
        words_dict = collections.Counter(list_of_words)
        
        for word, value in words_dict.items():
            df.loc[df["word"] == word, "crime"] = value
            
    return df
df = update_category_values(df)