# Natural Processing language(NLP)

In [1]:
import pandas as pd
import string
import re
import spacy
from unidecode import unidecode

from sklearn.feature_extraction.text import TfidfVectorizer


### Função para pré-processamento:
- Transformam todos os caracteres em minusculas;
- Remove as acentuações
- Remove as pontuações
- Remove números

In [2]:
def cleaner(texto):
        texto = unidecode(texto)
        texto = texto.lower()
        texto = texto.translate(str.maketrans('','', string.punctuation))
        texto = re.sub(r'[0-9]+', '', texto)
        return texto

In [3]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


### Preparando a tabela para o treinamento:
- Junção das tabelas 'headline' e 'short_cription'
- Remoçao das tabelas 'link', 'authors', 'date', 'headline', 'short_description'
- Com a função cleaner limpamos a tabela

In [4]:
df.insert(loc=0, column= 'description', value= df['headline'] + df['short_description'])
df = df.drop(columns=['link', 'authors', 'date', 'headline', 'short_description'])
df['description'] = df['description'].apply(cleaner)
df.head()

Unnamed: 0,description,category
0,over million americans roll up sleeves for om...,U.S. NEWS
1,american airlines flyer charged banned for lif...,U.S. NEWS
2,of the funniest tweets about cats and dogs th...,COMEDY
3,the funniest tweets from parents this week sep...,PARENTING
4,woman who called cops on black birdwatcher los...,U.S. NEWS


### Remoção de Stop Words

In [5]:
nlp = spacy.load("en_core_web_sm")
words = nlp.Defaults.stop_words

def stopWords(text):
    text = [word for word in text.split() if word not in list(words)]
    return ' '.join(text)
print(words)
df['description'] = df['description'].apply(stopWords)
df.head()  

{'least', 'somewhere', 'though', 'onto', 'whence', 'because', 'so', 'below', '‘d', 'has', 'anyway', 'both', 'neither', 'through', 'your', 'where', 'meanwhile', 'perhaps', '’ll', 'off', 'himself', 'i', 'whereas', 'was', 'last', 'can', 'anyone', 'sometimes', 'between', 'have', 'we', 'yourselves', 'these', 'beyond', 'such', 'regarding', 'amount', 'part', 'nevertheless', 'toward', 'upon', 'what', 'would', 'also', 'is', 'serious', 'among', 'every', 'yet', 'n’t', 'he', 'even', 'thereupon', 'doing', 'those', 'seems', 'although', '‘re', 'further', 'they', 'using', 'were', 'itself', '’m', 'elsewhere', 'please', 'ours', 'my', 'anyhow', 'via', "'s", 'thus', 'therein', 'whatever', 'there', 'sixty', 'each', 'sometime', 'whose', 'for', 'always', 'go', 'of', 'namely', '‘ve', 'not', 'an', 'no', 'about', 'done', 'say', 'unless', 'move', 'formerly', 'thru', 'third', 'on', 'without', 'most', 'against', 'quite', 'whither', 'whole', 'else', 'under', 'everyone', "'ve", 'thence', 'used', 'hence', 'when', 'di

Unnamed: 0,description,category
0,million americans roll sleeves omicrontargeted...,U.S. NEWS
1,american airlines flyer charged banned life pu...,U.S. NEWS
2,funniest tweets cats dogs week sept dog dont u...,COMEDY
3,funniest tweets parents week sept accidentally...,PARENTING
4,woman called cops black birdwatcher loses laws...,U.S. NEWS


### Algoritmo de treinamento

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes

In [7]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['description'], df['category'], test_size=0.3)

Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

### Contagem das palavras

In [8]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['description'])

train_x_Tdfidf = Tfidf_vect.transform(train_x)
test_x_Tdfidf = Tfidf_vect.transform(test_x)
print(Tfidf_vect.vocabulary_)



### Naive Bayes

In [9]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_x_Tdfidf, train_y)

predictions_NB = Naive.predict(test_x_Tdfidf)
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, test_y)*100)

Naive Bayes Accuracy Score ->  50.08988370798135


### Support Vector Machine (SVM)

In [16]:
SVM = svm.SVC(kernel='linear')
SVM.fit(train_x_Tdfidf, train_y)

predictions_SVM = SVM.predict(test_x_Tdfidf)
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, test_y)*100)

Não consegui a precisão do SVM, por ele ser um algoritmo que demanda muito tempo para ser treinado, mas posso dizer que sua precisão é maior do que o NB.