In [79]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd

In [14]:
df = pd.read_csv('es_news.csv', encoding='UTF-8')
df.head()

Unnamed: 0,url,news,Type
0,https://www.larepublica.co/redirect/post/3201905,Durante el foro La banca articulador empresari...,Otra
1,https://www.larepublica.co/redirect/post/3210288,El regulador de valores de China dijo el domin...,Regulaciones
2,https://www.larepublica.co/redirect/post/3240676,En una industria históricamente masculina como...,Alianzas
3,https://www.larepublica.co/redirect/post/3342889,Con el dato de marzo el IPC interanual encaden...,Macroeconomia
4,https://www.larepublica.co/redirect/post/3427208,Ayer en Cartagena se dio inicio a la versión n...,Otra


In [15]:
X = df['news']
y = df['Type']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)

In [17]:
vectorizer = CountVectorizer()

X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [18]:
model = MultinomialNB()
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
print(metrics.accuracy_score(y_test, y_pred))

0.7704918032786885


# Stemming

In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [20]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darío\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darío\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
stemmer = SnowballStemmer('spanish')

In [22]:
def tokenize_and_stem(text):
    tokens = word_tokenize(text.lower())
    stems = [stemmer.stem(token) for token in tokens if token.isalpha()]
    return ' '.join(stems)

In [23]:
df['news_stemmer'] = df['news'].apply(tokenize_and_stem)

In [24]:
df['news_stemmer']

0       durant el for la banc articul empresarial par ...
1       el regul de valor de chin dij el doming que bu...
2       en una industri histor masculin com lo es la a...
3       con el dat de marz el ipc interanual encaden s...
4       ayer en cartagen se dio inici a la version num...
                              ...                        
1212    en la vid de tod empres emergent lleg un momen...
1213    la espiral alcist de los preci continu y gener...
1214    las grand derrot nacional son experient trauma...
1215    bbva ha alcanz un acuerd de colabor con barcel...
1216    casi entrand a la part final de noviembr la ep...
Name: news_stemmer, Length: 1217, dtype: object

In [25]:
X_stem = df['news_stemmer']
X_train_stem, X_test_stem, y_train, y_test = train_test_split(X_stem, y, test_size=0.2, random_state=16)

In [26]:
X_train_stem_transformed = vectorizer.fit_transform(X_train_stem)
X_test_stem_transformed = vectorizer.transform(X_test_stem)

In [27]:
model_stem = MultinomialNB()
model_stem.fit(X_train_stem_transformed, y_train)
y_pred_stem = model_stem.predict(X_test_stem_transformed)
print(metrics.accuracy_score(y_test, y_pred_stem))
print(X_train_stem_transformed.shape)

0.7704918032786885
(973, 11924)


# Lemmatization

In [28]:
import spacy

In [29]:
nlp = spacy.load('es_core_news_sm')

In [30]:
def lemmatize_text(text):
    doc = nlp(text.lower())
    lemmas = [token.lemma_ for token in doc if token.is_alpha]
    return ' '.join(lemmas)

In [35]:
df['news_lemma'] = df['news'].apply(lemmatize_text)

In [36]:
df['news_lemma']

0       durante el foro el banca articulador empresari...
1       el regulador de valor de china decir el doming...
2       en uno industria históricamente masculino como...
3       con el dato de marzo el ipc interanual encaden...
4       ayer en cartagena él dar inicio a el versión n...
                              ...                        
1212    en el vida de todo empresa emergente llegar un...
1213    el espiral alcista de el precio continuar y ge...
1214    el grande derrota nacional ser experiencia tra...
1215    bbva haber alcanzar uno acuerdo de colaboració...
1216    casi entrar a el parte final de noviembre el é...
Name: news_lemma, Length: 1217, dtype: object

In [37]:
X_lemma = df['news_lemma']
X_train_lemma, X_test_lemma, y_train, y_test = train_test_split(X_lemma, y, test_size=0.2, random_state=16)

X_train_lemma_transformed = vectorizer.fit_transform(X_train_lemma)
X_test_lemma_transformed = vectorizer.transform(X_test_lemma)

model_lemma = MultinomialNB()
model_lemma.fit(X_train_lemma_transformed, y_train)
y_pred_lemma = model_lemma.predict(X_test_lemma_transformed)
print(metrics.accuracy_score(y_test, y_pred_lemma))
print(X_train_lemma_transformed.shape)

0.7663934426229508
(973, 16574)


# Removing stopwords

In [63]:
def stopwords_removal(text):
    doc = text.lower().split(' ')
    es_words = stopwords.words('spanish')
    words = [ token for token in doc if token not in es_words]
    return ' '.join(words)

In [64]:
df['news_clean'] = df['news'].apply(stopwords_removal)

### Lemmatization

In [74]:
df['news_lemma_clean'] = df['news_clean'].apply(lemmatize_text)

In [76]:
X_lemma_clean = df['news_lemma_clean']
X_train_lemma_clean, X_test_lemma_clean, y_train, y_test = train_test_split(X_lemma_clean, y, test_size=0.2, random_state=16)

X_train_lemma_clean_transformed = vectorizer.fit_transform(X_train_lemma_clean)
X_test_lemma_clean_transformed = vectorizer.transform(X_test_lemma_clean)

model_lemma_clean = MultinomialNB()
model_lemma_clean.fit(X_train_lemma_clean_transformed, y_train)

y_pred_lemma_clean = model_lemma_clean.predict(X_test_lemma_clean_transformed)
print(metrics.accuracy_score(y_test, y_pred_lemma_clean))
print(X_train_lemma_clean_transformed.shape)

0.7827868852459017
(973, 17935)


### Stemming

In [77]:
df['news_stemmer_clean'] = df['news_clean'].apply(tokenize_and_stem)

In [78]:
X_stem_clean = df['news_stemmer_clean']
X_train_stem_clean, X_test_stem_clean, y_train, y_test = train_test_split(X_stem_clean, y, test_size=0.2, random_state=16)

X_train_stem_clean_transformed = vectorizer.fit_transform(X_train_stem_clean)
X_test_stem_clean_transformed = vectorizer.transform(X_test_stem_clean)

model_stem_clean = MultinomialNB()
model_stem_clean.fit(X_train_stem_clean_transformed, y_train)

y_pred_stem_clean = model_stem_clean.predict(X_test_stem_clean_transformed)
print(metrics.accuracy_score(y_test, y_pred_stem_clean))
print(X_train_stem_clean_transformed.shape)


0.7991803278688525
(973, 11900)
