In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Sample article #1, kept as a multi-paragraph string literal.
# NOTE(review): `text` is reassigned by the next cell, so when the cells run top to
# bottom this value is replaced before anything reads it.
text= """US President Joe Biden says his administration is looking into what happened at a food distribution site in Gaza — where local health officials say more than 100 people were killed and hundreds more injured — and he admitted the incident is going to complicate negotiations in the region.

“We’re checking that out right now; there are two competing versions of what happened. I don’t have an answer yet,” the president told CNN’s Arlette Saenz at the White House on Thursday.

Asked by Saenz if he worried the deaths would complicate negotiations, he responded: “Oh, I know it will.”

But Biden still expressed optimism that a deal on the hostages and a potential ceasefire could be reached soon. """

In [None]:
# Sample article #2: this is the value of `text` that the tokenization cells below operate on
# (it replaces whatever `text` held before this cell ran).
text="""President-elect Joe Biden and his transition team are preparing for an early, all-out push to pass an ambitious new stimulus bill, while also drawing up plans for a flurry of executive actions aimed at delivering on campaign promises and undoing the Trump administration's efforts to undermine key government agencies.
Biden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises. At the same time, his team will take over the work of spearheading one of the most complicated, politically fraught mass vaccination campaigns in American history.
Biden's agenda for his first 100 days in office will, according to both those close to him and outside groups in contact with his top aides, center on two key avenues of action: the passage of a broad economic aid package and, where legislation is not necessary, a series of executive actions aimed at advancing his priorities. Containing the Covid-19 pandemic, launching an economic recovery and tackling racial inequality are his most urgent priorities, transition officials say."""

In [None]:
# this performs word tokenization -> this is used in simple models where each word is a feature
from nltk.tokenize import word_tokenize
# The tokenizer model ships under two resource ids: older NLTK releases load 'punkt',
# recent ones load 'punkt_tab'. Fetching both keeps this cell working on either;
# nltk.download is a no-op for packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split the article into word/punctuation tokens and peek at the last 30.
tokens = word_tokenize(text)
tokens[-30:]

In [None]:
# Punctuation is rarely a useful predictive feature, so after tokenizing we keep only
# tokens made up entirely of letters/digits. Any token containing another character
# (a hyphen, an apostrophe, ...) is dropped as well.
tokens = list(filter(str.isalnum, tokens))
tokens[:15]

In [None]:
# Alternative
# this performs sentence tokenizations -> can be used if you want to treat each sentence as a "feature"
from nltk.tokenize import sent_tokenize
# Displays the list of sentences; the result is not stored, so later cells are unaffected.
sent_tokenize(text)

In [None]:
# Part of speech can be a useful feature in itself, but is also heavily used in making lemmatization and stemming more effective
# The English tagger model is published under two resource ids: older NLTK releases load
# 'averaged_perceptron_tagger', recent ones load 'averaged_perceptron_tagger_eng'.
# Fetching both keeps pos_tag working on either.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# Tag the whole token list in one call and show the first 15 (token, tag) pairs.
nltk.pos_tag(tokens,lang='eng')[:15]
#explanation of all these codes can be found here: https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b

In [None]:
# Stemming is a rule-based cleaning step: every token is cut down to a stem by the
# Porter stemmer, so different inflections of a word collapse onto one feature.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = list(map(ps.stem, tokens))
stemmed[:15]

In [None]:
# Lemmatization is the dictionary-driven counterpart of stemming: each word is mapped to
# its actual root form. The catch is that such a dictionary may not exist for every
# language, and it has no answer for words it has never seen.
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # WordNet: the best-known English lexical database for lemmatizing
nltk.download('omw-1.4')

# Lemmatize every token without a part-of-speech hint and preview the first 15.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokens))
lemmatized[:15]

In [None]:
# Without extra context the lemmatizer can be weak: the same word is lemmatized
# differently depending on the part of speech it is told to assume. Each entry below is
# the argument tuple for one lemmatize() call; the results are displayed in order.
for _lemma_args in (
    ("was",),
    ("was", wordnet.VERB),
    ("better",),
    ("better", wordnet.ADJ),
    ("canning",),
    ("canning", wordnet.NOUN),
    ("canning", wordnet.VERB),
):
    display(lemmatizer.lemmatize(*_lemma_args))

In [None]:
# let's apply this to the whole news text
# Older NLTK releases load 'averaged_perceptron_tagger', recent ones load
# 'averaged_perceptron_tagger_eng'; fetch both so pos_tag works on either.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# unfortunately pos_tag and lemmatize use different codes for parts of speech
from functools import lru_cache

@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own (a one-element list is passed to nltk.pos_tag), so
    the tagger sees no sentence context. Only the first letter of the returned tag is
    used: J -> adjective, N -> noun, V -> verb, R -> adverb; any other tag falls back
    to noun.

    Results are memoised per distinct word: the tagger call depends only on `word`,
    so caching returns exactly the same values while avoiding one tagger invocation
    per token occurrence when this is applied to a whole corpus.

    Parameters
    ----------
    word : str
        A single token (must be hashable, which str is).

    Returns
    -------
    One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB, wordnet.ADV.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

# Lemmatize each token again, this time passing the part of speech looked up for it.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lambda tok: lemmatizer.lemmatize(tok, get_wordnet_pos(tok)), tokens))
lemmatized[:15]


In [None]:
#removal of stopwords allows us to reduce the noise in the data to focus on the signal
from nltk.corpus import stopwords
nltk.download('stopwords')

# Build the lookup once, as a set:
#  * stopwords.words() with no argument returns the lists of *every* language in the
#    corpus, which also strips ordinary English words that are stop words elsewhere;
#    the article is English, so only the English list is used.
#  * calling stopwords.words() inside the comprehension re-evaluated it for every token.
english_stopwords = set(stopwords.words('english'))

# The stop-word list is lower-case while the tokens keep their original casing, so
# compare on the lower-cased token (otherwise e.g. a sentence-initial "At" survives).
without_sw = [word for word in lemmatized if word.lower() not in english_stopwords]
without_sw[:15]

In [None]:
" ".join(without_sw)

In [None]:
# Naive sentence split on '.': shows the piece of `text` between the first and second full stop.
text.split('.')[1]

In [None]:
# Display the full list of tokens left after stop-word removal.
without_sw

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings.
bow_vect = CountVectorizer()
# fit creates one entry for each different word seen
# The cleaned tokens are joined back into one string, so the vectorizer is fitted on a
# corpus consisting of a single document.
bow_vect.fit([" ".join(without_sw)])

In [None]:
# Distinct tokens that were fed to the vectorizer (duplicates removed by set()).
set(without_sw)

In [None]:
# Vectorize a hand-written, already-cleaned sentence with the fitted vectorizer and show the dense count row.
bow_vect.transform(['Joe Biden transition team prepare early push ambitious stimulus bill draw plan flurry executive action aim deliver campaign promise undo Trump administration effort undermine key government agency']).toarray()

In [None]:
# Same word twice in the input string: lets the reader see how a repeated word shows up in the count row.
bow_vect.transform(['economic economic']).toarray()

In [None]:
# Mix of repeated and differently-spelled words; 'ambitoud' and 'rabbit' are presumably
# there as examples of words that were not part of the fitted text (verify against the output).
bow_vect.transform(['Joe work ambitious ambitious ambitoud economic rabbit']).toarray()

In [None]:
# Probe with a purely numeric token.
bow_vect.transform(['100']).toarray()

In [None]:
# Probe with a single word that does not occur in the article the vectorizer was fitted on.
bow_vect.transform(['goncalo']).toarray()

In [None]:
#transform only considers the words that have been seen in fit
# Input mixes words from the fitted article with 'goncalo', which is not in it.
bow_vect.transform(['accord stimulus bill bill goncalo']).toarray()

# News clustering example

In [None]:
# corpus of 120k news headlines, here shortened to 10k
import pandas as pd

# The CSV is fetched over HTTP every time this cell runs, so the cell needs network access.
url = "https://raw.githubusercontent.com/GoncaloJardim/ironhack-v4-data-lessons/main/data/news.csv"

# Load into a DataFrame and preview the first rows.
all_news = pd.read_csv(url)
all_news.head()

In [None]:
# (rows, columns) of the loaded corpus.
all_news.shape

In [None]:
# Raw text of the fourth row (position 3) of the 'news' column.
all_news.iloc[3]['news']

In [None]:
# Same pipeline as for the single article, now written as row-wise helpers:
# tokenize, lowercase, remove punctuation

def tokenizer_and_remove_punctuation(row):
  """Tokenize row['news'] and return its purely alphabetic tokens, lower-cased.

  Tokens containing any non-letter character (punctuation, digits, ...) are dropped.
  """
  kept = []
  for token in word_tokenize(row['news']):
    if token.isalpha():
      kept.append(token.lower())
  return kept

# Run the tokenizer on every row (axis=1 passes one row at a time) and store the token
# lists in a new 'tokenized' column; all_news is modified in place.
all_news['tokenized'] = all_news.apply(tokenizer_and_remove_punctuation,axis=1)
all_news.head()

In [None]:
# lemmatize with part of speech helpers

# Fresh lemmatizer instance bound to the module-level name `lemmatizer`
# (rebinds the name if an earlier cell already created one).
lemmatizer = WordNetLemmatizer()

def lemmatizer_with_pos(row):
  """Lemmatize every token in row['tokenized'], using the part of speech that
  get_wordnet_pos returns for that token. Returns the lemmas as a list, in order."""
  lemmas = []
  for token in row['tokenized']:
    pos = get_wordnet_pos(token)
    lemmas.append(lemmatizer.lemmatize(token, pos))
  return lemmas

# Lemmatize every row's token list and store the result in a new 'lemmatized' column.
all_news['lemmatized'] = all_news.apply(lemmatizer_with_pos,axis=1)
all_news.head()

In [None]:
# remove stopwords

# language -> frozenset of stop words, filled on first use so the corpus files are read
# once instead of once per row.
_STOPWORD_SETS = {}

def remove_sw(row, stop_words=None, language='english'):
  """Return the distinct tokens of row['lemmatized'] that are not stop words.

  Each remaining token appears once, in order of first occurrence (the previous
  implementation also de-duplicated, via set(), but in arbitrary order). Because of the
  de-duplication, any count-based vectorization downstream sees every word at most once
  per row.

  Parameters
  ----------
  row : mapping with a 'lemmatized' entry holding an iterable of tokens.
  stop_words : optional collection of words to drop. When omitted, the NLTK stop-word
      list for `language` is used.
  language : NLTK stop-word list to load when `stop_words` is omitted. Defaults to
      'english'; previously stopwords.words() was called without a language, which
      returns the lists of every language and therefore also removed ordinary English
      words that happen to be stop words elsewhere.

  NOTE(review): switching to the English list changes which words survive, so the
  vocabulary and the cluster assignments computed later can differ from earlier runs.
  """
  if stop_words is None:
    if language not in _STOPWORD_SETS:
      _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    stop_words = _STOPWORD_SETS[language]
  # dict.fromkeys de-duplicates while keeping first-seen order.
  return [word for word in dict.fromkeys(row['lemmatized']) if word not in stop_words]

# Apply the stop-word filter row by row and store the result in a new 'no_stopwords' column.
all_news['no_stopwords'] = all_news.apply(remove_sw,axis=1)
all_news.head()

In [None]:
# Final cleaning step: turn the cleaned token list back into plain text.

def re_blob(row):
  """Return the tokens in row['no_stopwords'] joined by single spaces."""
  cleaned_tokens = row['no_stopwords']
  return str.join(" ", cleaned_tokens)

# One cleaned, space-separated string per headline, stored in a new 'clean_blob' column.
all_news['clean_blob'] = all_news.apply(re_blob,axis=1)
all_news.head()

In [None]:
#let's take only the most common 1000 words
# NOTE: this rebinds `bow_vect`; a vectorizer created under that name by an earlier cell is replaced.
bow_vect = CountVectorizer(max_features=1000)
# fit creates one entry for each different word seen
# fit_transform learns the vocabulary and vectorizes in one step; .toarray() turns the result into a dense array.
X = bow_vect.fit_transform(all_news['clean_blob']).toarray()

In [None]:
# Cleaned text of the first row, for a quick sanity check of the pipeline.
all_news['clean_blob'].iloc[0]

In [None]:
# Wrap the count matrix in a DataFrame whose column names are the vectorizer's feature names.
as_df = pd.DataFrame(X,columns=bow_vect.get_feature_names_out())
as_df.head()

In [None]:
from sklearn.cluster import KMeans
# Cluster the bag-of-words rows into 6 groups.
# n_init is pinned explicitly: its default differs between scikit-learn releases
# (10 historically, 'auto' in newer ones), so leaving it implicit makes the clusters
# depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=100)
# fit_predict fits the model and returns the label of every training row in one call.
pred = kmeans.fit_predict(X)

In [None]:
# Pair every headline with its cluster label.
# The label array is assigned positionally instead of via pd.concat(axis=1), which aligns
# on index and would silently produce NaNs / misaligned rows if all_news ever carried a
# non-default index (e.g. after filtering rows). With the default index the result is the
# same two columns, 'news' and 'class'.
predict_df = all_news[['news']].copy()
predict_df['class'] = pred
predict_df.head()

In [None]:
# Show full cell contents instead of truncating long strings; this is a global pandas
# option and stays in effect for every later cell.
pd.set_option('display.max_colwidth', None)

In [None]:
#sports
# NOTE(review): the topic name is the author's reading of one run; which id a topic gets
# depends on the fitted KMeans model, so re-check after refitting.
predict_df[predict_df['class']==0]

In [None]:
#financial
# NOTE(review): the topic name is the author's reading of one run; which id a topic gets
# depends on the fitted KMeans model, so re-check after refitting.
predict_df[predict_df['class']==1]

In [None]:
#political news
# NOTE(review): the topic name is the author's reading of one run; which id a topic gets
# depends on the fitted KMeans model, so re-check after refitting.
predict_df[predict_df['class']==3]

In [None]:
#global sports
# NOTE(review): the topic name is the author's reading of one run; which id a topic gets
# depends on the fitted KMeans model, so re-check after refitting.
predict_df[predict_df['class']==5]