In [1]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import TfidfModel, Phrases
import nltk
from nltk.corpus import reuters
import spacy

In [2]:
# Load the small English spaCy pipeline used for lemmatisation and
# stop-word / punctuation detection.
nlp = spacy.load("en_core_web_sm")

# Sentences (as token lists) from the Reuters files in the "coffee" category.
corpus = reuters.sents(categories=["coffee"])

# Re-join each token list into one space-separated string so spaCy can
# re-tokenise it. Note that every "document" from here on is a single sentence.
example_text = [" ".join(sent) for sent in corpus]

# Preprocessing: keep the lemma of every token that is neither a stop word nor
# punctuation. `nlp.pipe` streams the sentences through the pipeline in batches
# and yields the Docs in input order, which avoids the per-call overhead of
# invoking `nlp(...)` once per sentence.
texts = [
    [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
    for doc in nlp.pipe(example_text)
]

# Build the token vocabulary from the preprocessed sentences, then encode
# every sentence as a bag-of-words vector against that vocabulary.
dictionary = corpora.Dictionary(texts)
corpus_bow = [dictionary.doc2bow(tokens) for tokens in texts]

# Fit TF-IDF weights on the bag-of-words corpus and apply them to it.
# NOTE(review): neither `tfidf_model` nor `corpus_tfidf` is read by the cells
# visible in this excerpt -- confirm a later cell uses them.
tfidf_model = TfidfModel(corpus_bow)
corpus_tfidf = tfidf_model[corpus_bow]

# First pass: learn two-token collocations (library-default settings) and
# merge them in every sentence; merged tokens are joined with "_".
bigram = Phrases(texts)
texts_with_bigrams = [bigram[tokens] for tokens in texts]

# Second pass over the bigram-merged sentences, so an already merged pair can
# be joined with a neighbouring token to form a longer phrase.
trigram = Phrases(texts_with_bigrams)
texts_with_trigrams = [trigram[tokens] for tokens in texts_with_bigrams]

# List every merged two-token phrase, one per line, in corpus order
# (repeats are printed each time they occur).
print("Bigrams:")
for sentence_tokens in texts_with_bigrams:
    for token in sentence_tokens:
        # A bigram is recognised by containing exactly one "_" delimiter.
        # NOTE(review): a raw token that already contains "_" would also match.
        if token.count("_") != 1:
            continue
        print(token)

Bigrams:
calendar_1987
tell_Reuters
palm_oil
palm_oil
palm_oil
Trade_Minister
Indonesia_s
Trade_Minister
world_s
latin_american
mln_dlrs
1_5
billion_dlrs
cent_lb
Jorge_Cardenas
National_Coffee
Growers_Federation
mln_60
kg_bag
New_York
cent_lb
export_registration
marketing_policy
Gilberto_Arango
private_exporter
tell_Reuters
Colombia_open
private_exporter
000_bag
000_bag
NIL_SAO
NIL_SAO
000_bag
000_bag
mln_dlrs
5_mln
1_5
billion_dlrs
mln_dlrs
5_mln
1_5
billion_dlrs
Uganda_s
trade_source
Uganda_s
mln_dlrs
source_say
mln_dlrs
60_kg
Uganda_s
export_earning
mln_bag
end_September
25_pct
U_s
u_s
green_coffee
60_kilo
000_bag
000_bag
end_March
calendar_1987
000_bag
000_bag
Colombia_open
private_exporter
National_Coffee
Growers_Federation
000_bag
International_Coffee
Organisation_ICO
central_american
produce_country
export_quota
Costa_Rica
source_say
International_Coffee
Organization_ICO
Coffee_Institute
mln_60
kilo_bag
export_quota
new_formula
International_Coffee
Organization_ICO
tell_Reuters


In [3]:
# List every merged three-token phrase, one per line, in corpus order
# (repeats are printed each time they occur).
print("\nTrigrams:")
for sentence_tokens in texts_with_trigrams:
    for token in sentence_tokens:
        # A trigram is recognised by containing exactly two "_" delimiters.
        # NOTE(review): raw tokens that already contain "_" can skew this count.
        if token.count("_") != 2:
            continue
        print(token)


Trigrams:
Gilberto_Arango_president
trade_source_say
International_Coffee_Agreement
Brazilian_Coffee_Institute
International_Coffee_Organization
trade_source_say
Brazilian_Coffee_Institute
trade_source_say
trade_source_say
trade_source_say
International_Coffee_Agreement
trade_source_say
trade_source_say
trade_source_say
trade_source_say
trade_source_say
International_Coffee_Agreement
suspend_year_ago
trade_source_say
U_s_official
trade_source_say
Brazilian_Coffee_Institute
International_Coffee_Organisation
International_Coffee_Agreement
U_s_official
Kenya_s_economy
Kenya_s_economy
Kenya_s_economy
Kenya_s_economy
Kenya_s_economy
Kenya_s_economy
International_Coffee_Agreement
Gilberto_Arango_president
suspend_year_ago
trade_source_say
International_Coffee_Organization
International_Coffee_Organization
International_Coffee_Organization
International_Coffee_Organization
trade_source_say
International_Coffee_Organization
trade_source_say
Brazilian_Coffee_Institute
International_Coffee_Orga

In [4]:
# Side-by-side preview of the three token streams, one row per sentence.
# Each cell is a comma-separated string rather than a Python list.
# NOTE(review): "bigrams" and "trigrams" hold only the first 3 / first 2 tokens
# of the phrase-merged sentence -- not exclusively the detected phrases.
df = pd.DataFrame(
    {
        "text": [", ".join(tokens) for tokens in texts],
        "bigrams": [", ".join(tokens[:3]) for tokens in texts_with_bigrams],
        "trigrams": [", ".join(tokens[:2]) for tokens in texts_with_trigrams],
    }
)
df.head(10)

Unnamed: 0,text,bigrams,trigrams
0,"indonesian, COMMODITY, EXCHANGE, expand, Indon...","indonesian, COMMODITY, EXCHANGE","indonesian, COMMODITY"
1,"tell, Reuters, telephone, interview, trading, ...","tell_Reuters, telephone, interview","tell_Reuters, telephone"
2,"trade, crude, palm, oil, CPO, refined, palm, o...","trade, crude, palm_oil","trade, crude"
3,"say, question, consider, Trade, Minister, Rach...","say, question, consider","say, question"
4,"fledgling, exchange, currently, trade, coffee,...","fledgling, exchange, currently","fledgling, exchange"
5,"factor, cautiously, Nainggolan, say","factor, cautiously, Nainggolan","factor, cautiously"
6,"want, slowly, safely, mistake, undermine, conf...","want, slowly, safely","want, slowly"
7,"physical, rubber, trading, launch, 1985, coffe...","physical, rubber, trading","physical, rubber"
8,"rubber, contract, trade, fob, month, forward","rubber, contract, trade","rubber, contract"
9,"robusta, coffee, grade, trade, prompt, deliver...","robusta, coffee, grade","robusta, coffee"


In [5]:
# One-column table listing every category label in the Reuters corpus.
categories = pd.DataFrame({"category": reuters.categories()})
categories

Unnamed: 0,category
0,acq
1,alum
2,barley
3,bop
4,carcass
...,...
85,veg-oil
86,wheat
87,wpi
88,yen
