In [6]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import TfidfModel, Phrases
import nltk
from nltk.corpus import reuters
import spacy

# Make sure the Reuters corpus is available locally before it is read below.
# When the package is already installed NLTK just reports it as up-to-date.
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [7]:
# Build the lemmatized token lists for the Reuters "cpu" category, then derive
# a bag-of-words / TF-IDF representation and phrase-merged token lists from them.
nlp = spacy.load("en_core_web_sm")
corpus = reuters.sents(categories=["cpu"])

# Each Reuters sentence arrives as a list of tokens; re-join it into a single
# string so spaCy can run its own tokenization and lemmatization on it.
example_text = [" ".join(sent) for sent in corpus]

# nlp.pipe streams the sentences through the pipeline in batches, which avoids
# the per-call overhead of invoking nlp() once per sentence.
# Keep the lemma of every token that is neither a stop word nor punctuation.
texts = [
    [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    for doc in nlp.pipe(example_text)
]

# Map each lemma to an integer id and encode every sentence as bag-of-words.
dictionary = corpora.Dictionary(texts)
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Create a TF-IDF model from the bag-of-words corpus and apply it to that
# same corpus.
tfidf_model = TfidfModel(corpus_bow)
corpus_tfidf = tfidf_model[corpus_bow]

# min_count=1 and threshold=1 are very permissive settings, so most adjacent
# pairs that occur in the corpus get merged into a single "a_b" token.
bigram = Phrases(texts, min_count=1, threshold=1)
texts_with_bigrams = [bigram[line] for line in texts]

# A second Phrases pass runs over the already-merged tokens, so it can join
# two bigrams as well as a bigram and a unigram: the "trigram" lists may
# therefore contain phrases of up to four original tokens.
trigram = Phrases(texts_with_bigrams, min_count=1, threshold=1)
texts_with_trigrams = [trigram[line] for line in texts_with_bigrams]

# Preview the first seven sentences of each phrase-merged corpus.
print("Bigrams:")
for text in texts_with_bigrams[:7]:
    print(text)

print("\nTrigrams:")
for text in texts_with_trigrams[:7]:
    print(text)

Bigrams:
['u_s', 'INDUSTRIAL_CAPACITY', 'USE_RATE', '81_2', 'PCT_SEPTEMBER', 'UNCHANGED_AUGUST']
['u_s', 'INDUSTRIAL_CAPACITY', 'USE_RATE', '81_2', 'PCT_SEPTEMBER', 'UNCHANGED_AUGUST']
['CANADA', 'MANUFACTURING', 'UTILIZATION', 'rate', 'rise', 'utilization', 'canadian', 'manufacturing', 'capacity', 'rise', '77', '2_pct', 'fourth', 'quarter', '1986', '77', 'pct', 'quarter', 'Statistics', 'Canada', 'say']
['change', 'small', 'mark', 'quarter', 'quarter', '1985', 'utilization', 'rate', 'manufacturing', 'rise', 'federal', 'agency', 'say']
['increase', 'residential', 'construction', 'lead', 'strong', 'increase', 'building', 'material', 'sector', 'lead', '3', '3_pct', 'increase', 'non', 'metallic', 'mineral', 'industry']
['u_s', 'INDUSTRIAL_CAPACITY', 'USE_RATE', 'ROSE_79', '8_PCT', 'FEB_79', '6_PCT', 'JAN']
['u_s', 'INDUSTRIAL_CAPACITY', 'USE_RATE', 'ROSE_79', '8_PCT', 'FEB_79', '6_PCT', 'JAN']

Trigrams:
['u_s_INDUSTRIAL_CAPACITY', 'USE_RATE_81_2', 'PCT_SEPTEMBER_UNCHANGED_AUGUST']
['u_s_I

In [8]:
# Summarise every sentence in one table: its lemmas, its three highest-weighted
# TF-IDF terms, and the two phrase-merged versions of its token list.

# For each sentence, order the (term id, weight) pairs from heaviest to
# lightest, keep the first three, and look the ids up in the dictionary to
# recover the words.
top_terms = [
    ", ".join(
        dictionary[term_id]
        for term_id, _weight in sorted(weights, key=lambda pair: pair[1], reverse=True)[:3]
    )
    for weights in corpus_tfidf
]

# Token lists are flattened to comma-separated strings so the table cells are
# displayed without list brackets.
df = pd.DataFrame(
    {
        "text": [", ".join(tokens) for tokens in texts],
        "Tfidf": top_terms,
        "bigrams": [", ".join(tokens) for tokens in texts_with_bigrams],
        "trigrams": [", ".join(tokens) for tokens in texts_with_trigrams],
    }
)
df.head(10)

Unnamed: 0,text,Tfidf,bigrams,trigrams
0,"u, s, INDUSTRIAL, CAPACITY, USE, RATE, 81, 2, ...","AUGUST, SEPTEMBER, UNCHANGED","u_s, INDUSTRIAL_CAPACITY, USE_RATE, 81_2, PCT_...","u_s_INDUSTRIAL_CAPACITY, USE_RATE_81_2, PCT_SE..."
1,"u, s, INDUSTRIAL, CAPACITY, USE, RATE, 81, 2, ...","AUGUST, SEPTEMBER, UNCHANGED","u_s, INDUSTRIAL_CAPACITY, USE_RATE, 81_2, PCT_...","u_s_INDUSTRIAL_CAPACITY, USE_RATE_81_2, PCT_SE..."
2,"CANADA, MANUFACTURING, UTILIZATION, rate, rise...","77, quarter, rise","CANADA, MANUFACTURING, UTILIZATION, rate, rise...","CANADA, MANUFACTURING, UTILIZATION, rate, rise..."
3,"change, small, mark, quarter, quarter, 1985, u...","quarter, 1985, agency","change, small, mark, quarter, quarter, 1985, u...","change, small, mark, quarter, quarter, 1985, u..."
4,"increase, residential, construction, lead, str...","lead, increase, 3","increase, residential, construction, lead, str...","increase, residential, construction, lead, str..."
5,"u, s, INDUSTRIAL, CAPACITY, USE, RATE, ROSE, 7...","PCT, FEB, JAN","u_s, INDUSTRIAL_CAPACITY, USE_RATE, ROSE_79, 8...","u_s_INDUSTRIAL_CAPACITY, USE_RATE_ROSE_79, 8_P..."
6,"u, s, INDUSTRIAL, CAPACITY, USE, RATE, ROSE, 7...","PCT, FEB, JAN","u_s, INDUSTRIAL_CAPACITY, USE_RATE, ROSE_79, 8...","u_s_INDUSTRIAL_CAPACITY, USE_RATE_ROSE_79, 8_P..."
7,"u, s, CAPACITY, USE, RATE, 79, 8, PCT, FEBRUAR...","79, Board, FEBRUARY","u_s, CAPACITY_USE, RATE, 79_8, PCT, FEBRUARY, ...","u_s, CAPACITY_USE, RATE, 79_8, PCT, FEBRUARY, ..."
8,"Fed, previously, say, rate, 79, 7, pct, Januar...","previously, 5, 7","Fed, previously, say, rate_79, 7_pct, January,...","Fed, previously, say, rate_79, 7_pct_January, ..."
9,"surge, automobile, assembly, February, gain, p...","assembly, automobile, gain","surge, automobile, assembly, February, gain, p...","surge, automobile, assembly, February, gain, p..."


In [9]:
# List every Reuters category label in a one-column table for browsing.
categories = pd.DataFrame({"category": reuters.categories()})
categories

Unnamed: 0,category
0,acq
1,alum
2,barley
3,bop
4,carcass
...,...
85,veg-oil
86,wheat
87,wpi
88,yen
