# Article analysis with tf-idf
<p>Using term frequency–inverse document frequency (tf-idf) to analyze article content.</p>

In [2]:
import pandas as pd
import numpy as np

import sys
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [3]:
# Put the parent directory ('..') on the module search path, presumably so the
# project-local `preprocessing` module imported below can be found.
# NOTE(review): '..' is relative to the current working directory — confirm
# the notebook is started from its own folder.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'nltk'", so `preprocessing` apparently
# needs nltk installed in the kernel environment — verify before re-running.
sys.path.append('..')
from preprocessing import preprocess_text

ModuleNotFoundError: No module named 'nltk'

**1. Read data**

In [183]:
# Load the raw article table. The file is decoded as ISO-8859-1 (Latin-1),
# the encoding passed explicitly to the reader.
articles_csv_path = 'data/Articles.csv'
df = pd.read_csv(filepath_or_buffer=articles_csv_path, encoding='ISO-8859-1')

# Preview the first rows.
df.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [184]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(2692, 4)

**2. Extract the Article column as a list**

In [185]:
# Copy every value of the 'Article' column into a plain Python list, then
# print the entry at position 200 as a quick sample.
articles = df.loc[:, 'Article'].to_list()
print(articles[200])

ISLAMABAD: The federal government on Thursday agreed to slash withholding tax on bank transactions to 0.3 percent, after a successful round of talks with the representatives of traders.Federal Finance Minister, Ishaq Dar said the government is ready to cut down the withholding tax on bank transaction from 0.6 percent to 0.3.He, however, said that those traders who failed to file income tax returns by September 30 would be charged the withholding tax on bank transactions at the rate of 0.6 percent.Earlier, the government had decided in the budget of 2015-16 to charge 0.6 percent withholding tax on bank transactions of Rs50,000 or more from non-filers. 













In [186]:
# Number of article texts extracted from the 'Article' column.
len(articles)

2692

**3. Preprocess data**

In [187]:
# Run the project's preprocess_text helper over each article, keeping the
# results in the same order as `articles`.
processed_articles = list(map(preprocess_text, articles))

**4. Initialize and fit CountVectorizer, then fit a TfidfTransformer on the counts and transform them into tf-idf scores**

In [188]:
# Learn the vocabulary from the preprocessed articles and build the matrix of
# raw term counts (articles along the rows, terms along the columns; it is
# transposed later when turned into a DataFrame).
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

In [189]:
# Convert the raw counts into tf-idf scores. norm=None is passed here and to
# the TfidfVectorizer in step 5 so both routes use the same setting and their
# outputs can be compared directly.
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)

**5. Initialize TfidfVectorizer, then fit it on the processed articles and transform them in one step**

In [191]:
# Single-step route: go from the preprocessed text straight to tf-idf scores.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer created in
# step 4; the later get_feature_names_out() call therefore reads from this
# TfidfVectorizer.
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

**6. Check if scores match**

In [192]:
# Compare the two tf-idf matrices while they are still sparse. Converting
# both to dense form first would allocate two full (articles x vocabulary)
# float arrays just to run one comparison.
#
# The test is the element-wise rule used by np.allclose with its default
# tolerances: |a - b| <= atol + rtol * |b|. Rearranged, every entry of
# |a - b| - rtol * |b| must be <= atol, so checking the maximum is enough
# (sparse .max() also accounts for the implicit zero entries).
ALLCLOSE_RTOL = 1e-05
ALLCLOSE_ATOL = 1e-08
tolerance_excess = (
    abs(tfidf_scores_transformed - tfidf_scores)
    - ALLCLOSE_RTOL * abs(tfidf_scores)
)
if tolerance_excess.max() <= ALLCLOSE_ATOL:
    print('Are the tf-idf scores the same? YES, they are')
else:
    print('Are the tf-idf scores the same? No, something is wrong')


Are the tf-idf scores the same? YES, they are


In [193]:
# Labels for the term-by-article tables: the vocabulary terms from the fitted
# vectorizer, and a 1-based "Article N" label for each article.
feature_names = vectorizer.get_feature_names_out()
article_index = [f"Article {position}" for position in range(1, len(articles) + 1)]
print(article_index)

['Article 1', 'Article 2', 'Article 3', 'Article 4', 'Article 5', 'Article 6', 'Article 7', 'Article 8', 'Article 9', 'Article 10', 'Article 11', 'Article 12', 'Article 13', 'Article 14', 'Article 15', 'Article 16', 'Article 17', 'Article 18', 'Article 19', 'Article 20', 'Article 21', 'Article 22', 'Article 23', 'Article 24', 'Article 25', 'Article 26', 'Article 27', 'Article 28', 'Article 29', 'Article 30', 'Article 31', 'Article 32', 'Article 33', 'Article 34', 'Article 35', 'Article 36', 'Article 37', 'Article 38', 'Article 39', 'Article 40', 'Article 41', 'Article 42', 'Article 43', 'Article 44', 'Article 45', 'Article 46', 'Article 47', 'Article 48', 'Article 49', 'Article 50', 'Article 51', 'Article 52', 'Article 53', 'Article 54', 'Article 55', 'Article 56', 'Article 57', 'Article 58', 'Article 59', 'Article 60', 'Article 61', 'Article 62', 'Article 63', 'Article 64', 'Article 65', 'Article 66', 'Article 67', 'Article 68', 'Article 69', 'Article 70', 'Article 71', 'Article 72', 

**7. Create pandas DataFrame with word counts**

In [194]:
# Term-by-article table of raw counts: the count matrix is transposed so
# terms become rows and articles become columns, then expanded to a dense
# array for the DataFrame.
df_word_counts = pd.DataFrame(
    data=counts.transpose().toarray(),
    index=feature_names,
    columns=article_index,
)
print(df_word_counts)

              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__          0          0          0          0          0   
a300                  0          0          0          0          0   
a320                  0          0          0          0          0   
a321                  0          0          0          0          0   
a330                  0          0          0          0          0   
...                 ...        ...        ...        ...        ...   
zverev                0          0          0          0          0   
zvereva               0          0          0          0          0   
zyl                   0          0          0          0          0   
étienne               0          0          0          0          0   
über                  0          0          0          0          0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__          0          0          0          0           0  

**8. Create pandas DataFrame with tf-idf scores**

In [195]:
# Term-by-article table of tf-idf scores from the CountVectorizer +
# TfidfTransformer route (terms as rows, articles as columns).
df_tf_idf = pd.DataFrame(
    data=tfidf_scores_transformed.transpose().toarray(),
    index=feature_names,
    columns=article_index,
)
print(df_tf_idf)

              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__        0.0        0.0        0.0        0.0        0.0   
a300                0.0        0.0        0.0        0.0        0.0   
a320                0.0        0.0        0.0        0.0        0.0   
a321                0.0        0.0        0.0        0.0        0.0   
a330                0.0        0.0        0.0        0.0        0.0   
...                 ...        ...        ...        ...        ...   
zverev              0.0        0.0        0.0        0.0        0.0   
zvereva             0.0        0.0        0.0        0.0        0.0   
zyl                 0.0        0.0        0.0        0.0        0.0   
étienne             0.0        0.0        0.0        0.0        0.0   
über                0.0        0.0        0.0        0.0        0.0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__        0.0        0.0        0.0        0.0         0.0  

In [196]:
# Same table built from the TfidfVectorizer output, for side-by-side
# comparison with df_tf_idf (terms as rows, articles as columns).
df_tf_idf_2 = pd.DataFrame(
    data=tfidf_scores.transpose().toarray(),
    index=feature_names,
    columns=article_index,
)
print(df_tf_idf_2)

              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__        0.0        0.0        0.0        0.0        0.0   
a300                0.0        0.0        0.0        0.0        0.0   
a320                0.0        0.0        0.0        0.0        0.0   
a321                0.0        0.0        0.0        0.0        0.0   
a330                0.0        0.0        0.0        0.0        0.0   
...                 ...        ...        ...        ...        ...   
zverev              0.0        0.0        0.0        0.0        0.0   
zvereva             0.0        0.0        0.0        0.0        0.0   
zyl                 0.0        0.0        0.0        0.0        0.0   
étienne             0.0        0.0        0.0        0.0        0.0   
über                0.0        0.0        0.0        0.0        0.0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__        0.0        0.0        0.0        0.0         0.0  

### Print the highest-scoring term for the first 20 articles, then save the highest-scoring term of every article to highest_score_articles.txt

In [203]:
# Preview: for each of the first 20 articles, look up the term (row label)
# with the largest tf-idf score in that article's column.
for article_number in range(1, 21):
    column_label = f'Article {article_number}'
    highest_term = df_tf_idf[column_label].idxmax()
    print(f"Highest tf-idf term for Article {article_number}: {highest_term}\n")

Highest tf-idf term for Article 1: fare

Highest tf-idf term for Article 2: percent

Highest tf-idf term for Article 3: hong

Highest tf-idf term for Article 4: the

Highest tf-idf term for Article 5: oil

Highest tf-idf term for Article 6: arabia

Highest tf-idf term for Article 7: kse

Highest tf-idf term for Article 8: ang

Highest tf-idf term for Article 9: sugar

Highest tf-idf term for Article 10: oil

Highest tf-idf term for Article 11: yen

Highest tf-idf term for Article 12: hong

Highest tf-idf term for Article 13: the

Highest tf-idf term for Article 14: petrol

Highest tf-idf term for Article 15: price

Highest tf-idf term for Article 16: petrol

Highest tf-idf term for Article 17: notification

Highest tf-idf term for Article 18: percent

Highest tf-idf term for Article 19: ecc

Highest tf-idf term for Article 20: king



In [204]:
# Find the top-scoring term of every article in a single pass: idxmax()
# returns, per column ("Article N"), the row label (term) holding the maximum
# score. This replaces slicing a one-column DataFrame for each article.
top_terms = df_tf_idf.idxmax()

# Write one line per article. The encoding is set explicitly because the
# vocabulary contains non-ASCII terms (e.g. "étienne", "über"); with the
# platform-default encoding the file contents would vary between machines
# and the write could fail on an ASCII-only locale.
with open("highest_score_articles.txt", "w", encoding="utf-8") as file:
    file.writelines(
        f"Highest tf-idf term for {article_label}: {term}\n"
        for article_label, term in top_terms.items()
    )

print("Highest tf-idf terms saved to 'highest_score_articles.txt'")

Highest tf-idf terms saved to 'highest_score_articles.txt'
