In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from docx import Document

In [2]:
def read_word_files(file_paths):
    """Load the paragraph text of several Word documents.

    Args:
        file_paths: Iterable of paths passed one by one to ``Document``.

    Returns:
        A list of strings, one per file that was read successfully, each
        being that document's paragraph texts joined with newlines. A file
        that raises while being read is reported on stdout and skipped, so
        the result can be shorter than ``file_paths``.
    """
    loaded_texts = []
    for file_path in file_paths:
        try:
            paragraph_texts = [para.text for para in Document(file_path).paragraphs]
            loaded_texts.append('\n'.join(paragraph_texts))
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error reading the Word file '{file_path}': {e}")
    return loaded_texts

# Input corpus: hard-coded absolute Windows paths to the Word documents.
word_file_paths = [
    r"D:\code\NLP\doc\Doc1.docx",
    r"D:\code\NLP\doc\Doc 2.docx",
    r"D:\code\NLP\doc\Doc 3.docx",
    r"D:\code\NLP\doc\Doc 4.docx",
    r"D:\code\NLP\doc\Doc 5.docx",
    r"D:\code\NLP\doc\Doc 6.docx",
]

# One text string per document that could be read (unreadable files are skipped).
contents = read_word_files(word_file_paths)

def tokenize_and_remove_stopwords(contents):
    """Tokenize each document and drop English stop words from its word list.

    Args:
        contents: Iterable of document text strings.

    Returns:
        A list with one ``(words, sentences)`` tuple per document.
        ``words`` holds the ``word_tokenize`` tokens whose lowercase form is
        not in NLTK's English stop-word list (no other filtering is applied),
        and ``sentences`` is the ``sent_tokenize`` split of the unmodified text.
    """
    english_stop_words = set(stopwords.words('english'))

    def _tokenize_one(text):
        # Compare in lowercase so capitalised stop words are removed as well.
        kept_tokens = [
            token
            for token in word_tokenize(text)
            if token.lower() not in english_stop_words
        ]
        return kept_tokens, sent_tokenize(text)

    return [_tokenize_one(text) for text in contents]

tokenized_docs_no_stopwords = tokenize_and_remove_stopwords(contents)

# Print every document's filtered tokens and sentence split for inspection.
for i, doc_tokens in enumerate(tokenized_docs_no_stopwords, start=1):
    words, sentences = doc_tokens
    print(f"\nTokens for Document {i} after removing stopwords:")
    print("Words:", words)
    print("Sentences:", sentences)


Tokens for Document 1 after removing stopwords:
Words: ['Formula', 'One', ',', 'commonly', 'known', 'Formula', '1', 'F1', ',', 'highest', 'class', 'international', 'racing', 'open-wheel', 'single-seater', 'formula', 'racing', 'cars', 'sanctioned', 'Fédération', 'Internationale', 'de', "l'Automobile", '(', 'FIA', ')', '.', 'FIA', 'Formula', 'One', 'World', 'Championship', 'one', 'premier', 'forms', 'racing', 'around', 'world', 'since', 'inaugural', 'running', '1950', '.', 'word', 'formula', 'name', 'refers', 'set', 'rules', 'participants', "'", 'cars', 'must', 'conform', '.', 'Formula', 'One', 'season', 'consists', 'series', 'races', ',', 'known', 'Grands', 'Prix', '.', 'Grands', 'Prix', 'take', 'place', 'multiple', 'countries', 'continents', 'around', 'world', 'either', 'purpose-built', 'circuits', 'closed', 'public', 'roads', '.', 'point-system', 'used', 'Grands', 'Prix', 'determine', 'two', 'annual', 'World', 'Championships', ':', 'one', 'drivers', ',', 'one', 'constructors', '(', '

In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
def select_content_tfidf(tokenized_docs, num_sentences, num_top_words=5):
    """Pick the best-scoring sentences of each document using corpus TF-IDF.

    A TF-IDF model is fitted on the documents (each rebuilt by joining its
    word list with spaces). The ``num_top_words`` vocabulary terms with the
    highest mean TF-IDF across documents are printed, and every sentence is
    scored as the sum of the mean TF-IDF of the top terms it contains (each
    term counted once per sentence).

    Note that sentences are re-split from the joined word lists, not taken
    from the second element of each tuple, so the returned sentences are the
    stop-word-stripped text.

    Args:
        tokenized_docs: Sequence of ``(words, sentences)`` tuples; only
            ``words`` (a list of string tokens) is used.
        num_sentences: Maximum number of sentences to keep per document.
        num_top_words: Number of highest-scoring vocabulary terms used for
            scoring. Defaults to 5.

    Returns:
        A flat list of the selected sentences, grouped by document in input
        order and sorted by descending score within each document. An empty
        ``tokenized_docs`` yields an empty list.
    """
    # Nothing to rank; fitting the vectorizer on an empty corpus would raise.
    if not tokenized_docs:
        return []

    documents = [' '.join(words) for words, _ in tokenized_docs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # Tokenize sentences exactly the way the vocabulary was built. The
    # vectorizer splits on punctuation (e.g. "point-system" -> "point",
    # "system"), so matching its feature names against NLTK tokens would
    # miss top words that occur inside hyphenated or apostrophe tokens.
    analyze = vectorizer.build_analyzer()

    # Mean over documents yields a 1 x vocab matrix; flatten it to 1-D.
    word_tfidf_means = np.array(np.mean(tfidf_matrix, axis=0)).reshape(-1)
    top_word_indices = np.argsort(word_tfidf_means)[::-1][:num_top_words]

    print(f"Top {num_top_words} words based on TF-IDF scores:")
    top_word_scores = {}
    for idx in top_word_indices:
        word = feature_names[idx]
        tfidf_score = word_tfidf_means[idx]
        print(f"{word}: {tfidf_score}")
        top_word_scores[word] = tfidf_score

    selected_sentences = []
    for doc_content in documents:
        sentences = sent_tokenize(doc_content)
        sentence_tfidf_scores = []
        for sentence in sentences:
            # Set membership keeps the per-sentence lookup cheap.
            sentence_words = set(analyze(sentence))
            sentence_tfidf = 0
            for word, word_tfidf in top_word_scores.items():
                if word in sentence_words:
                    sentence_tfidf += word_tfidf
            sentence_tfidf_scores.append(sentence_tfidf)
        # Highest scores first, truncated to the requested number of sentences.
        top_sentence_indices = np.argsort(sentence_tfidf_scores)[::-1][:num_sentences]
        selected_sentences.extend(sentences[idx] for idx in top_sentence_indices)
    return selected_sentences

# Keep the two best-scoring sentences from every document.
selected_sentences_tfidf = select_content_tfidf(
    tokenized_docs_no_stopwords,
    num_sentences=2,
)

# Show the selection as a numbered list.
print("\nSelected Sentences based on TF-IDF:")
for i, sentence in enumerate(selected_sentences_tfidf, start=1):
    print(f"{i}. {sentence}")

Top 5 words based on TF-IDF scores:
one: 0.09446063755636129
lap: 0.09063245606237069
prix: 0.07810972780528723
formula: 0.07234073035134353
chassis: 0.061396642684729705

Selected Sentences based on TF-IDF:
1. Formula One season consists series races , known Grands Prix .
2. point-system used Grands Prix determine two annual World Championships : one drivers , one constructors ( teams ) .
3. driver died injuries sustained track wheel Formula One car 20 years 2014 Japanese Grand Prix , Jules Bianchi collided recovery vehicle aquaplaning circuit , dying nine months later injuries .
4. Since 1994 , three track marshals died , one 2000 Italian Grand Prix , [ 31 ] second 2001 Australian Grand Prix [ 31 ] third 2013 Canadian Grand Prix .
5. 2016 season also saw new team , Haas , join grid , Max Verstappen became youngest-ever race winner age 18 Spain .
6. team continued form following two seasons , winning 16 races 2015 taking record 19 wins 2016 , Hamilton claiming title former year Rosber