In [1]:
pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting typing-extensions>=4.9.0 (from python-docx)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/244.3 kB 435.7 kB/s eta 0:00:01
   ---- ---------------------------------- 30.7/244.3 kB 435.7 kB/s eta 0:00:01
   ------ -------------------------------- 41.0/244.3 kB 245.8 kB/s eta 0:00:01
   -------------- ------------------------ 92.2/244.3 kB 438.1 kB/s eta 0:00:01
   ----------------- -------------------- 112.6/244.3 kB 504.4 kB/s eta 0:00:01
   ----------------- -------------------- 112.6/244.3 kB 504.4 kB/s eta 0:00:01
   ---------------------- --------------- 143.4/244.3 kB 448.2 kB/s eta 0:00:01
   ------------------------------ ------- 194.6/244.3 kB 537.4 kB/s eta 0:00:01
   -----------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Importing necessary libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import heapq
import pandas as pd
import docx  # Library to read Word documents

# Downloading required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Text Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Step 2: Keyword Extraction using TF-IDF
def extract_keywords(text):
    # Vectorizing the text using TF-IDF
    vectorizer = TfidfVectorizer(max_features=10)
    X = vectorizer.fit_transform([text])
    
    # Get feature names (keywords) and their scores
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = X.toarray()[0]
    
    # Creating a DataFrame for better visualization
    df = pd.DataFrame({'Keyword': feature_names, 'TF-IDF Score': tfidf_scores})
    return df

# Step 3: Text Summarization
def summarize_text(text, num_sentences=3):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    
    # Compute the TF-IDF matrix for the sentences
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    
    # Rank sentences based on the sum of TF-IDF scores
    sentence_scores = X.sum(axis=1)
    
    # Select the top 'num_sentences' sentences with the highest scores
    top_sentence_indices = heapq.nlargest(num_sentences, range(len(sentence_scores)), key=sentence_scores.__getitem__)
    
    # Return the summary
    summary = [sentences[i] for i in top_sentence_indices]
    return ' '.join(summary)


# Function to read text from .docx file
def read_docx(file_path):
    doc = docx.Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)


if __name__ == "__main__":
    # Reading content from the uploaded .doc file (converted to .docx)
    file_path = "text.docx"  # Make sure the file is in the appropriate format
    content = read_docx(file_path)

    # Preprocessing text
    cleaned_text = preprocess_text(content)

    # Extracting keywords
    keywords_df = extract_keywords(cleaned_text)
    print("Extracted Keywords:")
    display(keywords_df)

    # Summarizing text
    summary = summarize_text(content, num_sentences=5)
    print("\nSummary:")
    print(summary)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Extracted Keywords:


Unnamed: 0,Keyword,TF-IDF Score
0,complex,0.332182
1,jahan,0.249136
2,mahal,0.498273
3,mausoleum,0.332182
4,million,0.249136
5,mughal,0.332182
6,shah,0.249136
7,taj,0.332182
8,tomb,0.249136
9,world,0.249136



Summary:
The Taj Mahal complex is believed to have been completed in its entirety in 1653 at a cost estimated at the time to be around ₹5 million, which in 2023 would be approximately ₹35 billion (US$77.8 million). The tomb is the centrepiece of a 17-hectare (42-acre) complex, which includes a mosque and a guest house, and is set in formal gardens bounded on three sides by a crenellated wall. While the mausoleum is constructed of white marble inlaid with semi-precious stones, red sandstone was used for other buildings in the complex similar to the Mughal era buildings of the time. The construction project employed more than 20,000 workers and artisans under the guidance of a board of architects led by Ustad Ahmad Lahori, the emperor's court architect. It was commissioned in 1631 by the fifth Mughal emperor, Shah Jahan (r. 1628–1658) to house the tomb of his beloved wife, Mumtaz Mahal; it also houses the tomb of Shah Jahan himself.
