In [1]:
import os
import re
import nltk
import torch
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# SIMPLE SUMMARISER

In [3]:
import numpy as np
import pptx
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your PowerPoint data
pptx_filepath = 'NLP.pptx'

def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

slides_text = extract_text_from_pptx(pptx_filepath)

# Preprocess your data
corpus = []
for text in slides_text:
    sentences = nltk.sent_tokenize(text)
    preprocessed_sentences = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        tokens = [word for word in tokens if word.lower() not in stop_words and len(word) > 1]
        preprocessed_sentence = ' '.join(tokens)
        preprocessed_sentences.append(preprocessed_sentence)
    corpus.append(' '.join(preprocessed_sentences))

# Convert the corpus to a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Calculate cosine similarity between each pair of sentences
similarity_matrix = cosine_similarity(X, X)

# Extract the most important sentences based on cosine similarity
num_sentences = 2
important_sentences_indices = similarity_matrix.argsort(axis=1)[:, ::-1][:, :num_sentences]
important_sentences = [slides_text[index] for index in important_sentences_indices[0]]

# Join the important sentences to form the summary
summary = '\n'.join(important_sentences)

# Print the summary
print(summary)


[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Natural Language Processing
CS 3216/UG, AI 5203/PG
Week-5 Language models






15
Types of Language Models
Probabilistic language models (PLMs)

Neural language models (NLMs)


# SIMPLENN MODEL ON LESS TEXTUAL DATA

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pptx

# Load PowerPoint data
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

pptx_filepath = 'NLP.pptx'
slides_text = extract_text_from_pptx(pptx_filepath)

# Preprocess the slides
def preprocess_text(text):
    # Tokenize the text
    tokens = text.split()
    # Remove stop words and short words
    tokens = [word for word in tokens if len(word) > 1]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

preprocessed_slides = [preprocess_text(text) for text in slides_text]

# Combine preprocessed slides into a single document
corpus = ' '.join(preprocessed_slides)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_slides)

# Convert TF-IDF matrix to PyTorch tensor
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)

# Define neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Define training parameters
input_size = tfidf_tensor.shape[1]  # Input size determined by TF-IDF matrix shape
hidden_size = 128  # Hidden layer size
output_size = 2  # Output size, e.g., binary classification (summary or not)
learning_rate = 0.001
num_epochs = 10

# Initialize the model
model = SimpleNN(input_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert labels to tensor 
labels = torch.tensor([1] * (len(slides_text) // 2) + [0] * (len(slides_text) - len(slides_text) // 2), dtype=torch.long)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(tfidf_tensor)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Training finished!')

# Save the trained model
torch.save(model.state_dict(), 'trained_model.pth')

# Load the trained model
model = SimpleNN(input_size, hidden_size, output_size)
model.load_state_dict(torch.load('trained_model.pth'))
model.eval()

# Generate summaries using the trained model
summary_indices = torch.argmax(model(tfidf_tensor), dim=1).tolist()
summaries = [slides_text[i] for i, label in enumerate(summary_indices) if label == 1]

# Concatenate all summaries into a single string
final_summary = '\n'.join(summaries)

# Print the final summary
print("Combined Summary of All Slides:")
print(final_summary)


Epoch [1/10], Loss: 0.6942
Epoch [2/10], Loss: 0.6913
Epoch [3/10], Loss: 0.6884
Epoch [4/10], Loss: 0.6855
Epoch [5/10], Loss: 0.6826
Epoch [6/10], Loss: 0.6795
Epoch [7/10], Loss: 0.6763
Epoch [8/10], Loss: 0.6729
Epoch [9/10], Loss: 0.6694
Epoch [10/10], Loss: 0.6656
Training finished!
Combined Summary of All Slides:
Natural Language Processing
CS 3216/UG, AI 5203/PG
Week-5 Language models






2
Recap
NLP
Applications
Regular expressions
Tokenization
Stemming
Porter Stemmer
Lemmatization
Normalization
Stopwords
Bag-of-Words
TF-IDF
NER
POS tagging
Semantics, Distributional semantics, Word2vec
8
Language Model
Classic definition- Probability distribution	over sequence of tokens
Vocabulary V – a set of tokens!

A language model p assigns each sequence of tokens (x1,….......xL) ∈
V, a probability (a number between 0 and 1),
P (x1, x2, x3, ............., xL)

The probability intuitively tells us how "good" a sequence of tokens is.
9
Language Model
Consider the probability of four strin

# SIMPLENN MODEL ON MORE TEXTUAL DATA

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pptx

# Load PowerPoint data
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

pptx_filepath = 'Module2.pptx'
slides_text = extract_text_from_pptx(pptx_filepath)

# Preprocess the slides
def preprocess_text(text):
    # Tokenize the text
    tokens = text.split()
    # Remove stop words and short words
    tokens = [word for word in tokens if len(word) > 1]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

preprocessed_slides = [preprocess_text(text) for text in slides_text]

# Combine preprocessed slides into a single document
corpus = ' '.join(preprocessed_slides)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_slides)

# Convert TF-IDF matrix to PyTorch tensor
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)

# Define neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Define training parameters
input_size = tfidf_tensor.shape[1]  # Input size determined by TF-IDF matrix shape
hidden_size = 128  # Hidden layer size
output_size = 2  # Output size, e.g., binary classification (summary or not)
learning_rate = 0.001
num_epochs = 10

# Initialize the model
model = SimpleNN(input_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


labels = torch.tensor([1] * (len(slides_text) // 2) + [0] * (len(slides_text) - len(slides_text) // 2), dtype=torch.long)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(tfidf_tensor)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Training finished!')

# Save the trained model
torch.save(model.state_dict(), 'trained_model.pth')

# Load the trained model
model = SimpleNN(input_size, hidden_size, output_size)
model.load_state_dict(torch.load('trained_model.pth'))
model.eval()

# Generate summaries using the trained model
summary_indices = torch.argmax(model(tfidf_tensor), dim=1).tolist()
summaries = [slides_text[i] for i, label in enumerate(summary_indices) if label == 1]

# Concatenate all summaries into a single string
final_summary = '\n'.join(summaries)

# Print the final summary
print("Combined Summary of All Slides:")
print(final_summary)


Epoch [1/10], Loss: 0.6921
Epoch [2/10], Loss: 0.6883
Epoch [3/10], Loss: 0.6846
Epoch [4/10], Loss: 0.6810
Epoch [5/10], Loss: 0.6773
Epoch [6/10], Loss: 0.6735
Epoch [7/10], Loss: 0.6697
Epoch [8/10], Loss: 0.6657
Epoch [9/10], Loss: 0.6615
Epoch [10/10], Loss: 0.6572
Training finished!
Combined Summary of All Slides:
Big Data Technologies
Rahul Roy
rahul.roy@mahindrauniversity.edu.in
Functionality of Each Layer
store masses of raw data from traditional sources like OLTP databases and newer, less structured sources like log files, sensors, web analytics, documents and media archives. Eg. Hadoop HDFS, Amazon S3, MongoDB

This layer is responsible for collecting and storing data from various sources. The data ingestion process of extracting data from various sources and loading it into a data repository. Eg. Stitch, Apache Kafta, Blendo

The data processing layer optimize the data to facilitate more efficient analysis, and provide a compute engine to run the queries. Ex. Spark, Postgre

# DISTILBERT SUMMARISER WITH LESS TEXTUAL DATA

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pptx

# Load PowerPoint data
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

pptx_filepath = 'NLP.pptx'
slides_text = extract_text_from_pptx(pptx_filepath)

# Preprocess the slides
def preprocess_text(text):
    # Tokenize the text
    tokens = text.split()
    # Remove stop words and short words
    tokens = [word for word in tokens if len(word) > 1]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

preprocessed_slides = [preprocess_text(text) for text in slides_text]

# Load DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
distilbert = DistilBertModel.from_pretrained(model_name)

# Tokenize input slides
input_ids = tokenizer.batch_encode_plus(preprocessed_slides, padding=True, return_tensors="pt")["input_ids"]

# Define neural network model
class DistilBertSummarizer(nn.Module):
    def __init__(self, distilbert):
        super(DistilBertSummarizer, self).__init__()
        self.distilbert = distilbert
        self.fc = nn.Linear(distilbert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids):
        outputs = self.distilbert(input_ids)[0]  # DistilBERT output
        cls_output = outputs[:, 0, :]  # Take only the first token's output (CLS token)
        scores = self.fc(cls_output)  # Pass through linear layer
        return self.sigmoid(scores)  # Apply sigmoid activation

# Define training parameters
num_epochs = 5
learning_rate = 1e-5

# Initialize the model
model = DistilBertSummarizer(distilbert)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

labels = torch.tensor([1] * (len(slides_text) // 2) + [0] * (len(slides_text) - len(slides_text) // 2), dtype=torch.float32).unsqueeze(1)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(input_ids)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Training finished!')
# Generate summaries using the trained model
with torch.no_grad():
    outputs = model(input_ids)
    predicted_labels = torch.round(outputs).squeeze().tolist()

# Combine summaries of all slides
final_summary = ""
for slide_text, label in zip(slides_text, predicted_labels):
    if label == 1:
        final_summary += slide_text + "\n"

# Print the final summary
print("Final Summary:")
print(final_summary)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch [1/5], Loss: 0.6954
Epoch [2/5], Loss: 0.6845
Epoch [3/5], Loss: 0.6776
Epoch [4/5], Loss: 0.6695
Epoch [5/5], Loss: 0.6607
Training finished!
Final Summary:
Natural Language Processing
CS 3216/UG, AI 5203/PG
Week-5 Language models






2
Recap
NLP
Applications
Regular expressions
Tokenization
Stemming
Porter Stemmer
Lemmatization
Normalization
Stopwords
Bag-of-Words
TF-IDF
NER
POS tagging
Semantics, Distributional semantics, Word2vec
Where do you see language models?
Google Search system
11
Spell correction
The office is about fifteen minuets from my house
P(about fifteen minutes from) > P(about fifteen minuets from)
12
Machine Translation
13
P(high winds tonite) > P(large winds tonite)
Speech recognition
Here,
14
P(I saw a van) >> P(eyes awe of an)
I saw a van
15
Types of Language Models
Probabilistic language models (PLMs)

Neural language models (NLMs)
Reminder: The Chain Rule

Recall the definition of conditional probabilities

P(B|A) = P(A,B)/P(A)

Rewriting: P(A,B) = P(A)

# DISTILBERT SUMMARISER WITH MORE TEXTUAL SLIDES

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pptx

# Load PowerPoint data
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

pptx_filepath = 'Module2.pptx'
slides_text = extract_text_from_pptx(pptx_filepath)

# Preprocess the slides
def preprocess_text(text):
    # Tokenize the text
    tokens = text.split()
    # Remove stop words and short words
    tokens = [word for word in tokens if len(word) > 1]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

preprocessed_slides = [preprocess_text(text) for text in slides_text]

# Load DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
distilbert = DistilBertModel.from_pretrained(model_name)

# Tokenize input slides
input_ids = tokenizer.batch_encode_plus(preprocessed_slides, padding=True, return_tensors="pt")["input_ids"]

# Define neural network model
class DistilBertSummarizer(nn.Module):
    def __init__(self, distilbert):
        super(DistilBertSummarizer, self).__init__()
        self.distilbert = distilbert
        self.fc = nn.Linear(distilbert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids):
        outputs = self.distilbert(input_ids)[0]  # DistilBERT output
        cls_output = outputs[:, 0, :]  # Take only the first token's output (CLS token)
        scores = self.fc(cls_output)  # Pass through linear layer
        return self.sigmoid(scores)  # Apply sigmoid activation

# Define training parameters
num_epochs = 5
learning_rate = 1e-5

# Initialize the model
model = DistilBertSummarizer(distilbert)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Convert labels to tensor (example: assuming the first half of the slides are summaries, the second half are not)
labels = torch.tensor([1] * (len(slides_text) // 2) + [0] * (len(slides_text) - len(slides_text) // 2), dtype=torch.float32).unsqueeze(1)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(input_ids)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Training finished!')
# Generate summaries using the trained model
with torch.no_grad():
    outputs = model(input_ids)
    predicted_labels = torch.round(outputs).squeeze().tolist()

# Combine summaries of all slides
final_summary = ""
for slide_text, label in zip(slides_text, predicted_labels):
    if label == 1:
        final_summary += slide_text + "\n"

# Print the final summary
print("Final Summary:")
print(final_summary)



Epoch [1/5], Loss: 0.7004
Epoch [2/5], Loss: 0.6524
Epoch [3/5], Loss: 0.6217
Epoch [4/5], Loss: 0.5936
Epoch [5/5], Loss: 0.5631
Training finished!
Final Summary:
Big Data Technologies
Rahul Roy
rahul.roy@mahindrauniversity.edu.in
Big Data Technologies
Big Data Stack
Big Data Architecture
Case Study
Reference: https://uber.com/en-IN/blog/uber-big-data-platform/
7


Original Google Stack
8


Facebook Version of the Stack
9


Yahoo Version of the Stack
10


LinkedIn’s Version of the Stack
11


Cloudera Version of the Stack
Big Data Technologies
Cloudera is a commercial Hadoop distribution that includes enterprise-grade features such as Cloudera Manager for cluster management, integrated security, and data governance.

