In [None]:
import re
import requests
from bs4 import BeautifulSoup
import json

def extract_information(url, timeout=10):
    """Fetch a web page and extract its title, metadata, text, links and phone numbers.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A dictionary with the keys ``url``, ``title``, ``meta_description``,
        ``header_tags``, ``text_content``, ``images``, ``links`` and
        ``contact_info``, or ``None`` (after printing the reason) when the
        request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems, timeouts and malformed URLs are reported the
        # same way as an unexpected status code.
        print("Failed to retrieve the webpage:", error)
        return None

    if response.status_code != 200:
        print("Failed to retrieve the webpage:", response.status_code)
        return None

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract Title. ``.string`` is None when <title> is empty or contains
    # nested tags, so fall back to an empty string in that case as well.
    title = soup.title.string if soup.title and soup.title.string else ""

    # Extract Meta Description. The tag may exist without a content attribute.
    meta_tag = soup.find("meta", attrs={"name": "description"})
    meta_description = meta_tag.get("content", "") if meta_tag else ""

    # Extract Header Tags
    header_tags = [header.text.strip() for header in soup.find_all(["h1", "h2", "h3"])]

    # Extract Text Content
    text_content = soup.get_text(strip=True)

    # Extract Images. <img> tags without a src attribute are skipped instead
    # of raising KeyError, mirroring the href=True filter used for links.
    images = [image['src'] for image in soup.find_all('img', src=True)]

    # Extract Links
    links = [link['href'] for link in soup.find_all('a', href=True)]

    # Extract Contact Information. The optional country-code prefix is a
    # non-capturing group: with a capturing group re.findall would return
    # only that group's text instead of the whole matched number.
    phone_pattern = r'(?:\+\d{1,3}\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'
    contact_info = [
        re.sub(r'[^\d-]', '', number)  # keep digits and hyphens only
        for number in re.findall(phone_pattern, html_content)
    ]

    # Create a dictionary to store the extracted information
    data = {
        "url": url,
        "title": title,
        "meta_description": meta_description,
        "header_tags": header_tags,
        "text_content": text_content,
        "images": images,
        "links": links,
        "contact_info": contact_info
    }

    return data


def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces.

    The target file is created or overwritten, and a confirmation line is
    printed once the content has been written.
    """
    with open(output_file, "w") as handle:
        handle.write(json.dumps(data, indent=4))
    print("Data saved to", output_file)


# Example usage: ask for a URL, scrape it and, if anything came back,
# store the result in website_data.json.
# Surrounding whitespace is stripped from the typed URL.
url = input("Enter the Url").strip()
output_file = "website_data.json"

# extract_information returns None when the page could not be retrieved,
# so only save when there is data to write.
extracted_data = extract_information(url)
if extracted_data:
    save_to_json(extracted_data, output_file)

Enter the Urlhttps://www.amazon.in/
Data saved to website_data.json


In [None]:
import json

# File holding the scraped page data
json_file = "website_data.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)


{'url': 'https://www.amazon.in/', 'title': 'Online Shopping site in India: Shop Online for Mobiles, Books, Watches, Shoes and More - Amazon.in', 'meta_description': 'Amazon.in: Online Shopping India - Buy mobiles, laptops, cameras, books, watches, apparel, shoes and e-Gift Cards. Free Shipping & Cash on Delivery Available.', 'header_tags': ['Makeup products', 'New looks for the new season', 'Do up your home', 'Smart gadgets by Amazon', 'Value bazaar', 'Work from home essentials', 'Revamp your home in style', 'Innovations from Emerging Indian Brands'], 'text_content': "Online Shopping site in India: Shop Online for Mobiles, Books, Watches, Shoes and More - Amazon.inSkip to main content.inHelloSelect your addressAllSelect the department you want to search inAll CategoriesAlexa SkillsAmazon DevicesAmazon FashionAmazon FreshAmazon PharmacyAppliancesApps & GamesBabyBeautyBooksCar & MotorbikeClothing & AccessoriesCollectiblesComputers & AccessoriesElectronicsFurnitureGarden & OutdoorsGift Ca

In [None]:
import nltk
# Fetch the NLTK resources needed by the preprocessing cell: the punkt
# tokenizer models, the stopword corpus and the WordNet corpus.
# The value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import json
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read the scraped page data back from disk
json_file = "website_data.json"
with open(json_file) as file:
    data = json.loads(file.read())

# Preprocessing function
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized words.

    Anything that looks like an HTML tag is dropped, every non-letter is
    replaced by a space, the text is lowercased and tokenized, English
    stopwords are discarded and the remaining tokens are lemmatized.
    """
    without_tags = re.sub('<[^<]+?>', '', text)
    letters_only = re.sub('[^a-zA-Z]', ' ', without_tags).lower()

    words = word_tokenize(letters_only)
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = [wordnet.lemmatize(word) for word in words if word not in english_stopwords]
    return ' '.join(lemmas)

# Preprocess the text content
cleaned_text = preprocess_text(data['text_content'])

# Preprocess meta_description
preprocessed_meta_description = preprocess_text(data['meta_description'])

# Preprocess images
# The stopword set and the lemmatizer do not depend on the image, so they
# are built once here instead of once per loop iteration.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

preprocessed_images = []
for image in data['images']:
    # Replace everything except letters and whitespace with a space and
    # lowercase the result. The former URL and file-extension substitutions
    # ran after this step, when no ':', '/' or '.' was left to match, so
    # they never changed anything and have been dropped.
    cleaned_image = re.sub(r'[^a-zA-Z\s]', ' ', image).lower()
    # Tokenize, drop stopwords and lemmatize what remains
    tokens = word_tokenize(cleaned_image)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back to a single string
    cleaned_image = ' '.join(tokens)
    preprocessed_images.append(cleaned_image)


# Preprocess links: strip the URL scheme, then apply the text pipeline
preprocessed_links = [re.sub(r'https?://', '', link) for link in data['links']]
preprocessed_links = [preprocess_text(link) for link in preprocessed_links]

# Preprocess contact_info: keep entries with at least one alphanumeric character
preprocessed_contact_info = [info for info in data['contact_info'] if any(c.isalnum() for c in info)]

# Create a new dictionary to store the preprocessed data
preprocessed_data = {
    "url": data["url"],
    "cleaned_text": cleaned_text,
    "preprocessed_meta_description": preprocessed_meta_description,
    "preprocessed_images": preprocessed_images,
    "preprocessed_links": preprocessed_links,
    "preprocessed_contact_info": preprocessed_contact_info
}

# Drop every empty (falsy) entry except cleaned_text, which is always kept
preprocessed_data = {key: value for key, value in preprocessed_data.items() if value or key == "cleaned_text"}

# Save the preprocessed data to a JSON file
output_file = "preprocessed_data.json"
with open(output_file, "w") as json_file:
    json.dump(preprocessed_data, json_file, indent=4)

print("Preprocessed data saved to", output_file)


Preprocessed data saved to preprocessed_data.json


In [None]:
import json

# File holding the preprocessed fields
json_file = "preprocessed_data.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)


{'url': 'https://www.amazon.in/', 'cleaned_text': 'online shopping site india shop online mobile book watch shoe amazon inskip main content inhelloselect addressallselect department want search inall categoriesalexa skillsamazon devicesamazon fashionamazon freshamazon pharmacyappliancesapps gamesbabybeautybookscar motorbikeclothing accessoriescollectiblescomputers accessorieselectronicsfurnituregarden outdoorsgift cardsgrocery gourmet foodshealth personal carehome kitchenindustrial scientificjewellerykindle storeluggage bagsluxury beautymovies tv showsmusicmusical instrumentsoffice productspet suppliesprime videoshoes handbagssoftwaresports fitness outdoorssubscribe savetools home improvementtoys gamesunder video gameswatchessearch amazon inenhello sign inaccount listsreturns orderscartallamazon minitvsellbest sellersmobilestoday dealscustomer servicenew releasesprimeelectronicshome kitchenamazon paygift ideasfashioncomputersbookscouponsbeauty personal caretoys gamessports fitness outd

In [None]:
# NLP Analysis: fetch the NLTK resources requested for the analysis cells
# (tokenizer models, POS tagger, WordNet and the VADER sentiment lexicon).
# This cell has no import of its own; it relies on `nltk` having been
# imported by an earlier cell.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
import json
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.probability import FreqDist
from nltk import ngrams

# Read the preprocessed fields back from disk
json_file = "preprocessed_data.json"
with open(json_file) as file:
    preprocessed_data = json.loads(file.read())

# Sentiment of the cleaned page text (VADER polarity scores)
text = preprocessed_data['cleaned_text']
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)
print("Sentiment Scores:", sentiment_scores)

# Word frequencies: the 50 most frequent tokens
tokens = nltk.word_tokenize(text)
fdist = FreqDist(tokens)
print("Most Common Words:", fdist.most_common(50))

# N-gram frequencies: the 50 most frequent n-token sequences
n = 2
ngrams_list = list(ngrams(tokens, n))
print(f"Most Common {n}-grams:", FreqDist(ngrams_list).most_common(50))


Sentiment Scores: {'neg': 0.006, 'neu': 0.888, 'pos': 0.106, 'compound': 0.9628}
Most Common Words: [('amazon', 4), ('page', 4), ('personal', 3), ('home', 3), ('look', 3), ('back', 3), ('online', 2), ('gourmet', 2), ('tv', 2), ('fitness', 2), ('recently', 2), ('viewed', 2), ('item', 2), ('featured', 2), ('recommendation', 2), ('view', 2), ('edit', 2), ('browsing', 2), ('historyafter', 2), ('viewing', 2), ('product', 2), ('detail', 2), ('find', 2), ('easy', 2), ('way', 2), ('navigate', 2), ('interested', 2), ('million', 2), ('ad', 2), ('shopping', 1), ('site', 1), ('india', 1), ('shop', 1), ('mobile', 1), ('book', 1), ('watch', 1), ('shoe', 1), ('inskip', 1), ('main', 1), ('content', 1), ('inhelloselect', 1), ('addressallselect', 1), ('department', 1), ('want', 1), ('search', 1), ('inall', 1), ('categoriesalexa', 1), ('skillsamazon', 1), ('devicesamazon', 1), ('fashionamazon', 1)]
Most Common 2-grams: [(('recently', 'viewed'), 2), (('viewed', 'item'), 2), (('item', 'featured'), 2), (('f

In [None]:
import json
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.probability import FreqDist
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize

# Load the preprocessed data
json_file = "preprocessed_data.json"
with open(json_file) as file:
    preprocessed_data = json.load(file)

# Make sure the NER resources are available. This is done once up front
# rather than on every loop iteration, which only repeated the same
# download log for each field.
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Shared by every field: one analyzer instance and the n-gram size
sid = SentimentIntensityAnalyzer()
n = 2

# Dictionary to store the NLP analysis results, keyed by field name
nlp_analysis = {}

# Perform NLP analysis on each field
for field, text in preprocessed_data.items():
    analysis_result = {}

    # List-valued fields are analysed as one space-joined string
    if isinstance(text, list):
        text = ' '.join(text)

    # Sentiment Analysis
    sentiment_scores = sid.polarity_scores(text)
    analysis_result["Sentiment Scores"] = sentiment_scores

    # Frequency Distribution
    tokens = word_tokenize(text)
    fdist = FreqDist(tokens)
    analysis_result["Most Common Words"] = fdist.most_common(50)

    # N-gram Analysis
    ngrams_list = list(ngrams(tokens, n))
    analysis_result[f"Most Common {n}-grams"] = nltk.FreqDist(ngrams_list).most_common(50)

    # Tokenization
    analysis_result["Tokens"] = tokens

    # Sentence Tokenization
    sentences = sent_tokenize(text)
    analysis_result["Sentences"] = sentences

    # Part-of-Speech (POS) Tagging
    pos_tags = nltk.pos_tag(tokens)
    analysis_result["POS Tags"] = pos_tags

    # Named Entity Recognition (NER) on the POS-tagged tokens
    ner_tags = nltk.ne_chunk(pos_tags)
    analysis_result["NER Tags"] = ner_tags

    # Add the analysis results to the dictionary
    nlp_analysis[field] = analysis_result

# Regroup the results by analysis type: each type maps to a list holding
# one entry per analysed field, in field order.
combined_analysis = {}

for field, analysis_result in nlp_analysis.items():
    for analysis_type, result in analysis_result.items():
        combined_analysis.setdefault(analysis_type, []).append(result)

# Print the combined NLP analysis results
for analysis_type, results in combined_analysis.items():
    print(analysis_type + ":")
    print(results)
    print("-" * 50)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nl

Sentiment Scores:
[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.006, 'neu': 0.888, 'pos': 0.106, 'compound': 0.9628}, {'neg': 0.0, 'neu': 0.669, 'pos': 0.331, 'compound': 0.7845}, {'neg': 0.024, 'neu': 0.853, 'pos': 0.123, 'compound': 0.9705}, {'neg': 0.005, 'neu': 0.866, 'pos': 0.128, 'compound': 0.9974}]
--------------------------------------------------
Most Common Words:
[[('https', 1), (':', 1), ('//www.amazon.in/', 1)], [('amazon', 4), ('page', 4), ('personal', 3), ('home', 3), ('look', 3), ('back', 3), ('online', 2), ('gourmet', 2), ('tv', 2), ('fitness', 2), ('recently', 2), ('viewed', 2), ('item', 2), ('featured', 2), ('recommendation', 2), ('view', 2), ('edit', 2), ('browsing', 2), ('historyafter', 2), ('viewing', 2), ('product', 2), ('detail', 2), ('find', 2), ('easy', 2), ('way', 2), ('navigate', 2), ('interested', 2), ('million', 2), ('ad', 2), ('shopping', 1), ('site', 1), ('india', 1), ('shop', 1), ('mobile', 1), ('book', 1), ('watch', 1), ('shoe', 1)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
import json

# Store the NLP analysis results of all fields in a single variable.
# Sentiment score dicts are collected one per field; every other result is
# list-like and is concatenated across fields.
combined_analysis = {}

for field, analysis_result in nlp_analysis.items():
    for analysis_type, result in analysis_result.items():
        bucket = combined_analysis.setdefault(analysis_type, [])
        if analysis_type == "Sentiment Scores":
            bucket.append(result)
        else:
            bucket.extend(result)

# Calculate the average sentiment scores. An empty analysis simply yields
# an empty average instead of a KeyError.
sentiment_scores_per_field = combined_analysis.get("Sentiment Scores", [])
average_sentiment_scores = {}
for sentiment_scores_list in sentiment_scores_per_field:
    for sentiment_type, sentiment_score in sentiment_scores_list.items():
        average_sentiment_scores[sentiment_type] = (
            average_sentiment_scores.get(sentiment_type, 0) + sentiment_score
        )

# Divide by the number of score dicts that were actually summed, not by the
# size of an unrelated variable that may have been reloaded since.
num_fields = len(sentiment_scores_per_field)
for sentiment_type, sentiment_score in average_sentiment_scores.items():
    average_sentiment_scores[sentiment_type] = sentiment_score / num_fields

# Combine average sentiment scores and other analysis results
combined_results = {"Average Sentiment Scores": average_sentiment_scores}
for analysis_type, results in combined_analysis.items():
    if analysis_type != "Sentiment Scores":
        combined_results[analysis_type] = results

# Save the combined results to a JSON file
output_file = "combined_analysis_results.json"
with open(output_file, "w") as json_file:
    json.dump(combined_results, json_file, indent=4)

print("Combined analysis results saved to", output_file)


Combined analysis results saved to combined_analysis_results.json


In [None]:
import json

# File holding the combined analysis results
json_file = "combined_analysis_results.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)

{'Average Sentiment Scores': {'neg': 0.006999999999999999, 'neu': 0.8552, 'pos': 0.1376, 'compound': 0.7430399999999999}, 'Most Common Words': [['https', 1], [':', 1], ['//www.amazon.in/', 1], ['amazon', 4], ['page', 4], ['personal', 3], ['home', 3], ['look', 3], ['back', 3], ['online', 2], ['gourmet', 2], ['tv', 2], ['fitness', 2], ['recently', 2], ['viewed', 2], ['item', 2], ['featured', 2], ['recommendation', 2], ['view', 2], ['edit', 2], ['browsing', 2], ['historyafter', 2], ['viewing', 2], ['product', 2], ['detail', 2], ['find', 2], ['easy', 2], ['way', 2], ['navigate', 2], ['interested', 2], ['million', 2], ['ad', 2], ['shopping', 1], ['site', 1], ['india', 1], ['shop', 1], ['mobile', 1], ['book', 1], ['watch', 1], ['shoe', 1], ['inskip', 1], ['main', 1], ['content', 1], ['inhelloselect', 1], ['addressallselect', 1], ['department', 1], ['want', 1], ['search', 1], ['inall', 1], ['categoriesalexa', 1], ['skillsamazon', 1], ['devicesamazon', 1], ['fashionamazon', 1], ['amazon', 1], 

In [None]:
import json

# Load the combined analysis results from the JSON file
json_file = "combined_analysis_results.json"
with open(json_file) as file:
    combined_analysis = json.load(file)

# The file is written with "Average Sentiment Scores" plus one entry per
# analysis type, so the keys requested below may be absent. Report what is
# missing and fall back to empty lists instead of stopping the notebook
# with a KeyError.
wanted_keys = ("Top Keywords", "Top Phrases", "Named Entities")
missing_keys = [key for key in wanted_keys if key not in combined_analysis]
if missing_keys:
    print("Keys missing from", json_file + ":", missing_keys)

# Extract the desired fields
top_keywords = combined_analysis.get("Top Keywords", [])
top_phrases = combined_analysis.get("Top Phrases", [])
named_entities = combined_analysis.get("Named Entities", [])

# Print the extracted fields
for heading, values in (
    ("Top Keywords:", top_keywords),
    ("Top Phrases:", top_phrases),
    ("Named Entities:", named_entities),
):
    print(heading)
    print(values)
    print("-" * 50)


KeyError: ignored

In [None]:
import json

from nltk import FreqDist, ne_chunk, ngrams, pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define the number of words in a phrase
phrase_length = 2

# English stopwords to ignore during phrase extraction. The set gets its
# own name: rebinding `stopwords` would shadow the imported corpus reader,
# so this cell could not be re-run and preprocess_text would fail afterwards.
stop_words = set(stopwords.words('english'))

# Extract keywords from the cleaned page text produced by the preprocessing
# cell (the previously referenced `cleaned_content` is not defined anywhere
# in this notebook).
keywords = word_tokenize(cleaned_text)
keyword_freq = FreqDist(keywords)
top_keywords = keyword_freq.most_common(10)  # Extract top 10 most frequent keywords

# Extract important phrases from the same tokens as the keywords, rather
# than from whatever `tokens` was left behind by another cell's loop.
phrase_freq = FreqDist()
phrases = ngrams(keywords, phrase_length)
for phrase in phrases:
    if all(word not in stop_words for word in phrase):
        phrase_freq[tuple(phrase)] += 1

top_phrases = phrase_freq.most_common(10)  # Extract top 10 important phrases

# Extract named entities from the same tokens. Chunks that carry a label
# are the recognised entities; their leaves are (word, tag) pairs.
keyword_ner_tags = ne_chunk(pos_tag(keywords))
entities = [entity for entity in keyword_ner_tags if hasattr(entity, 'label')]
named_entities = [ne[0] for entity in entities for ne in entity.leaves()]

# Create a dictionary to store the extracted features
extracted_features = {
    "Top Keywords": top_keywords,
    "Top Phrases": top_phrases,
    "Named Entities": named_entities
}

# Specify the file path
output_file_path = "output.json"

# Save the dictionary as JSON
with open(output_file_path, "w") as f:
    json.dump(extracted_features, f)

print("Output data saved to:", output_file_path)

Output data saved to: output.json


'# Print the extracted features\nfor feature, values in extracted_features.items():\n    print(feature + ":")\n    for value in values:\n        print(value)\n    print() '

In [None]:
import json

# File holding the extracted keywords, phrases and named entities
json_file = "output.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)

{'Top Keywords': [['amazon', 4], ['page', 4], ['personal', 3], ['home', 3], ['look', 3], ['back', 3], ['online', 2], ['gourmet', 2], ['tv', 2], ['fitness', 2]], 'Top Phrases': [[['recently', 'viewed'], 2], [['viewed', 'item'], 2], [['item', 'featured'], 2], [['featured', 'recommendation'], 2], [['recommendation', 'view'], 2], [['view', 'edit'], 2], [['edit', 'browsing'], 2], [['browsing', 'historyafter'], 2], [['historyafter', 'viewing'], 2], [['viewing', 'product'], 2]], 'Named Entities': []}


In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer

# Read data from JSON file
with open('output.json', 'r') as file:
    data = json.load(file)

# Each stored entry is a [term, count] pair. Keep only the term: if the
# pairs are flattened, the counts become documents of their own and any
# count with two or more digits is picked up as a "keyword".
keywords = [str(word) for word, _count in data['Top Keywords']]
# A phrase is stored as a list of words; join it into one document.
phrases = [' '.join(str(word) for word in words) for words, _count in data['Top Phrases']]

# Combine keywords and phrases
combined_data = keywords + phrases

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the combined data to obtain the TF-IDF representation
tfidf_matrix = vectorizer.fit_transform(combined_data)

# Keep only the documents that contain at least one vocabulary term
# (the default tokenizer ignores single-character tokens).
documents_with_keywords = [
    doc
    for row, doc in enumerate(combined_data)
    if len(tfidf_matrix[row, :].nonzero()[1]) > 0
]

# Update combined_data with the filtered documents
combined_data = documents_with_keywords

# Update tfidf_matrix with the filtered documents
tfidf_matrix = vectorizer.transform(combined_data)

# Get the feature names (keywords) from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# List to store the top keywords for each document
top_keywords_per_document = []

# Collect, per document, the terms with the highest TF-IDF scores
num_keywords = 10  # Number of top keywords to extract
for i, doc in enumerate(combined_data):
    feature_index = tfidf_matrix[i, :].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    sorted_tfidf_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    top_keywords = [feature_names[column] for column, _ in sorted_tfidf_scores[:num_keywords]]
    top_keywords_per_document.append(top_keywords)

# Save the output to a JSON file
output = {'Top Keywords per Document': top_keywords_per_document}

with open('keyword_extraction_output.json', 'w') as file:
    json.dump(output, file)

print("Keyword extraction completed. Output saved to keyword_extraction_output.json.")

Keyword extraction completed. Output saved to keyword_extraction_output.json.


In [None]:
import json

# JSON file that receives the hand-written ad extensions
json_file_path = 'keyword_extraction_output.json'

# Load the current content of the file
with open(json_file_path, 'r') as file:
    json_data = json.loads(file.read())

# (extension_text, category) pairs; ids are assigned in order starting at 1
ad_extension_rows = [
    ("Discover the best deals on shoes and accessories. Shop now!", "site extensions"),
    ("Upgrade your gaming experience with our latest collection. Buy now!", "location extension"),
    ("Upgrade your gaming experience with our latest collection. Buy now!", "call extension"),
    ("Upgrade your gaming experience with our latest collection. Buy now!", "callout extension"),
    ("Get fit and stay healthy with our fitness equipment. Shop today!", "service extensions"),
]

json_data['ad_extensions'] = [
    {
        "extension_id": extension_id,
        "extension_text": extension_text,
        "category": category,
    }
    for extension_id, (extension_text, category) in enumerate(ad_extension_rows, start=1)
]

# Write the enriched document back to the same file
with open(json_file_path, 'w') as file:
    file.write(json.dumps(json_data, indent=4))

In [None]:
import json

# File holding the per-document keywords and the ad extensions
json_file = "keyword_extraction_output.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)

{'Top Keywords per Document': [['amazon'], ['page'], ['personal'], ['home'], ['look'], ['back'], ['online'], ['gourmet'], ['tv'], ['fitness'], ['recently', 'viewed'], ['viewed', 'item'], ['item', 'featured'], ['recommendation', 'featured'], ['view', 'recommendation'], ['view', 'edit'], ['edit', 'browsing'], ['historyafter', 'browsing'], ['viewing', 'historyafter'], ['product', 'viewing']], 'ad_extensions': [{'extension_id': 1, 'extension_text': 'Discover the best deals on shoes and accessories. Shop now!', 'category': 'site extensions'}, {'extension_id': 2, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'location extension'}, {'extension_id': 3, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'call extension'}, {'extension_id': 4, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'callout extension'}, {'extension_id': 5, 'extensio

In [None]:
import json

# Read data from JSON file
with open('keyword_extraction_output.json', 'r') as file:
    data = json.load(file)

# Extract ad extensions and keywords from the data
ad_extensions = data['ad_extensions']
keywords = data['Top Keywords per Document']

# Prepare the data in the desired format for training the model
X = keywords  # Input features (keywords)
y = ad_extensions  # Output labels (ad extensions)

# Print up to five (keywords, ad extension) pairs. zip stops at the shorter
# list, so fewer than five entries no longer raises IndexError.
for i, (example_keywords, example_extension) in enumerate(zip(X[:5], y[:5])):
    print(f"Example {i+1}:")
    print("Keywords:", example_keywords)
    print("Ad Extension:", example_extension)
    print()

# Further processing or splitting into training/testing sets can be done as needed


Example 1:
Keywords: ['amazon']
Ad Extension: {'extension_id': 1, 'extension_text': 'Discover the best deals on shoes and accessories. Shop now!', 'category': 'site extensions'}

Example 2:
Keywords: ['page']
Ad Extension: {'extension_id': 2, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'location extension'}

Example 3:
Keywords: ['personal']
Ad Extension: {'extension_id': 3, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'call extension'}

Example 4:
Keywords: ['home']
Ad Extension: {'extension_id': 4, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'callout extension'}

Example 5:
Keywords: ['look']
Ad Extension: {'extension_id': 5, 'extension_text': 'Get fit and stay healthy with our fitness equipment. Shop today!', 'category': 'service extensions'}



In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer

# Load the per-document keyword lists
with open('keyword_extraction_output.json', 'r') as file:
    data = json.loads(file.read())

# One space-joined string per document, as the vectorizer expects text
keywords = [' '.join(terms) for terms in data['Top Keywords per Document']]

# Bag-of-words model over the keyword strings
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(keywords)

# Dense numeric matrix: one row per document, one column per vocabulary term
feature_matrix = features.toarray()

# Show the matrix under its heading
print("Feature Matrix:", feature_matrix, sep="\n")


Feature Matrix:
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1]]


In [None]:
import json

# Load the current content of the keyword file
with open('keyword_extraction_output.json', 'r') as file:
    data = json.loads(file.read())

# Attach the bag-of-words matrix as plain nested lists so it can be serialized
data['Feature Matrix'] = feature_matrix.tolist()

# Write the enriched document back to the same file
with open('keyword_extraction_output.json', 'w') as file:
    file.write(json.dumps(data, indent=4))


In [None]:
import json

# File holding the keywords, ad extensions and feature matrix
json_file = "keyword_extraction_output.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)

{'Top Keywords per Document': [['amazon'], ['page'], ['personal'], ['home'], ['look'], ['back'], ['online'], ['gourmet'], ['tv'], ['fitness'], ['recently', 'viewed'], ['viewed', 'item'], ['item', 'featured'], ['recommendation', 'featured'], ['view', 'recommendation'], ['view', 'edit'], ['edit', 'browsing'], ['historyafter', 'browsing'], ['viewing', 'historyafter'], ['product', 'viewing']], 'ad_extensions': [{'extension_id': 1, 'extension_text': 'Discover the best deals on shoes and accessories. Shop now!', 'category': 'site extensions'}, {'extension_id': 2, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'location extension'}, {'extension_id': 3, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'call extension'}, {'extension_id': 4, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'callout extension'}, {'extension_id': 5, 'extensio

In [None]:
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the keyword file, which also carries the ad extensions
with open('keyword_extraction_output.json', 'r') as file:
    data = json.loads(file.read())

# The ad extensions and their category names
ad_extensions = data["ad_extensions"]
categories = [extension["category"] for extension in ad_extensions]

# Map every category name to an integer label
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(categories)

# Store the integer label next to each extension's category; int() turns
# the encoder's numpy value into a plain int that json can serialize
for extension, encoded_label in zip(ad_extensions, encoded_labels):
    extension["category_encoded"] = int(encoded_label)

# Write the whole document, including the new labels, to a separate file
output_file = 'ad_extensions_data_encoded.json'
with open(output_file, 'w') as file:
    file.write(json.dumps(data, indent=4))

print(f"Encoded ad_extensions data saved to {output_file}")


Encoded ad_extensions data saved to ad_extensions_data_encoded.json


In [None]:
import json

# File holding the label-encoded ad extensions
json_file = "ad_extensions_data_encoded.json"

# Read the file and decode its JSON content in one step
with open(json_file) as file:
    data = json.loads(file.read())

# Show the decoded structure
print(data)

{'Top Keywords per Document': [['amazon'], ['page'], ['personal'], ['home'], ['look'], ['back'], ['online'], ['gourmet'], ['tv'], ['fitness'], ['recently', 'viewed'], ['viewed', 'item'], ['item', 'featured'], ['recommendation', 'featured'], ['view', 'recommendation'], ['view', 'edit'], ['edit', 'browsing'], ['historyafter', 'browsing'], ['viewing', 'historyafter'], ['product', 'viewing']], 'ad_extensions': [{'extension_id': 1, 'extension_text': 'Discover the best deals on shoes and accessories. Shop now!', 'category': 'site extensions', 'category_encoded': 4}, {'extension_id': 2, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'location extension', 'category_encoded': 2}, {'extension_id': 3, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy now!', 'category': 'call extension', 'category_encoded': 0}, {'extension_id': 4, 'extension_text': 'Upgrade your gaming experience with our latest collection. Buy n

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Read the encoded dataset back from disk
with open('ad_extensions_data_encoded.json', 'r') as file:
    data = json.load(file)

# Features come from the "Feature Matrix" entry, labels from the ad extensions
keywords = data["Feature Matrix"]
ad_extensions = data["ad_extensions"]

# Record how many samples each source provides
num_samples_keywords, num_samples_ad_extensions = len(keywords), len(ad_extensions)

# Keep only as many samples as the shorter source has, so both have equal length
min_num_samples = min(num_samples_keywords, num_samples_ad_extensions)
keywords, ad_extensions = keywords[:min_num_samples], ad_extensions[:min_num_samples]

# One integer label per remaining ad extension
labels = [extension["category_encoded"] for extension in ad_extensions]

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
# and convert each of the four resulting lists to a NumPy array
X_train, X_test, y_train, y_test = map(
    np.array,
    train_test_split(keywords, labels, test_size=0.2, random_state=42),
)

# Report the shapes of the resulting arrays
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)


Training data shape: (4, 21) (4,)
Testing data shape: (1, 21) (1,)


In [None]:
# Show the shape of each split, one per line: train X, train y, test X, test y
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(4, 25)
(4,)
(1, 25)
(1,)


In [None]:
import numpy as np

# Turn X into a NumPy array and transpose it
X = np.array(X).T
# Turn y into a NumPy array laid out as a single column (one row per label)
y = np.array(y).reshape(-1, 1)

# Show the resulting shapes of X and y
for arr in (X, y):
    print(arr.shape)

(21,)
(5, 1)


  X = np.array(X)  # Convert X to a NumPy array


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Reshape the input arrays
#X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
#X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Random forest classifier. The seed is fixed so that repeated runs of this
# cell build the same trees and therefore report the same grid-search result
# (42 matches the seed used for the train/test split).
rf = RandomForestClassifier(random_state=42)

# Hyperparameter values to try; every combination is evaluated
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# scikit-learn estimators require a 2-D (n_samples, n_features) matrix, but the
# LSTM cell in this notebook overwrites X_train with a 3-D array. Flatten to one
# row per sample so this cell works regardless of which cell ran first; for an
# array that is already 2-D this reshape leaves the shape unchanged.
# NOTE(review): a 3-D X_train is presumably what raised the ValueError recorded
# for this cell -- confirm by re-running on a fresh kernel.
X_train_2d = X_train.reshape(X_train.shape[0], -1)

# Leave-one-out cross-validation: each training sample is held out once
leave_one_out = LeaveOneOut()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=leave_one_out, scoring='accuracy')

# Run the grid search on the 2-D feature matrix
grid_search.fit(X_train_2d, y_train)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


ValueError: ignored

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Add a single-timestep axis: (samples, features) -> (samples, 1, features).
# The last dimension is given as -1 so the cell is safe to re-run: an array
# that is already (samples, 1, features) comes out unchanged, whereas taking
# the feature count from shape[1] would request (samples, 1, 1) on a second
# run and raise.
X_train = X_train.reshape(X_train.shape[0], 1, -1)
X_test = X_test.reshape(X_test.shape[0], 1, -1)

# Model: one LSTM layer over the (timesteps, features) input, followed by a
# softmax layer with one output unit per class known to label_encoder
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Labels are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs, evaluating on the held-out test split after each epoch
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc0837c7a00>

In [None]:
# The model expects 3-D input (samples, timesteps, features). Use X_test as is
# when it already has three dimensions; otherwise add the single timestep axis
# so this cell does not fail on a 2-D X_test.
X_test_reshaped = X_test if X_test.ndim == 3 else X_test.reshape(X_test.shape[0], 1, -1)
num_samples, num_timesteps, num_features = X_test_reshaped.shape

# Use the trained model to make predictions on the test data
predictions = model.predict(X_test_reshaped)

# Pick the class with the highest predicted probability for each sample
predicted_labels = np.argmax(predictions, axis=1)

# Decode predicted and true integer labels back to category names
predicted_categories = label_encoder.inverse_transform(predicted_labels)
true_categories = label_encoder.inverse_transform(np.ravel(y_test))

# Print one row per test sample. The header announces two columns, so both the
# predicted and the true category are printed.
print("Predicted Categories\tTrue Categories")
for predicted, actual in zip(predicted_categories, true_categories):
    print(f"{predicted}\t{actual}")

Predicted Categories	True Categories
service extensions
