In [None]:
# INSTALLING LIBRARIES


%pip install newspaper3k
%pip install lxml_html_clean
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# !pip install transformers torch

%pip install rake_nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
%pip install sumy
%pip install google-api-python-client

%pip install selenium
%pip install webdriver-manager
%pip install bs4
%pip install pandas
%pip install matplotlib
%pip install wordcloud
%pip install BeautifulSoup

%pip install fastapi uvicorn pandas spacy pdfplumber python-docx
%pip install python-multipart
%pip install pymupdf
%pip install PyPDF2

In [None]:
# SCRAPPING ARTICLES/BLOGS USING NLP SCRAPPER



import pandas as pd
from apiclient.discovery import build
from newspaper import Article
from datetime import datetime, timedelta
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from urllib.parse import urlparse

# Google API key and Custom Search Engine ID.
# Credentials are read from the environment so they are never stored in the
# notebook. Set GOOGLE_API_KEY and GOOGLE_CSE_ID before running this cell; a
# missing variable raises KeyError immediately instead of failing later inside
# the API client. Any key that was previously committed in this notebook should
# be revoked and regenerated.
import os

api_key = os.environ["GOOGLE_API_KEY"]
cse_id = os.environ["GOOGLE_CSE_ID"]

# Column layout shared by the stored articles and the freshly scraped ones.
ARTICLE_COLUMNS = ['Date', 'Title', 'Author', 'Publication Date', 'Article Text', 'Link', 'Source URL', 'Keywords', 'Job Market Insights']

# Load existing data (start from an empty frame when no previous export exists)
try:
    existing_df = pd.read_excel('ArticlesFinal.xlsx')
except FileNotFoundError:
    existing_df = pd.DataFrame(columns=ARTICLE_COLUMNS)

# Initialize a new DataFrame; get_article_info() appends one row per article
df = pd.DataFrame(columns=ARTICLE_COLUMNS)

# Google Custom Search setup
query = 'future jobs market in computer science'     # emerging technologies in computing
resource = build("customsearch", 'v1', developerKey=api_key).cse()

def extract_keywords(text):
    """Return the ten most frequent purely alphabetic tokens of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; the result is
    a list of ``(token, count)`` pairs as produced by ``FreqDist.most_common``.
    """
    alphabetic_tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalpha():
            alphabetic_tokens.append(token)
    frequency = FreqDist(alphabetic_tokens)
    return frequency.most_common(10)

def get_article_info(url, date):
    """Download one article and append its metadata and text to the global ``df``.

    Args:
        url: Address of the article to download and parse.
        date: Value stored in the 'Date' column (the caller passes the scrape date).

    Any failure while downloading or parsing is reported on stdout and the
    article is skipped; nothing is raised to the caller.
    """
    try:
        article = Article(url, timeout=10)
        article.download()
        article.parse()

        title = article.title or "Unknown Title"
        authors = ", ".join(article.authors) if article.authors else "Unknown Authors"
        # Drop timezone info so the column holds naive datetimes only.
        publish_date = article.publish_date.replace(tzinfo=None) if article.publish_date else None
        text = article.text or None

        # Extract keywords and identify trends.
        # `text` is None for articles with an empty body, so it must be guarded
        # before calling .lower(); otherwise the article is dropped via the
        # except branch even though the keyword step already tolerates it.
        keywords = extract_keywords(text) if text else []
        mentions_tech = bool(text) and "tech" in text.lower()
        market_insights = "Tech industry growth" if mentions_tech else "General job trends"

        # Add the data to the DataFrame
        df.loc[len(df)] = [date, title, authors, publish_date, text, url, urlparse(url).netloc, keywords, market_insights]
        print(url, "data added")
    except Exception as e:
        print(f"Error processing URL: {url}")
        print(e)

def get_articles_by_date(lookback_days=6000):
    """Fetch search results inside a date window and scrape each hit.

    Args:
        lookback_days: Size of the window, in days, ending today. The default
            keeps the window this notebook has always used.

    Results are requested ten at a time (start offsets 1, 11, ..., 91) and each
    result link is handed to ``get_article_info``.
    """
    today = datetime.today()
    window_start = today - timedelta(days=lookback_days)
    # The `date:r:` range restriction takes four-digit-year dates (YYYYMMDD);
    # a two-digit year does not describe the intended window.
    sdate = window_start.strftime('%Y%m%d')
    edate = today.strftime('%Y%m%d')
    date_range = f"{sdate}:{edate}"
    urls = []

    # Fetch articles (Google Custom Search returns 10 results per page)
    for i in range(1, 100, 10):  # Pagination: up to 100 results
        result = resource.list(q=query, cx=cse_id, sort=f"date:r:{date_range}", start=i).execute()
        items = result.get('items', [])
        if not items:
            # No more results: stop instead of spending quota on empty pages.
            break
        urls += items

    print(f"Total results fetched: {len(urls)}")

    # Process each URL
    scrape_date = today.strftime('%Y-%m-%d')
    for item in tqdm(urls):
        url = item.get('link')
        if not url:
            continue  # result without a link: nothing to download
        # De-duplication against earlier runs is intentionally left disabled:
        # if url not in existing_df['Link'].values and url not in df['Link'].values:
        get_article_info(url, scrape_date)

# Fetch articles (fills the global `df` through get_article_info)
get_articles_by_date()

# Combine new and existing data
final_df = pd.concat([existing_df, df], ignore_index=True)

# Convert publication date to datetime and sort (unparseable dates become NaT)
final_df['Publication Date'] = pd.to_datetime(final_df['Publication Date'], errors='coerce')
final_df = final_df.sort_values(by=['Publication Date'], ascending=True)

# Save to CSV (note: the existing data is read from 'ArticlesFinal.xlsx', the
# combined result is written to a separate CSV file)
final_df.to_csv('ArticlesNew.csv', index=False)
print("Data saved to ArticlesNew.csv")

In [None]:
# GENERATING INSIGHTS ON THE BASIS OF SCRAPPED DATA


import re
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import pandas as pd
from textblob import TextBlob
from rake_nltk import Rake


# Shared NLP resources for this cell. The spaCy English model is used for
# named-entity recognition and part-of-speech tags; it is loaded once here
# because loading it is slow.
nlp = spacy.load("en_core_web_sm")
rake1 = Rake()


existing_df = pd.read_excel('ArticlesFinal.xlsx')

# Define positive and negative keywords
POSITIVE_WORDS = ["growth", "increase", "rise", "demand", "surge", "expand", "boost", "upward", "accelerate", "improve"]
NEGATIVE_WORDS = ["decline", "fall", "decrease", "drop", "slowdown", "recession", "cut", "downward", "stagnant"]

def analyze_job_trend(article_text):
    """Label entities mentioned in an article as rising or falling.

    Entities are the named entities (ORG, PERSON, GPE, WORK_OF_ART) and proper
    nouns spaCy finds in ``article_text``. For every sentence, the number of
    POSITIVE_WORDS and NEGATIVE_WORDS it contains is credited to each entity
    mentioned in that same sentence. Previously the counts were stored under
    the sentiment word itself, so an entity could only receive a verdict if its
    own name happened to be a sentiment word.

    Args:
        article_text: Raw article text.

    Returns:
        A newline-separated string with one ``"<entity>, rise"`` or
        ``"<entity>, fall"`` line per entity whose weighted counts are not
        balanced; an empty string when there is no such entity.
    """
    stop_words = set(stopwords.words('english'))

    # Apply named entity recognition (NER) using spaCy
    doc = nlp(article_text)

    entity_labels = ("ORG", "PERSON", "GPE", "WORK_OF_ART")
    job_roles = {ent.text.lower() for ent in doc.ents if ent.label_ in entity_labels}
    # Stop words are lower-case, so the token must be lower-cased before the check.
    technologies = {
        token.text.lower()
        for token in doc
        if token.pos_ == "PROPN" and token.text.lower() not in stop_words
    }

    print(f"Extracted Job Roles: {job_roles}")
    print(f"Extracted Technologies: {technologies}")

    tracked = job_roles | technologies
    # Multi-word entities can never equal a single token; match them as phrases.
    multi_word = [name for name in tracked if " " in name]

    trend_analysis = defaultdict(lambda: {"positive": 0, "negative": 0, "mentions": 0})

    for sentence in sent_tokenize(article_text.lower()):
        filtered_words = [
            word for word in word_tokenize(sentence)
            if word.isalnum() and word not in stop_words
        ]
        positive_hits = sum(1 for word in filtered_words if word in POSITIVE_WORDS)
        negative_hits = sum(1 for word in filtered_words if word in NEGATIVE_WORDS)

        mentioned = {word for word in filtered_words if word in tracked}
        mentioned.update(name for name in multi_word if name in sentence)

        for entity in mentioned:
            counts = trend_analysis[entity]
            counts["mentions"] += 1
            counts["positive"] += positive_hits
            counts["negative"] += negative_hits

    print(f"Trend Analysis: {trend_analysis}")  # Debugging line to show trend analysis

    output = []
    for entity, counts in trend_analysis.items():
        weighted_positive = counts["positive"]
        weighted_negative = counts["negative"] * 1.2  # Negative words have a 20% higher weight

        if weighted_positive > weighted_negative:
            output.append(f"{entity}, rise")
        elif weighted_negative > weighted_positive:
            output.append(f"{entity}, fall")
        # Balanced counts are "unclear" and are left out of the result.

    return "\n".join(output)



# Example usage
if __name__ == "__main__":

    article_text = (existing_df['Article Text'].iloc[56])
    # article_text = "From the invention of the wheel to advancements in metallurgy, history has been defined by transformative innovations that have reshaped how we live and work. Today, we stand at the threshold of another paradigm shift driven by emerging technologies such as Artificial Intelligence (AI), Blockchain, the Internet of Things (IoT), and Machine Learning (ML), all of which are powering the Web 3.0 revolution."


    # Analyze trends dynamically
    result = analyze_job_trend(article_text)
    print(result)




# new code


def extract_context_with_trends(text, words, trend_words):
  """Collect the surrounding context of every occurrence of the given words.

  Args:
    text: Text to search; it is split on whitespace and compared token by token.
    words: Iterable of tokens to look for (exact token match).
    trend_words: Phrases looked up, by substring, in the context strings.

  Returns:
    A list of ``(word, context_before, context_after, trend)`` tuples. The
    contexts hold up to 15 tokens on each side; ``trend`` is the first entry of
    ``trend_words`` found in either context, or ``None``.
  """
  # Split once: re-splitting the text for every token made this quadratic.
  tokens = text.split()
  token_count = len(tokens)

  results = []
  for word in words:
    for i, token in enumerate(tokens):
      if token != word:
        continue
      start_index = max(0, i - 15)
      end_index = min(token_count, i + 16)
      context_before = " ".join(tokens[start_index:i])
      context_after = " ".join(tokens[i + 1:end_index])

      trend = next(
        (tw for tw in trend_words if tw in context_before or tw in context_after),
        None,
      )
      results.append((word, context_before, context_after, trend))
  return results

# Example usage:
text = (existing_df['Article Text'].iloc[56]).lower()
final_text = ""
keyword_text = ""
sentiment_keyword = []


# text = "From the invention of the wheel to advancements in metallurgy, history has been defined by transformative innovations that have reshaped how we live and work. Today, we stand at the threshold of another paradigm shift driven by emerging technologies such as Artificial Intelligence (AI), Blockchain, the Internet of Things (IoT), and Machine Learning (ML), all of which are powering the Web 3.0 revolution."
# text = text.lower()

sentences = sent_tokenize(article_text.lower())
stop_words = set(stopwords.words('english'))
results = defaultdict(str)
nlp = spacy.load("en_core_web_sm")
    # Apply named entity recognition (NER) using spaCy
doc = nlp(article_text)

    # Extract job roles and technologies (assumed to be proper nouns or certain noun phrases)
job_roles = set([ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "GPE", "WORK_OF_ART"]])
technologies = set([token.text.lower() for token in doc if token.pos_ == "PROPN" and token.text not in stop_words])
trend_words = [r'growth in', r'incline in', r'taking over', r'increase in demand for', r'advancements in', r'emerging', r'increase', r'rise', r'growth', r'demand', r'higher', r'more', r'expand', r'boost', r'opportunity', r'gain', r'decrease', r'fall', r'decline', r'drop', r'lower', r'less', r'reduce', r'cut', r'struggle', r'lack', r'uptick', r'surge', r'boom', r'prosper', r'flourish', r'thrive', r'soar', r'escalate', r'accelerate', r'expansion', r'diversification', r'innovation', r'disruption', r'downturn', r'recession', r'contraction', r'shrinkage', r'retrenchment', r'layoff', r'redundancy', r'stagnation', r'job market', r'labor force', r'unemployment rate', r'hiring spree', r'skill shortage', r'remote work', r'gig economy', r'artificial intelligence', r'automation']

context_list = extract_context_with_trends(text, technologies, trend_words)

for word, before, after, trend in context_list:

  # TextBlob polarity of the surrounding context (the word itself is excluded).
  sentiment = TextBlob(before + " " + after).sentiment.polarity
  sentiment_label = "Positive" if sentiment > 0 else "Negative" if sentiment < 0 else "Neutral"
  print(f"Word: {word}")
  rake1.extract_keywords_from_text(after)
  keywords1 = rake1.get_ranked_phrases()
  rake2 = Rake()
  rake2.extract_keywords_from_text(before)
  keywords2 = rake2.get_ranked_phrases()

  print(f"Match: {word}")

  # Reset for every word. Without this, `percentage` is undefined on the first
  # iteration (and stale on later ones) whenever RAKE returns no phrases.
  percentage = 0
  # "Before" phrases first, then "after" phrases. `percentage` ends up as the
  # number found in the last phrase examined, or 0 if that phrase has none.
  for kw in keywords2 + keywords1:
    match = re.search(r'(\d+)', kw)
    if match:
        percentage = int(match.group(1))
        print("-", percentage)
    else:
        percentage = 0
  print(f"Trend: {trend}")

  def custom_sentiment_analysis(text):
    """Classify ``text`` as "Positive", "Negative" or "Neutral" by keyword counts.

    Keywords are matched as whole words or phrases, case-insensitively. The
    text used to be split on whitespace and compared token by token, so
    multi-word phrases such as 'growth in' and words followed by punctuation
    could never match.

    Args:
      text: Text to classify.

    Returns:
      "Positive" when more positive than negative keywords occur, "Negative"
      in the opposite case, otherwise "Neutral".
    """
    positive_keywords = [
      r'growth in', r'incline in', r'taking over', r'increase in demand for', r'advancements in',
      r'emerging', r'increase', r'rise', r'growth', r'demand', r'higher', r'more', r'expand',
      r'boost', r'opportunity', r'gain', r'uptick', r'surge', r'boom', r'prosper', r'flourish',
      r'thrive', r'soar', r'escalate', r'accelerate', r'expansion', r'diversification', r'innovation'
    ]
    negative_keywords = [
      r'decrease', r'fall', r'decline', r'drop', r'lower', r'less', r'reduce', r'cut', r'struggle',
      r'lack', r'downturn', r'recession', r'contraction', r'shrinkage', r'retrenchment', r'layoff',
      r'redundancy', r'stagnation', r'unemployment rate'
    ]

    def count_hits(phrases, haystack):
      # Longest phrases first, so 'growth in' is counted once rather than as
      # both 'growth in' and 'growth'.
      ordered = sorted(phrases, key=len, reverse=True)
      pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in ordered) + r')\b'
      return len(re.findall(pattern, haystack))

    # Collapse whitespace so phrases also match across line breaks.
    normalized = " ".join(text.lower().split())

    positive_count = count_hits(positive_keywords, normalized)
    negative_count = count_hits(negative_keywords, normalized)

    if positive_count > negative_count:
      return "Positive"
    elif negative_count > positive_count:
      return "Negative"
    else:
      return "Neutral"

  # Rebuild the full window around the word. Note that this rebinds the
  # module-level name `text`.
  text = before + " " + word + " " + after
  # Accumulate all matched words and all context windows across iterations.
  keyword_text = keyword_text + " " + word
  final_text = final_text + " " + text
  # Keyword-based label for the window; this replaces the TextBlob polarity
  # previously stored in `sentiment` during this iteration.
  sentiment = custom_sentiment_analysis(text)
  print(sentiment)

  # One (word, label, number) record per occurrence, consumed later by
  # summarize_sentiment_with_percentage().
  sentiment_keyword.append((word, sentiment, percentage))

  print("-" * 20)

# After the loop: dump the concatenated context windows and matched words.
print(final_text)
print(keyword_text)

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def summarize_text(text, num_sentences=5):
  """Summarize ``text`` with LexRank and return the chosen sentences as strings.

  Args:
    text: Plain text to summarize (parsed with the English tokenizer).
    num_sentences: Number of sentences requested from the summarizer.
  """
  tokenizer = Tokenizer("english")
  document = PlaintextParser.from_string(text, tokenizer).document
  lex_rank = LexRankSummarizer()
  return list(map(str, lex_rank(document, num_sentences)))

# Example usage:
summary = summarize_text(final_text, 3)
print(summary)


def find_context(text, keywords):
  """Return the context around every exact-token occurrence of each keyword.

  The text is split on whitespace. Each hit yields a
  ``(keyword, context_before, context_after)`` tuple holding up to 15 tokens
  on either side, joined with single spaces. Hits are ordered by keyword
  first, then by position in the text.
  """
  tokens = text.split()
  token_count = len(tokens)
  return [
    (
      keyword,
      " ".join(tokens[max(0, position - 15):position]),
      " ".join(tokens[position + 1:min(token_count, position + 16)]),
    )
    for keyword in keywords
    for position, token in enumerate(tokens)
    if token == keyword
  ]

# Example usage:
# text = "This is a sample text to test the function. Harvard is a prestigious university. AI is changing the world. ChatGPT is a powerful language model. Harvard Business Review is a renowned publication."
keywords = [r'growth in', r'incline in', r'taking over', r'dropped', r'increase in demand for', r'advancements in', r'emerging', r'increase', r'rise', r'growth', r'demand', r'higher', r'more', r'expand', r'boost', r'opportunity', r'gain', r'decrease', r'fall', r'decline', r'drop', r'lower', r'less', r'reduce', r'cut', r'struggle', r'lack', r'uptick', r'surge', r'boom', r'prosper', r'flourish', r'thrive', r'soar', r'escalate', r'accelerate', r'expansion', r'diversification', r'innovation', r'disruption', r'downturn', r'recession', r'contraction', r'shrinkage', r'retrenchment', r'layoff', r'redundancy', r'stagnation', r'job market', r'labor force', r'unemployment rate', r'hiring spree', r'skill shortage', r'remote work', r'gig economy', r'artificial intelligence', r'automation', r'expected', r'grow', r'projected', r'fall']

context_list = find_context(str(summary), keywords)

for word, before, after in context_list:
  print(f"Word: {word}")
  print(f"Before: {before}")
  print(f"After: {after}")
  print("-" * 20)


from collections import defaultdict

def summarize_sentiment_with_percentage(sentiment_keyword):
    """Aggregate per-occurrence sentiment records into one entry per word.

    Args:
        sentiment_keyword: Iterable of ``(word, sentiment, percentage)`` tuples,
            where sentiment is "Positive", "Negative" or "Neutral".

    Returns:
        Dict mapping each word to ``{"sentiment_score", "final_percentage"}``.
        The score is (#positive - #negative). The final percentage is the sum
        of the word's percentages carrying the sign of the score; it is 0 when
        the score is 0 or when its magnitude exceeds 100.
    """
    totals = defaultdict(lambda: {"score": 0, "percentage": 0})

    for word, sentiment, percentage in sentiment_keyword:
        entry = totals[word]
        # Neutral (or any other label) leaves the score untouched.
        entry["score"] += 1 if sentiment == "Positive" else -1 if sentiment == "Negative" else 0
        entry["percentage"] += percentage

    summary = {}
    for word, entry in totals.items():
        score = entry["score"]
        if score == 0:
            signed_total = 0
        else:
            signed_total = (entry["percentage"] * score) / abs(score)

        # Values outside +/-100 are treated as implausible and reset.
        if abs(signed_total) > 100:
            signed_total = 0

        summary[word] = {"sentiment_score": score, "final_percentage": signed_total}

    return summary

# Calculate sentiment summary with percentages
sentiment_summary = summarize_sentiment_with_percentage(sentiment_keyword)

# Print results
for word, data in sentiment_summary.items():
    print(f"Word: {word}, Sentiment Score: {data['sentiment_score']}, Final Percentage: {data['final_percentage']:.2f}%")


In [None]:
# GENERATING ROADMAPS FOR A SELECTIVE ROLE 



import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter




df = pd.read_csv('job_role_and_skills_required.csv')

def calculate_skill_frequencies(df):
    """Count how often each skill appears across the 'job_skills' column.

    Each cell is expected to hold skills separated by ', '; the result is a
    ``Counter`` keyed by skill.
    """
    skills_per_row = df['job_skills'].str.split(', ')
    flattened = skills_per_row.explode()  # one row per individual skill
    return Counter(flattened)

def plot_roadmap_for_job_title_sorted(job_title, df):
    """Draw a skill roadmap for one job title as a directed chain.

    The skills of the first row matching ``job_title`` are ordered from most
    to least frequent across the whole frame and connected in that order. The
    first skill is highlighted green ("Start"), the last red ("End").

    Args:
        job_title: Exact value to look up in the 'job_title' column.
        df: Frame with 'job_title' and comma-separated 'job_skills' columns.

    Prints a message and returns without plotting when the title is unknown.
    """
    skill_frequencies = calculate_skill_frequencies(df)
    row = df[df['job_title'] == job_title]
    if row.empty:
        print(f"No data found for job title: {job_title}")
        return

    skills = row.iloc[0]['job_skills'].split(', ')
    skills_sorted = sorted(skills, key=lambda skill: skill_frequencies.get(skill, 0), reverse=True)

    print(f"Frequencies: {skill_frequencies}")
    print(f"Skills sorted: {skills_sorted}")

    G = nx.DiGraph()
    # Register the nodes explicitly: with a single skill there are no edges,
    # and the start/end highlighting below would fail on an empty graph.
    G.add_nodes_from(skills_sorted)
    for i in range(len(skills_sorted) - 1):
        G.add_edge(skills_sorted[i], skills_sorted[i + 1])

    plt.figure(figsize=(20, 8))
    pos = nx.spring_layout(G, seed=42)  # Fixed seed keeps the layout reproducible
    nx.draw(
        G, pos, with_labels=True, node_color="skyblue", edge_color="gray",
        node_size=3000, font_size=10, font_weight="bold", arrowsize=20
    )
    start_node = skills_sorted[0]
    end_node = skills_sorted[-1]
    nx.draw_networkx_nodes(G, pos, nodelist=[start_node], node_color="green", node_size=3500)
    nx.draw_networkx_nodes(G, pos, nodelist=[end_node], node_color="red", node_size=3500)

    # Add labels for start and end points
    plt.text(pos[start_node][0], pos[start_node][1] + 0.1, "Start", fontsize=12, color="green", fontweight="bold")
    plt.text(pos[end_node][0], pos[end_node][1] - 0.1, "End", fontsize=12, color="red", fontweight="bold")

    plt.title(f"Roadmap for {job_title} (Skills Sorted by Frequency)", fontsize=14)
    plt.show()

# Example: plot the roadmap for "Machine Learning Infrastructure Engineer"
plot_roadmap_for_job_title_sorted("Machine Learning Infrastructure Engineer", df)

In [None]:
# YOUTUBE VIDEOS RECOMMENDATION FOR LEARNING ANY PARTICULAR SKILL FROM THE ROADMAP OF SKILLS





from apiclient.discovery import build #pip install google-api-python-client
from apiclient.errors import HttpError #pip install google-api-python-client
import pandas as pd #pip install pandas
import oauth2client.tools as oauthtools
from importlib import reload
# The YouTube Data API key is read from the YOUTUBE_API_KEY environment
# variable so that it is never stored in the notebook. Create the key at
# https://cloud.google.com/console and make sure the YouTube Data API is
# enabled for the project. A missing variable raises KeyError right here.
# Any key that was previously committed in this notebook should be revoked.
import os

DEVELOPER_KEY = os.environ["YOUTUBE_API_KEY"]
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"


# Define the YouTube search function
def youtube_search(words, max_results=50):
    """Search YouTube for ``words`` and append the hits to 'YTvideos.csv'.

    Args:
        words: Search query.
        max_results: Maximum number of search results to request.

    For every video found, the id, title, watch link and the statistics
    returned by the API are written as one CSV row. The file is appended to,
    so repeated calls accumulate rows.
    """
    import os  # local import: only needed to decide whether to write the header

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

    search_response = youtube.search().list(q=words, type="video", part="id,snippet", maxResults=max_results).execute()

    videos = {}
    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            videos[search_result["id"]["videoId"]] = search_result["snippet"]["title"]

    if not videos:
        # Nothing to look up; an empty id list must not be sent to videos().list.
        print(f"No videos found for: {words}")
        return

    # Get video statistics for the collected videos
    videos_list_response = youtube.videos().list(id=','.join(videos.keys()), part='id,statistics').execute()

    res = []
    for item in videos_list_response['items']:
        video_id = item['id']
        temp_res = {
            'v_id': video_id,
            'v_title': videos[video_id],
            'v_link': f"https://www.youtube.com/watch?v={video_id}",
        }
        temp_res.update(item['statistics'])  # Add video statistics
        res.append(temp_res)

    # Fixed column layout: statistics fields can be absent for some videos, and
    # rows appended without a header must line up with the existing file.
    csv_columns = ['v_id', 'v_title', 'v_link', 'viewCount', 'likeCount', 'favoriteCount', 'commentCount']
    df = pd.DataFrame.from_dict(res).reindex(columns=csv_columns)

    # Write the header only for a new or empty file; otherwise every call would
    # insert another header line in the middle of the data.
    out_path = 'YTvideos.csv'
    write_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0
    df.to_csv(out_path, mode='a', encoding='utf-8', index=False, header=write_header)

# Define search terms
terms = ["python full course tutorial"]
for term in terms:
    youtube_search(term)



import pandas as pd

df = pd.read_csv('YTvideos.csv', encoding='utf-8')

df1 = df.drop_duplicates(['v_title'])

# df1.head()

import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Rank the collected videos by a weighted mix of views, likes and comments.
# The CSV is appended to on every search, so the same video can appear several
# times; keep one row per video id before scoring.
df = pd.read_csv('YTvideos.csv').drop_duplicates(subset=['v_id'])

metric_cols = ['viewCount', 'likeCount', 'commentCount']
for col in metric_cols:
    # Non-numeric values (e.g. stray header rows from older appends) become NaN.
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=metric_cols)

# Scale each metric to [0, 1] so the weights below are comparable.
scaler = MinMaxScaler()
df[['viewCount_norm', 'likeCount_norm', 'commentCount_norm']] = scaler.fit_transform(df[metric_cols])

df['composite_score'] = (
    0.5 * df['viewCount_norm'] +  # Weight for viewCount
    0.3 * df['likeCount_norm'] +  # Weight for likeCount
    0.2 * df['commentCount_norm']  # Weight for commentCount
)

top_videos = df.nlargest(4, 'composite_score')

# Number the videos by position in the ranking; the DataFrame index is the
# original row number in the CSV, not the rank.
for rank, (_, video) in enumerate(top_videos.iterrows(), start=1):
    print(f"Rank: {rank}")
    print(f"Title: {video['v_title']}")
    print(f"Link: {video['v_link']}")
    print(f"Composite Score: {video['composite_score']:.4f}\n")

In [None]:
# COURSERA COURSES RECOMMENDATION ON LEARNING ANY SKILL RECOMMENDED FROM ROADMAP



from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import time
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Initialize Chrome options
import os
from selenium.common.exceptions import TimeoutException

options = Options()

# Optional explicit Chrome location. It is applied only when the file exists,
# so the notebook also runs on machines where Chrome lives elsewhere and
# Selenium can locate it on its own.
chrome_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
if os.path.exists(chrome_path):
    options.binary_location = chrome_path

# Initialize Chrome WebDriver properly
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open Coursera
driver.get("https://www.coursera.org/")
wait = WebDriverWait(driver, 15)

# ENTER THE SKILL TO SEARCH FOR
# Wait for each element instead of assuming the page has finished rendering.
search_button = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="rendered-content"]/div/header/div/div/div/div[1]/div/div/div/div/div[2]/span[1]/button')
))
search_button.click()

input_search = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="search-autocomplete-input"]')))

input_search.send_keys('Web Development')

input_search.send_keys(Keys.ENTER)

# The results load asynchronously after ENTER; reading page_source right away
# would parse the previous page and find no cards.
try:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.cds-ProductCard-base')))
except TimeoutException:
    print("Timed out waiting for Coursera search results; the page may have changed.")

soup = BeautifulSoup(driver.page_source, 'html.parser')

posting = soup.find_all('div', class_ = 'cds-ProductCard-base cds-ProductCard-list css-2kdvnl')

len(posting)

# CODE FOR 1 SINGLE PAGE 



import pandas as pd
from bs4 import BeautifulSoup

df = pd.DataFrame(columns=['Free_Paid', 'Link', 'Name', 'From', 'SkillsGain', 'rating', 'reviews', 'ExpType_Time'])
data_list = []  # Rows are collected here and converted to a DataFrame once


def _node_text(node, strip=False):
    """Return the text of a BeautifulSoup node (optionally stripped), or None if the node is missing."""
    if node is None:
        return None
    return node.text.strip() if strip else node.text


def _split_commas(value):
    """Split comma-separated text into stripped parts; missing or empty text gives an empty list."""
    return [part.strip() for part in value.split(',')] if value else []


for post in posting:
    # Status tag of the card; cards without a status tag are treated as paid.
    status_div = post.find('div', class_='cds-ProductCard-statusTags cds-ProductCard-statusTagsSimple')
    if status_div:
        free_paid = _node_text(status_div.find('span'), strip=True)
    else:
        free_paid = 'Paid'

    # Course link; the href is appended to the Coursera host.
    link_tag = post.find('a', class_='cds-CommonCard-titleLink')
    link = link_tag.get('href') if link_tag else None
    full_link = f"https://www.coursera.org{link}" if link else None

    # Course name and offering partner(s)
    name = _node_text(post.find('h3', class_='cds-CommonCard-title css-6ecy9b'))
    fromC = _node_text(post.find('p', class_='cds-ProductCard-partnerNames css-vac8rf'))

    # Card body text, split on commas
    skills_list = _split_commas(_node_text(post.find('div', class_='cds-ProductCard-body'), strip=True))

    # Rating block: the "·" character separates the rating from the review count
    rating_element = post.find('div', class_='cds-CommonCard-ratings')
    if rating_element:
        parts = rating_element.get_text(separator=' ', strip=True).split('·')
        rating = parts[0].strip()
        reviews = parts[1].strip() if len(parts) > 1 else None
    else:
        rating, reviews = None, None

    # Metadata line, split on commas. Each part is stripped individually;
    # previously the whole string was repeated once per part.
    expTime_list = _split_commas(_node_text(post.find('div', class_='cds-CommonCard-metadata'), strip=True))

    # Append data to the list
    data_list.append({
        'Free_Paid': free_paid,
        'Link': full_link,
        'Name': name,
        'From': fromC,
        'SkillsGain': skills_list,
        'rating': rating,
        'reviews': reviews,
        'ExpType_Time': expTime_list
    })

# Convert to DataFrame
df = pd.concat([df, pd.DataFrame(data_list)], ignore_index=True)

# Save to CSV
df.to_csv("CourseraSD.csv", index=False, encoding="utf-8")

print("Data extraction completed and saved to CSV.")



In [None]:
# SCRAPING NAUKRI.COM FOR JOB POSTINGS THAT CAN BE USEFUL FOR USERS 



from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import time
import re


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Initialize Chrome options
import os
from selenium.common.exceptions import TimeoutException

options = Options()

# Optional explicit Chrome location. It is applied only when the file exists,
# so the notebook also runs on machines where Chrome lives elsewhere and
# Selenium can locate it on its own.
chrome_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
if os.path.exists(chrome_path):
    options.binary_location = chrome_path

# Initialize Chrome WebDriver properly
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open Naukri.com
driver.get("https://www.naukri.com/")
wait = WebDriverWait(driver, 15)

# ENTER THE ROLE TO SEARCH FOR
# Wait for each element instead of assuming the page has finished rendering.
input_search = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="root"]/div[7]/div/div/div[1]/div/div/div[1]/div[1]/div/input')
))

input_search.send_keys('App Development')

search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="root"]/div[7]/div/div/div[6]')))
search_button.click()

# The result list loads after the click; reading page_source right away would
# parse the landing page and find no job cards.
try:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.srp-jobtuple-wrapper')))
except TimeoutException:
    print("Timed out waiting for Naukri search results; the page may have changed.")

soup = BeautifulSoup(driver.page_source, 'html.parser')

posting = soup.find_all('div', class_ = 'srp-jobtuple-wrapper')

len(posting)


# # CODE FOR 5 PAGES 




import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# # Initialize WebDriver
# driver = webdriver.Chrome()  # Ensure you have the correct driver installed

df = pd.DataFrame(columns=['Link', 'Name', 'CompName', 'loc', 'sal', 'exp', 'Skills'])
data_list = []  # Rows are collected here and converted to a DataFrame once

# Loop through up to 5 result pages, starting from the page the driver is on
for page in range(5):
    time.sleep(2)  # Let the page load

    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    posting = soup.find_all('div', class_ = 'srp-jobtuple-wrapper')

    for post in posting:
        # Extract Job Title and Link (first anchor inside the 'row1' div)
        row1_div = post.find('div', class_='row1')
        title_tag = row1_div.find('a') if row1_div else None
        title = title_tag.text.strip() if title_tag else None
        href = title_tag['href'] if title_tag and 'href' in title_tag.attrs else None

        # Extract Company Name (first anchor inside the 'row2' div)
        row2_div = post.find('div', class_='row2')
        company_tag = row2_div.find('a') if row2_div else None
        Compname = company_tag.text.strip() if company_tag else None

        # Extract Experience, Salary, Location from the span titles in 'row3'
        row3_div = post.find('div', class_='row3')
        experience, salary, location = None, None, None

        if row3_div:
            for span in row3_div.find_all('span'):
                title_attr = span.get('title', '').strip()
                if not title_attr:
                    # Spans without a title carry no value; previously they
                    # overwrote the location with an empty string.
                    continue

                if 'Yrs' in title_attr:
                    experience = title_attr
                elif 'disclosed' in title_attr or 'PA' in title_attr:
                    salary = title_attr
                else:
                    location = title_attr

        # Extract Skills (list items inside the 'row5' div)
        row5_div = post.find('div', class_='row5')
        skills = [li.text.strip() for li in row5_div.find_all('li')] if row5_div else []

        # Append data to the list
        data_list.append({
            'Link': href,
            'Name': title,
            'CompName': Compname,
            'loc': location,
            'sal': salary,
            'exp': experience,
            'Skills': skills
        })

    # Try to click the "Next" button to go to the next page
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, '#lastCompMark > a:nth-child(4)')
        next_button.click()
        time.sleep(2)  # Allow time for the next page to load
    except Exception as exc:
        # Best effort: keep what has been collected so far. `Exception` (not a
        # bare except) lets KeyboardInterrupt still stop the cell.
        print("No Next button found or reached last page.")
        print(f"  ({type(exc).__name__})")
        break  # Stop if no next button

# Convert to DataFrame
df = pd.concat([df, pd.DataFrame(data_list)], ignore_index=True)

# Save to CSV
df.to_csv("naukri_jobsSDENew.csv", index=False, encoding="utf-8")

print("Data extraction completed and saved to CSV.")

# # Close the driver
# driver.quit()



In [None]:
# GENERATING PERCENT MATCHING THAT WILL BE USEFUL FOR USER TO APPLY FOR JOBS WHERE HE/SHE HAS MORE CHANCES TO BE SELECTED IN



import pandas as pd
import PyPDF2
import re
import ast  # To parse list-like structures

# Function to extract text from PDF resume
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The per-page text, each page followed by a single space, with
        leading and trailing whitespace removed. Pages for which
        ``extract_text()`` returns nothing are skipped.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Pages are read while the file handle is still open.
        page_texts = (pdf_page.extract_text() for pdf_page in pdf_pages)
        chunks = [page_text + " " for page_text in page_texts if page_text]
    return "".join(chunks).strip()

# Function to extract skills from resume text
def extract_skills(text):
    """Return the known skill keywords that appear in ``text``.

    Matching is case-insensitive and phrase-aware: multi-word skills such as
    "machine learning" or "power bi" are matched as whole phrases, which a
    lookup against single-word tokens can never do. A keyword only counts
    when it is not embedded in a longer alphanumeric word, so "java" is not
    reported for "javascript" and "sql" is not reported for "mysql".

    Args:
        text: Resume text to scan. ``None`` or an empty string yields an
            empty set.

    Returns:
        set[str]: The subset of the built-in skill keywords found in ``text``.
    """
    skill_keywords = {
        "python", "java", "sql", "machine learning", "deep learning",
        "pytorch", "javascript", "node.js", "react.js", "hadoop",
        "spark", "tableau", "power bi", "tensorflow", "nlp", "aws",
        "pandas", "numpy", "matplotlib", "seaborn", "data science", "mongodb"
    }
    if not text:
        return set()

    lowered = text.lower()
    found = set()
    for skill in skill_keywords:
        # Escape each word (keywords such as "node.js" contain regex
        # metacharacters) and accept any whitespace run between the words of
        # a phrase, so a line break inside "machine learning" still matches.
        phrase = r"\s+".join(re.escape(word) for word in skill.split())
        # The lookarounds reject hits inside a longer alphanumeric word.
        if re.search(r"(?<![a-z0-9])" + phrase + r"(?![a-z0-9])", lowered):
            found.add(skill)
    return found

# Read the resume PDF and extract the known skills it mentions.
# NOTE(review): the path is absolute from the filesystem root — confirm the
# file actually lives there in the runtime environment.
resume_text = extract_text_from_pdf("/software-engineer-resume-example.pdf")
# `resume_skills` is a module-level set that calculate_match() reads directly.
resume_skills = extract_skills(resume_text)

print("Extracted Resume Skills:", resume_skills)  # Debugging output

# Load the job postings; the 'Skills', 'CompName' and 'Name' columns are used
# further down.
# NOTE(review): this file name differs from the "naukri_jobsSDENew.csv" written
# by the scraping code — presumably a different scrape run; verify.
job_df = pd.read_csv("/naukri_jobs_app dev.csv")

# Function to clean and extract job skills
def extract_job_skills(skill_str):
    """Parse one CSV 'Skills' cell into a set of normalized skill names.

    The cell is expected to hold the string form of a Python literal, for
    example ``"['Python', 'SQL']"``. Lists, tuples, sets and frozensets are
    treated as collections of skills; any other literal (a quoted string, a
    number) is treated as a single skill. Every entry is converted to ``str``,
    stripped of surrounding whitespace and lower-cased; entries that are blank
    after stripping are dropped so they do not count as a skill.

    Args:
        skill_str: Raw cell value. Anything that is not a non-blank string
            (for example a NaN float from an empty CSV cell) yields an empty
            set.

    Returns:
        set[str]: Normalized skill names, or an empty set when the cell is
        blank, evaluates to ``None``, or cannot be parsed as a literal.
    """
    if not isinstance(skill_str, str) or not skill_str.strip():
        return set()

    try:
        skill_data = ast.literal_eval(skill_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input; treat an
        # unparsable cell as "no skills" instead of aborting the whole column.
        return set()

    if skill_data is None:
        return set()
    if not isinstance(skill_data, (list, tuple, set, frozenset)):
        # A lone literal (quoted string, number, ...) is a single skill.
        skill_data = [skill_data]

    # str() guards against non-string entries, which would otherwise make
    # str.strip raise TypeError.
    cleaned = (str(item).strip().lower() for item in skill_data)
    return {skill for skill in cleaned if skill}

# Replace each raw 'Skills' cell with the set returned by extract_job_skills.
# The column is overwritten in place, so running this line again on the
# already-converted frame passes sets (not strings) to extract_job_skills,
# which returns an empty set for any non-string input.
job_df["Skills"] = job_df["Skills"].apply(extract_job_skills)

# Debugging: show the first few parsed skill sets.
print("First few job skills from CSV:\n", job_df["Skills"].head())

# Function to calculate match percentage
def calculate_match(job_skills):
    """Return the percentage of a posting's skills that the resume covers.

    Reads the module-level ``resume_skills`` set.

    Args:
        job_skills: Collection of skill names required by one job posting.

    Returns:
        ``0`` when ``job_skills`` is empty or falsy; otherwise the share of
        ``job_skills`` also present in ``resume_skills``, scaled to 0-100.
    """
    if job_skills:
        shared = resume_skills.intersection(job_skills)
        return (len(shared) / len(job_skills)) * 100
    return 0

# Score every posting: calculate_match receives the posting's skill set and
# returns the percentage of those skills found in `resume_skills`.
job_df["Match Percentage"] = job_df["Skills"].apply(calculate_match)

# Print one line per posting: company name, posting name, and the match
# percentage formatted to two decimals.
for _, row in job_df.iterrows():
    print(f"{row['CompName']} ---- {row['Name']}: {row['Match Percentage']:.2f}% match")
import pandas as pd
import PyPDF2
import re
import ast  # To parse list-like structures

# Function to extract text from PDF resume
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The per-page text, each page followed by a single space, with
        leading and trailing whitespace removed. Pages for which
        ``extract_text()`` returns nothing are skipped.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Pages are read while the file handle is still open.
        page_texts = (pdf_page.extract_text() for pdf_page in pdf_pages)
        chunks = [page_text + " " for page_text in page_texts if page_text]
    return "".join(chunks).strip()

# Function to extract skills from resume text
def extract_skills(text):
    """Return the known skill keywords that appear in ``text``.

    Matching is case-insensitive and phrase-aware: multi-word skills such as
    "machine learning" or "power bi" are matched as whole phrases, which a
    lookup against single-word tokens can never do. A keyword only counts
    when it is not embedded in a longer alphanumeric word, so "java" is not
    reported for "javascript" and "sql" is not reported for "mysql".

    Args:
        text: Resume text to scan. ``None`` or an empty string yields an
            empty set.

    Returns:
        set[str]: The subset of the built-in skill keywords found in ``text``.
    """
    skill_keywords = {
        "python", "java", "sql", "machine learning", "deep learning",
        "pytorch", "javascript", "node.js", "react.js", "hadoop",
        "spark", "tableau", "power bi", "tensorflow", "nlp", "aws",
        "pandas", "numpy", "matplotlib", "seaborn", "data science", "mongodb"
    }
    if not text:
        return set()

    lowered = text.lower()
    found = set()
    for skill in skill_keywords:
        # Escape each word (keywords such as "node.js" contain regex
        # metacharacters) and accept any whitespace run between the words of
        # a phrase, so a line break inside "machine learning" still matches.
        phrase = r"\s+".join(re.escape(word) for word in skill.split())
        # The lookarounds reject hits inside a longer alphanumeric word.
        if re.search(r"(?<![a-z0-9])" + phrase + r"(?![a-z0-9])", lowered):
            found.add(skill)
    return found

# Read the resume PDF and extract the known skills it mentions.
# NOTE(review): these statements repeat, verbatim, ones that appear earlier in
# this same cell, so the PDF and CSV are read a second time — the repeated
# copy looks like an accidental paste and could be removed.
# NOTE(review): the path is absolute from the filesystem root — confirm the
# file actually lives there in the runtime environment.
resume_text = extract_text_from_pdf("/software-engineer-resume-example.pdf")
# `resume_skills` is a module-level set that calculate_match() reads directly.
resume_skills = extract_skills(resume_text)

print("Extracted Resume Skills:", resume_skills)  # Debugging output

# Load the job postings; the 'Skills', 'CompName' and 'Name' columns are used
# further down.
job_df = pd.read_csv("/naukri_jobs_app dev.csv")

# Function to clean and extract job skills
def extract_job_skills(skill_str):
    """Parse one CSV 'Skills' cell into a set of normalized skill names.

    The cell is expected to hold the string form of a Python literal, for
    example ``"['Python', 'SQL']"``. Lists, tuples, sets and frozensets are
    treated as collections of skills; any other literal (a quoted string, a
    number) is treated as a single skill. Every entry is converted to ``str``,
    stripped of surrounding whitespace and lower-cased; entries that are blank
    after stripping are dropped so they do not count as a skill.

    Args:
        skill_str: Raw cell value. Anything that is not a non-blank string
            (for example a NaN float from an empty CSV cell) yields an empty
            set.

    Returns:
        set[str]: Normalized skill names, or an empty set when the cell is
        blank, evaluates to ``None``, or cannot be parsed as a literal.
    """
    if not isinstance(skill_str, str) or not skill_str.strip():
        return set()

    try:
        skill_data = ast.literal_eval(skill_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input; treat an
        # unparsable cell as "no skills" instead of aborting the whole column.
        return set()

    if skill_data is None:
        return set()
    if not isinstance(skill_data, (list, tuple, set, frozenset)):
        # A lone literal (quoted string, number, ...) is a single skill.
        skill_data = [skill_data]

    # str() guards against non-string entries, which would otherwise make
    # str.strip raise TypeError.
    cleaned = (str(item).strip().lower() for item in skill_data)
    return {skill for skill in cleaned if skill}

# Replace each raw 'Skills' cell with the set returned by extract_job_skills.
# NOTE(review): this repeats a statement from earlier in the same cell; it
# operates on the `job_df` that was re-read from CSV just above, so the cells
# are still strings at this point.
job_df["Skills"] = job_df["Skills"].apply(extract_job_skills)

# Debugging: show the first few parsed skill sets.
print("First few job skills from CSV:\n", job_df["Skills"].head())

# Function to calculate match percentage
def calculate_match(job_skills):
    """Return the percentage of a posting's skills that the resume covers.

    Reads the module-level ``resume_skills`` set.

    Args:
        job_skills: Collection of skill names required by one job posting.

    Returns:
        ``0`` when ``job_skills`` is empty or falsy; otherwise the share of
        ``job_skills`` also present in ``resume_skills``, scaled to 0-100.
    """
    if job_skills:
        shared = resume_skills.intersection(job_skills)
        return (len(shared) / len(job_skills)) * 100
    return 0

# Score every posting: calculate_match receives the posting's skill set and
# returns the percentage of those skills found in `resume_skills`.
# NOTE(review): the same scoring and report already ran earlier in this cell,
# so every result line is printed twice.
job_df["Match Percentage"] = job_df["Skills"].apply(calculate_match)

# Print one line per posting: company name, posting name, and the match
# percentage formatted to two decimals.
for _, row in job_df.iterrows():
    print(f"{row['CompName']} ---- {row['Name']}: {row['Match Percentage']:.2f}% match")
