## HW2: Running the crawler and saving the results to CSV

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random

# Names of programming languages, frameworks, data/ML libraries and
# infrastructure tools.
# NOTE(review): this list is not referenced anywhere else in the visible
# code (the search term is the separate `word` variable) — presumably these
# are candidate search keywords; confirm before relying on it.
keywords = [
    'Python', 'JavaScript', 'Java', 'C#', 'C++', 'Ruby', 'Go', 'Swift', 'TypeScript', 
    'PHP', 'Kotlin', 'Rust', 'R', 'Scala', 'HTML5', 'CSS3', 'React', 'Angular', 'Vue.js', 
    'Node.js', 'Django', 'Flask', 'Ruby on Rails', 'ASP.NET', 'Spring Boot', 'Next.js', 
    'TensorFlow', 'PyTorch', 'Scikit-learn', 'Keras', 'Pandas', 'NumPy', 'Matplotlib', 
    'Seaborn', 'Jupyter Notebook', 'Apache Spark', 'Hadoop', 'Apache Kafka', 'SQL', 
    'NoSQL', 'MongoDB', 'Cassandra', 'Elasticsearch', 'Tableau', 'Power BI', 'Docker', 
    'Kubernetes', 'Jenkins'
]

# Pool of User-Agent strings; one is picked with random.choice() when the
# headers for the job-search request are built.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

# Define a list of other header components to simulate different users
# Candidate values for the 'Accept' request header.
accept_headers = [
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
]

# Candidate values for the 'Accept-Language' request header.
languages = [
    'en-US,en;q=0.5',
    'en-GB,en;q=0.5',
    'en-CA,en;q=0.5'
]

# Candidate values for the 'Referer' request header.
referers = [
    'https://www.google.com/',
    'https://www.bing.com/',
    'https://www.yahoo.com/'
]

def extract_requirements(link, count=0):
    """Fetch a job posting and return its experience-related requirement bullets.

    The page at ``link`` is downloaded and every ``<li>`` found in the
    ``<ul>`` lists of the ``description__text`` block is collected. Only the
    bullets whose text contains "years", "experience" or "Experience" are
    kept.

    Args:
        link: URL of the job posting page.
        count: Number of attempts already used. At most 10 attempts are made
            in total, so a value of 10 or more performs no request at all.

    Returns:
        A list of the matching bullet texts. The list is empty when no bullet
        matched or when every attempt failed; a list is always returned so
        callers can safely use ``len()`` and list concatenation on the result.
    """
    max_attempts = 10
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    experience_markers = ("years", "experience", "Experience")

    for _ in range(count, max_attempts):
        try:
            # A timeout keeps one unresponsive page from hanging the crawl.
            data = requests.get(link, headers=headers, timeout=10)
            soup_job_details = BeautifulSoup(data.text, 'html.parser')
            # Any missing element in this chain makes find() return None and
            # the next attribute access raise AttributeError.
            description = soup_job_details.find('div', class_='description__text').find('section').find('div').find_all('ul')
        except (requests.RequestException, AttributeError):
            # Network error or a page without the expected layout: retry.
            continue

        # Flatten the bullets of every <ul> into plain text.
        requirements = [li.text for ul in description for li in ul.find_all('li')]
        return [
            text for text in requirements
            if any(marker in text for marker in experience_markers)
        ]

    # Every attempt failed (or none were left).
    return []

# Parallel result lists: links[k] is the posting URL whose filtered
# requirement bullets are stored in experience[k].
links = []
experience = []
i = 0  # index of the next DataFrame row to write
num_results = 60  # stop after this many postings have been stored
max_requirement_columns = 10  # number of 'Job Requirement N' columns in the CSV

word = "tensorflow"

example_search_link = f'https://www.linkedin.com/jobs/search/?currentJobId=3957224827&distance=25&geoId=101620260&keywords={word}&origin=JOBS_HOME_KEYWORD_HISTORY&refresh=true&position=3&pageNum=0'
# Randomize the browser-like headers for the search request.
headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': random.choice(accept_headers),
    'Accept-Language': random.choice(languages),
    'Referer': random.choice(referers),
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Encoding': 'gzip, deflate, br'
}
# A timeout keeps an unresponsive server from hanging the notebook cell.
data = requests.get(example_search_link, headers=headers, timeout=10)
soup = BeautifulSoup(data.text, 'html.parser')

columns = ['Job Link'] + [f'Job Requirement {n}' for n in range(1, max_requirement_columns + 1)]
df = pd.DataFrame(columns=columns)

results_list = soup.find('ul', class_='jobs-search__results-list')
if results_list is None:
    # The response did not contain the expected results list; continue with
    # no postings instead of failing on a None attribute access.
    print("No job results list found in the search response.")
    job_items = []
else:
    job_items = results_list.find_all('li')

for item in job_items:
    anchor = item.find('a')
    if anchor is None or not anchor.get('href'):
        # Result entry without a usable link: nothing to crawl.
        continue
    job_link = anchor['href']
    exp_i = extract_requirements(job_link, 0)
    # Skip postings without matches; the isinstance check also guards
    # against a non-list failure sentinel being returned.
    if not isinstance(exp_i, list) or not exp_i:
        continue

    links.append(job_link)
    experience.append(exp_i)

    # Create a single row with the job link and requirements. The row must
    # have exactly len(columns) entries, so extra bullets are left out of
    # the CSV (they are still kept in `experience`) and short rows are padded.
    shown = exp_i[:max_requirement_columns]
    row = [job_link] + shown + [None] * (max_requirement_columns - len(shown))

    # Append the data to the DataFrame
    df.loc[i] = row
    if i >= num_results - 1:
        break
    i += 1

df.to_csv('job_requirements.csv', index=False)

## Finding most common words and creating inverted index

In [None]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
 
# Fetch the NLTK stop-word corpus (if needed) and load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')
# Build a set once so each membership test in the loop below is O(1)
# instead of a scan over the whole stop-word collection.
stop_set = set(stop)

# Every non-stop-word token from every stored requirement bullet.
all_words = []

for requirements in experience:
    for req in requirements:
        words = re.findall(r'\b\w+\b', req.lower())
        all_words.extend(token for token in words if token not in stop_set)

word_counter = Counter(all_words)

# (word, count) pairs, most frequent first.
most_common_words = word_counter.most_common(15)
print("15 most common words:", most_common_words)

In [None]:
# Give each of the most common words an empty posting list
inverted_index = {}
for term, _ in most_common_words:
    inverted_index[term] = []

# Fill the posting lists: a job's position in `experience` is recorded once
# for every indexed word that occurs in any of its requirement bullets
for doc_id, requirements in enumerate(experience):
    seen_terms = set()
    for req in requirements:
        seen_terms.update(re.findall(r'\b\w+\b', req.lower()))
    for term, postings in inverted_index.items():
        if term in seen_terms:
            postings.append(doc_id)

# Show each indexed word with the job positions that mention it
for term, postings in inverted_index.items():
    print(f"{term}: {postings}")


## Computing TF-IDF

In [None]:
import pandas as pd
from collections import defaultdict, Counter
import math
import re

# `experience` holds one list of requirement strings per job and `links`
# holds the matching job links.

# Turn each job into a single text document by joining its requirements
documents = []
for requirements in experience:
    documents.append(' '.join(requirements))

# Step 1: Collect the vocabulary - every distinct lowercase token that
# occurs in any document, in sorted order
vocabulary = {
    token
    for document in documents
    for token in re.findall(r'\b\w+\b', document.lower())
}
all_words = sorted(vocabulary)

# Step 2: Calculate Term Frequency (TF)
def compute_tf(document, all_words):
    """Return the term frequencies of one document.

    Args:
        document: Text to analyse; it is lowercased and split into
            word tokens.
        all_words: Vocabulary whose entries all appear as keys in the result.

    Returns:
        A dict mapping each word to (occurrences / total tokens) for words in
        the document and to 0 for vocabulary words that do not occur. Tokens
        outside the vocabulary are added after the vocabulary keys.
    """
    tokens = re.findall(r'\b\w+\b', document.lower())
    n_tokens = len(tokens)
    # Start from the full vocabulary so every document has the same keys.
    frequencies = dict.fromkeys(all_words, 0)
    for token, occurrences in Counter(tokens).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

# One term-frequency dict per document, in the same order as `documents`
tf_list = [compute_tf(text, all_words) for text in documents]

# Step 3: Calculate Inverse Document Frequency (IDF)
def compute_idf(documents, all_words):
    """Return the inverse document frequency of every vocabulary word.

    Args:
        documents: Texts to analyse; each is lowercased and split into
            word tokens.
        all_words: Vocabulary to compute IDF values for.

    Returns:
        A dict mapping each vocabulary word to log(N / df), where N is the
        number of documents and df the number of documents containing the
        word. Words that occur in no document map to 0.
    """
    n_documents = len(documents)
    # Count each word at most once per document.
    doc_freq = Counter()
    for document in documents:
        doc_freq.update(set(re.findall(r'\b\w+\b', document.lower())))

    idf_values = {}
    for word in all_words:
        containing = doc_freq[word]
        if containing != 0:
            idf_values[word] = math.log(n_documents / containing)
        else:
            idf_values[word] = 0
    return idf_values

idf_dict = compute_idf(documents, all_words)

# Build the TF table (one row per job link) and the IDF table (one row per term)
tf_df = pd.DataFrame(tf_list, index=links)
tf_df = tf_df.fillna(0)
idf_rows = list(idf_dict.items())
idf_df = pd.DataFrame(idf_rows, columns=['Term', 'IDF'])
idf_df = idf_df.set_index('Term')

# Show both tables
print(
    "Term Frequency (TF):",
    tf_df,
    "\nInverse Document Frequency (IDF):",
    idf_df,
    sep="\n",
)

# Step 4: Calculate TF-IDF
def compute_tfidf(tf, idf):
    """Combine one document's term frequencies with the IDF table.

    Args:
        tf: Dict mapping each term to its term frequency in one document.
        idf: Dict mapping terms to their inverse document frequency.

    Returns:
        A dict with the same keys (and order) as ``tf`` whose values are
        ``tf[term] * idf[term]``. A term without an IDF entry is weighted
        with 0 instead of raising KeyError, because a TF dict may contain
        tokens that are not part of the IDF vocabulary.
    """
    return {word: tf_val * idf.get(word, 0) for word, tf_val in tf.items()}

# One TF-IDF dict per document, in the same order as `tf_list`
tfidf_list = [compute_tfidf(doc_tf, idf_dict) for doc_tf in tf_list]

# Build and show the TF-IDF table (one row per job link)
tfidf_df = pd.DataFrame(tfidf_list, index=links)
tfidf_df = tfidf_df.fillna(0)
print("\nTF-IDF:", tfidf_df, sep="\n")


# Write the three tables to CSV files
for frame, filename in (
    (tf_df, 'TermFrequency.csv'),
    (idf_df, 'InverseDocumentFrequency.csv'),
    (tfidf_df, 'TF-IDF.csv'),
):
    frame.to_csv(filename)
