**Première partie : Collecte de données avec scraping**

In [None]:
import requests  # Pour faire des requêtes HTTP
from bs4 import BeautifulSoup  # Pour parser le contenu HTML
import csv  # Pour écrire dans un fichier CSV
import codecs  # Pour gérer l'encodage des fichiers

In [None]:
#2
# Créer un fichier CSV et définir l'en-tête
with codecs.open('job_offers.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Job Title", "Company", "Job Type", "Job Description", "Publication Date", "Salary"])


In [None]:
#3
url = 'https://www.simplyhired.com/search?q=data+science'
response = requests.get(url)

In [None]:
#4
soup = BeautifulSoup(response.content, 'html.parser')
# Check if the request was successful
if response.status_code == 200:
    # Check if pagination element exists
    pagination_element = soup.select_one('.pagination')
    if pagination_element:
        num_pages = int(pagination_element.text.strip().split()[-1])
    else:
        num_pages = 1  # Assume only one page if pagination not found
        print("Pagination not found. Assuming only one page.")

    # Boucle pour parcourir toutes les pages
    for page in range(1, num_pages + 1):
        page_url = f'https://www.simplyhired.com/search?q=data+science&pn={page}'
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Suite du code pour extraire les données


In [None]:
#5
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
#7
job_cards = soup.findAll('div', class_='SerpJob-jobCard')


In [None]:
#8
for job in job_cards:
    job_title = job.find('h2', class_='jobposting-title').text.strip() if job.find('h2', class_='jobposting-title') else 'N/A'
    company = job.find('span', class_='jobposting-company').text.strip() if job.find('span', class_='jobposting-company') else 'N/A'
    job_type = job.find('span', class_='jobposting-jobtype').text.strip() if job.find('span', class_='jobposting-jobtype') else 'N/A'
    job_description = job.find('p', class_='jobposting-snippet').text.strip() if job.find('p', class_='jobposting-snippet') else 'N/A'
    publication_date = job.find('span', class_='jobposting-postdate').text.strip() if job.find('span', class_='jobposting-postdate') else 'N/A'
    salary = job.find('span', class_='jobposting-salary').text.strip() if job.find('span', class_='jobposting-salary') else 'N/A'



#9
    # Sauvegarde dans le CSV
    with codecs.open('job_offers.csv', 'a', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([job_title, company, job_type, job_description, publication_date, salary])


**Deuxième partie : Créer un chatbot avec python**

In [8]:
# Importer les bibliothèques nécessaires
import nltk
import numpy as np
import random
import string  # pour traiter les chaînes de texte standard en Python

# Question 1 : Télécharger le dataset chabot.txt
with open('/content/Chatbot.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

#Question 2 : Transformer le texte en minuscules
raw_text = raw_text.lower()
# Afficher un extrait du texte transformé
print(raw_text[:1000])


a: since the industrial revolution, the global annual temperature has increased in total by a little more than 1 degree celsius, or about 2 degrees fahrenheit. between 1880—the year that accurate recordkeeping began—and 1980, it rose on average by 0.07 degrees celsius (0.13 degrees fahrenheit) every 10 years. since 1981, however, the rate of increase has more than doubled: for the last 40 years, we’ve seen the global annual temperature rise by 0.18 degrees celsius, or 0.32 degrees fahrenheit, per decade.

the result? a planet that has never been hotter. nine of the 10 warmest years since 1880 have occurred since 2005—and the 5 warmest years on record have all occurred since 2015. climate change deniers have argued that there has been a “pause” or a “slowdown” in rising global temperatures, but numerous studies, including a 2018 paper published in the journal environmental research letters, have disproved this claim. the impacts of global warming are already harming people around the wo

In [7]:
#3
# Télécharger le module punkt pour la tokenization
nltk.download('punkt')
# Télécharger le module wordnet pour le vocabulaire anglais
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#4 :Tokenisation
from nltk.tokenize import sent_tokenize, word_tokenize

# Générer les tokens sous forme de phrases
sentence_tokens = sent_tokenize(raw_text)

# Générer les tokens sous forme de mots
word_tokens = word_tokenize(raw_text)

# Afficher quelques exemples de tokens
print("Exemples de tokens sous forme de phrases :")
print(sentence_tokens[:5])  # Affiche les 5 premières phrases

print("\nExemples de tokens sous forme de mots :")
print(word_tokens[:20])  # Affiche les 20 premiers mots


Exemples de tokens sous forme de phrases :
['a: since the industrial revolution, the global annual temperature has increased in total by a little more than 1 degree celsius, or about 2 degrees fahrenheit.', 'between 1880—the year that accurate recordkeeping began—and 1980, it rose on average by 0.07 degrees celsius (0.13 degrees fahrenheit) every 10 years.', 'since 1981, however, the rate of increase has more than doubled: for the last 40 years, we’ve seen the global annual temperature rise by 0.18 degrees celsius, or 0.32 degrees fahrenheit, per decade.', 'the result?', 'a planet that has never been hotter.']

Exemples de tokens sous forme de mots :
['a', ':', 'since', 'the', 'industrial', 'revolution', ',', 'the', 'global', 'annual', 'temperature', 'has', 'increased', 'in', 'total', 'by', 'a', 'little', 'more', 'than']


In [11]:
#5 :Lemmatisation & Suppression de la ponctuation
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
  return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
  return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
LemNormalize(raw_text)

['a',
 'since',
 'the',
 'industrial',
 'revolution',
 'the',
 'global',
 'annual',
 'temperature',
 'ha',
 'increased',
 'in',
 'total',
 'by',
 'a',
 'little',
 'more',
 'than',
 '1',
 'degree',
 'celsius',
 'or',
 'about',
 '2',
 'degree',
 'fahrenheit',
 'between',
 '1880—the',
 'year',
 'that',
 'accurate',
 'recordkeeping',
 'began—and',
 '1980',
 'it',
 'rose',
 'on',
 'average',
 'by',
 '007',
 'degree',
 'celsius',
 '013',
 'degree',
 'fahrenheit',
 'every',
 '10',
 'year',
 'since',
 '1981',
 'however',
 'the',
 'rate',
 'of',
 'increase',
 'ha',
 'more',
 'than',
 'doubled',
 'for',
 'the',
 'last',
 '40',
 'year',
 'we',
 '’',
 've',
 'seen',
 'the',
 'global',
 'annual',
 'temperature',
 'rise',
 'by',
 '018',
 'degree',
 'celsius',
 'or',
 '032',
 'degree',
 'fahrenheit',
 'per',
 'decade',
 'the',
 'result',
 'a',
 'planet',
 'that',
 'ha',
 'never',
 'been',
 'hotter',
 'nine',
 'of',
 'the',
 '10',
 'warmest',
 'year',
 'since',
 '1880',
 'have',
 'occurred',
 'since',

In [16]:
#6
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence):
  for word in sentence.split():
   if word.lower() in GREETING_INPUTS:
     return random.choice(GREETING_RESPONSES)

greeting('hi')

'hey'

In [17]:
#7
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [24]:
# Fonction pour trouver la réponse la plus similaire
def response(user_response):
    robo_response = ''
    # Ajouter la réponse utilisateur aux tokens de phrase
    sentence_tokens.append(user_response)

    # Initialiser TfidfVectorizer
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')

    # Appliquer TfidfVectorizer
    tfidf = TfidfVec.fit_transform(sentence_tokens)

    # Calculer la similarité cosinus
    vals = cosine_similarity(tfidf[-1], tfidf)

    # Trouver l'index de la réponse la plus similaire
    idx = vals.argsort()[0][-2]

    # Aplatir la matrice des valeurs
    flat = vals.flatten()
    flat.sort()

    # Récupérer la deuxième valeur la plus élevée de la similarité cosinus
    req_tfidf = flat[-2]

    if req_tfidf == 0:
        robo_response = "I am sorry! I don't understand you."
    else:
        robo_response = sentence_tokens[idx]

    # Retirer la réponse utilisateur ajoutée aux tokens de phrase
    sentence_tokens.pop(-1)

    return robo_response

# Exemple d'utilisation
response('hi')



"I am sorry! I don't understand you."

In [26]:
# Main loop
flag = True
print("ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while(flag):
    user_response = input()
    user_response = user_response.lower()
    if(user_response != 'bye'):
        if(user_response == 'thanks' or user_response == 'thank you'):
            flag = False
            print("ROBO: You are welcome..")
        else:
            if(greeting(user_response) is not None):
                print("ROBO: " + greeting(user_response))
            else:
                print("ROBO: ", end="")
                print(response(user_response))
    else:
        flag = False
        print("ROBO: Bye! take care..")

ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!
hello
ROBO: hi
ca va




ROBO: I am sorry! I don't understand you.
why
ROBO: I am sorry! I don't understand you.
thanks
ROBO: You are welcome..
