1 - Pour commencer

Questions

Quelles sont les sources de données textuelles pour le NLP ?
    Les données textuelles nécessaire pour le NLP peuvent se retrouver sur internet, via des bases de données textuelle existante (Wikipedia, Forum etc).

Pourquoi la collecte de données est-elle essentielle pour le NLP ?
    Les méthodes de NLP nécéssite de disposer d'une importante base de donnée, qui contient des éléments pertinent pour la finalité du NLP.

# Extraction de données à partir d'une API 
    API : NewsAPI
### 1 - Créer un compte API et récupérer une clé API (d23f0beac38543e58490208ac45957ae)
### 2 - Effectuer une requête pour collecter des articles sur un thème donné.
### 3 - Explorer les résultats obtenus, en identifiant sur les titres et les descriptions des articles.
### 4 - Sauvegarder les résulats dans un fichier CSV pour une utilisation ultérieure.

In [None]:
import json
import csv
from requests import get
import os
from dotenv import load_dotenv

load_dotenv()

# Make the API request using the environment variable for the API key
api_key = os.getenv('NewsAPI_key')
if not api_key:
    raise ValueError("API key not found. Please set the 'NewsAPI_key' environment variable.")

apirequest = get(f"https://newsapi.org/v2/everything?q=covid19&from=2024-11-18&to=2024-12-18&sortBy=publishedAt&language=en&apiKey={api_key}")

# Check if the request was successful
if apirequest.status_code == 200:
    data = apirequest.json()
    
    # Check if 'articles' key exists in the response
    if 'articles' in data:
        # Open a CSV file for writing
        with open('api_response.csv', mode='w', newline='') as file:
            writer = csv.writer(file)
            
            # Write the header
            writer.writerow(['source', 'author', 'title', 'description', 'url', 'publishedAt', 'content'])
            
            # Write the data
            for article in data['articles']:
                writer.writerow([
                    article['source']['name'],
                    article.get('author', 'N/A'),
                    article['title'],
                    article['description'],
                    article['url'],
                    article['publishedAt'],
                    article.get('content', 'N/A')
                ])
    else:
        print("The key 'articles' was not found in the response.")
else:
    print(f"Failed to fetch data. HTTP Status code: {apirequest.status_code}")

# Display the number of results
num_results = len(data['articles'])
print(f"Number of results: {num_results}")

Number of results: 37


# Web scrapping de texte

## Choisir une source web adaptée (par exemple, un blog ou un site d’actualités)

### 1 — Identifier les balises HTML contenant les informations à extraire (par exemple, les titres des articles)
### 2 — Extraire les données textuelles et les afficher
### 3 — Enregistrer les données extraites dans un fichier CSV

In [None]:
from bs4 import BeautifulSoup
import time
import requests
import csv

url = 'https://www.thelancet.com/collections/critical-care-other?parent=001548'
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

articles = soup.find_all('input', class_='search_item_select')

data=[]
for idx, article in enumerate(articles, 1):
    title = article.text.strip()
    link = article['href']
    data.append([idx, title, link])

for item in data:
    print(f"{item[0]}: {item[1]} ({item[2]})")

with open('the_lancet.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'Titre', 'Lien']) 
    writer.writerows(data)  

MissingSchema: Invalid URL 'YOUR_TARGET_URL_HERE': No scheme supplied. Perhaps you meant https://YOUR_TARGET_URL_HERE?

# TD 2 Nettoyage des Données Textuelles en Python pour le NLP

### — Identifier et supprimer les caractères spéciaux et hyperliens.
### — Convertir les textes en minuscules.
### — Supprimer les espaces inutiles et normaliser le format.

In [36]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords


df = pd.read_csv('./api_response.csv')


def text_clean(text):
    if isinstance(text, str):
        # caps to lowercase
        text = text.lower()
        # del URLs
        text = re.sub(r'http[s]?://\S+', '', text) 
        # del non-letters
        text = re.sub(r'[^a-z\s]', '', text)
        # del useless spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
    return text

# clean columns 'title', 'description' and 'content'
df['title'] = df['title'].apply(text_clean)
df['description'] = df['description'].apply(text_clean)
df['content'] = df['content'].apply(text_clean)

# save the cleaned data
df.to_csv('articles_cleaned.csv', index=False)

# print the first 5 rows of  cleaned data
print(df.head())


              source                                             author  \
0  Project Syndicate                                     Kenneth Rogoff   
1   Freerepublic.com                                       Daily Signal   
2           Plos.org  Iheanyi Oby Nwaoha, Albain Ayime Balibuno, Nuh...   
3  Project Syndicate                                  Michael R. Strain   
4      Wordpress.com                                                NaN   

                                               title  \
0                 se hundir el segundo boom de trump   
1  exclusive what has hhs withheld on covid vax s...   
2  factors associated with the uptake and utilisa...   
3                            el populismo nunca dura   
4                       sam thursfield status update   

                                         description  \
0          project syndicate the worlds opinion page   
1  a new letter from sen ron johnson rwis puts fe...   
2  introduction diabetic retinopathy dr is a

### — Divisez les textes en unités lexicales (tokens) à l’aide d’une bibliothèque comme NLTK.
### — Effectuez une normalisation des mots (racine ou radical).
### — Comparez les résultats obtenus entre le stemming et la lemmatisation.

In [7]:
from nltk.tokenize import word_tokenize

text = "This is a test."
tokens = word_tokenize(text)
print(tokens)



LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/sh/nltk_data'
    - '/Users/sh/sklearn-env/nltk_data'
    - '/Users/sh/sklearn-env/share/nltk_data'
    - '/Users/sh/sklearn-env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/path/to/custom/nltk_data'
    - '/path/to/custom/nltk_data'
**********************************************************************


In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer


# nltk.data.path.append('/path/to/custom/nltk_data')  # Remplacez par un chemin valide
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# load csv
df = pd.read_csv('./articles_cleaned.csv')

#  divide text into tokens
def tokenizer(text):
    return word_tokenize(text)

# stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# stemming
def apply_stemming(tokens):
    return [stemmer.stem(word) for word in tokens]

# lemmatization
def apply_lemmatization(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# apply tokenizer, stemming, and lemmatization on text columns
df['title_tokens'] = df['title'].apply(tokenizer)
df['description_tokens'] = df['description'].apply(tokenizer)
df['content_tokens'] = df['content'].apply(tokenizer)

df['title_stemmed'] = df['title_tokens'].apply(apply_stemming)
df['description_stemmed'] = df['description_tokens'].apply(apply_stemming)
df['content_stemmed'] = df['content_tokens'].apply(apply_stemming)

df['title_lemmatized'] = df['title_tokens'].apply(apply_lemmatization)
df['description_lemmatized'] = df['description_tokens'].apply(apply_lemmatization)
df['content_lemmatized'] = df['content_tokens'].apply(apply_lemmatization)


print(df.head())


[nltk_data] Downloading package punkt to /Users/sh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/sh/nltk_data'
    - '/Users/sh/sklearn-env/nltk_data'
    - '/Users/sh/sklearn-env/share/nltk_data'
    - '/Users/sh/sklearn-env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/path/to/custom/nltk_data'
    - '/path/to/custom/nltk_data'
**********************************************************************
