In [1]:
pip install pandas scikit-learn nltk cryptography


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 8.5 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.5 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 1.6 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 874.3 kB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\BELAJE\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# secure_nlp_pipeline.py

import pandas as pd
import numpy as np
import nltk
import logging
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [3]:
# Initialisation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BELAJE\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BELAJE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BELAJE\AppData\Roaming\nltk_data...


True

In [4]:
# Chargement des données
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [5]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Nettoyage
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df['cleaned'] = df['message'].apply(clean_text)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\BELAJE/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\BELAJE\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned'])
y = df['label'].map({'ham': 0, 'spam': 1})  # binaire pour simplification

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modèle
model = LogisticRegression()
model.fit(X_train, y_train)

# Évaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

## Partie 2 — Sécurisation du Pipeline

### a. Sécurisation des données (chiffrement Fernet)

In [None]:
from cryptography.fernet import Fernet

# Générer une clé
fernet_key = Fernet.generate_key()
cipher = Fernet(fernet_key)

# Simuler une colonne sensible (user_id)
df['user_id'] = np.random.randint(1000, 9999, size=len(df))

# Chiffrer les IDs utilisateurs
def encrypt_id(x):
    return cipher.encrypt(str(x).encode()).decode()

df['user_id_encrypted'] = df['user_id'].apply(encrypt_id)

# Prévention réidentification : suppression colonne en clair
df = df.drop(columns=['user_id'])

# Export sécurisé (colonnes utiles seulement)
df[['label', 'message', 'user_id_encrypted']].to_csv("secure_dataset.csv", index=False)


### b. Gestion des accès (simulation par rôles
  * Script pour Data Scientist (accès complet)

In [None]:
# access_role_data_scientist.py
import pandas as pd
from datetime import datetime

df = pd.read_csv("secure_dataset.csv")

# Journalisation
with open("log_access.txt", "a") as log:
    log.write(f"[{datetime.now()}] DATA SCIENTIST loaded dataset\n")

print(df.head())


* Script pour Analyste (accès aux prédictions uniquement)

In [None]:
# access_role_analyst.py
from datetime import datetime
import joblib
import pandas as pd

# Journalisation
with open("log_access.txt", "a") as log:
    log.write(f"[{datetime.now()}] ANALYST requested prediction\n")

# Exemple de prédiction
sample = ["I hate this product, it is terrible!"]
cleaned = clean_text(sample[0])
X_sample = vectorizer.transform([cleaned])
pred = model.predict(X_sample)

print("Prediction:", "NEGATIVE" if pred[0] else "POSITIVE")
