In [1]:
!pip install nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


import os

# Création d'un répertoire local au cas où les chemins par défaut posent problème
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)

# Ajout manuel au chemin
nltk.data.path.append(nltk_data_dir)

# Téléchargements des ressources nécessaires dans le bon répertoire
nltk.download("punkt", download_dir=nltk_data_dir)
nltk.download("stopwords", download_dir=nltk_data_dir)
nltk.download("wordnet", download_dir=nltk_data_dir)

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m[31m10.1 MB/s[0m eta [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.7/781.7 KB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6


[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/daniel/nltk_data...
[nltk_data] Downloading package punkt to /home/daniel/Bureau/HAI817I -
[nltk_data]     Machine Learning/Machine_learning/Projet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/daniel/Bureau/HAI817I - Machine
[nltk_data]     Learning/Machine_learning/Projet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/daniel/Bureau/HAI817I
[nltk_data]     - Machine
[nltk_data]     Learning/Machine_learning/Projet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df=pd.read_csv('scitweets_export.tsv', sep='\t')
display (df.head())
print("Nombre total de lignes :", len(df))

Unnamed: 0.1,Unnamed: 0,tweet_id,text,science_related,scientific_claim,scientific_reference,scientific_context
0,0,316669998137483264,Knees are a bit sore. i guess that's a sign th...,0,0.0,0.0,0.0
1,1,319090866545385472,McDonald's breakfast stop then the gym 🏀💪,0,0.0,0.0,0.0
2,2,322030931022065664,Can any Gynecologist with Cancer Experience ex...,1,1.0,0.0,0.0
3,3,322694830620807168,Couch-lock highs lead to sleeping in the couch...,1,1.0,0.0,0.0
4,4,328524426658328576,Does daily routine help prevent problems with ...,1,1.0,0.0,0.0


Nombre total de lignes : 1140


In [5]:
df_sci = df[df["science_related"] == 1]
print("Nombre de lignes scientifiques :", len(df_sci))

Nombre de lignes scientifiques : 375


In [17]:
sc_bool = df[df["science_related"] == 1]

nb_claim = sc_bool[sc_bool["scientific_claim"] == 1.0]["tweet_id"].count()
nb_ref = sc_bool[sc_bool["scientific_reference"] == 1.0]["tweet_id"].count()
nb_context = sc_bool[sc_bool["scientific_context"] == 1.0]["tweet_id"].count()

nb_claim_ref = sc_bool[(sc_bool["scientific_claim"] == 1.0) & (sc_bool["scientific_reference"] == 1.0) & (sc_bool["scientific_context"] == 0.0) ]["tweet_id"].count()
nb_claim_context = sc_bool[(sc_bool["scientific_claim"] == 1.0) & (sc_bool["scientific_context"] == 1.0) & (sc_bool["scientific_reference"] == 0.0)]["tweet_id"].count()
nb_context_ref = sc_bool[(sc_bool["scientific_context"] == 1.0) & (sc_bool["scientific_reference"] == 1.0) & (sc_bool["scientific_claim"] == 0.0)]["tweet_id"].count()

nb_total = sc_bool[(sc_bool["scientific_claim"] == 1.0) |
                   (sc_bool["scientific_reference"] == 1.0) |
                   (sc_bool["scientific_context"] == 1.0)]["tweet_id"].count()


print(f"Nombre de tweets CLAIM : {nb_claim}")
print(f"Nombre de tweets REFERENCE : {nb_ref}")
print(f"Nombre de tweets CONTEXT : {nb_context}")
print(f"Nombre de tweets CLAIM & REF : {nb_claim_ref}")
print(f"Nombre de tweets CLAIM & CONTEXT : {nb_claim_context}")
print(f"Nombre de tweets CONTEXT & REF : {nb_context_ref}")
print(f"Nombre total de tweets ayant au moins une catégorie : {nb_total}")

Nombre de tweets CLAIM : 263
Nombre de tweets REFERENCE : 203
Nombre de tweets CONTEXT : 251
Nombre de tweets CLAIM & REF : 0
Nombre de tweets CLAIM & CONTEXT : 15
Nombre de tweets CONTEXT & REF : 79
Nombre total de tweets ayant au moins une catégorie : 375


In [7]:
nb_claim_only = sc_bool[(sc_bool["scientific_claim"] == 1.0) & 
                        (sc_bool["scientific_reference"] == 0.0) & 
                        (sc_bool["scientific_context"] == 0.0)]["tweet_id"].count()


nb_ref_only =  sc_bool[(sc_bool["scientific_claim"] == 0.0) & 
                        (sc_bool["scientific_reference"] == 1.0) & 
                        (sc_bool["scientific_context"] == 0.0)]["tweet_id"].count()

            
nb_context_only = sc_bool[(sc_bool["scientific_claim"] == 0.0) & 
                        (sc_bool["scientific_reference"] == 0.0) & 
                        (sc_bool["scientific_context"] == 1.0)]["tweet_id"].count()


nb_total = sc_bool[(sc_bool["scientific_claim"] == 1.0) &
                   (sc_bool["scientific_reference"] == 1.0) &
                   (sc_bool["scientific_context"] == 1.0)]["tweet_id"].count()


print(f"Nombre de tweets CLAIM : {nb_claim_only}")
print(f"Nombre de tweets REFERENCE : {nb_ref_only}")
print(f"Nombre de tweets CONTEXT : {nb_context_only}")

print(f"Nombre total de tweets ayant les trois catégories : {nb_total}")

Nombre de tweets CLAIM : 124
Nombre de tweets REFERENCE : 0
Nombre de tweets CONTEXT : 33
Nombre total de tweets ayant les trois catégories : 124


In [18]:
# Exemple : créer une liste de labels par tweet
def get_labels(row):
    labels = []
    if row["scientific_claim"] == 1.0:
        labels.append("CLAIM")
    if row["scientific_reference"] == 1.0:
        labels.append("REF")
    if row["scientific_context"] == 1.0:
        labels.append("CONTEXT")
    return labels

df_sci["labels"] = df_sci.apply(get_labels, axis=1)

print(df_sci["labels"].value_counts())



labels
[CLAIM]                  124
[CLAIM, REF, CONTEXT]    124
[REF, CONTEXT]            79
[CONTEXT]                 33
[CLAIM, CONTEXT]          15
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sci["labels"] = df_sci.apply(get_labels, axis=1)


In [19]:
from sklearn.utils import resample

# Déterminer la taille maximale (classe la plus fréquente)
max_size = df_sci['labels'].value_counts().max()
min_size = df_sci['labels'].value_counts().min()

print(max_size)
print(min_size)

124
15


In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_sci["labels"])


In [8]:
from sklearn.model_selection import train_test_split

# X = texte du tweet
X = df_sci['text']

# Y = vecteurs binaires pour les 3 classes
Y = df_sci[['scientific_claim', 'scientific_reference', 'scientific_context']]

# Split train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train_vect, Y_train)


In [11]:
from sklearn.metrics import classification_report

Y_pred = clf.predict(X_test_vect)
print(classification_report(Y_test, Y_pred, target_names=['CLAIM', 'REF', 'CONTEXT']))


              precision    recall  f1-score   support

       CLAIM       0.74      1.00      0.85        54
         REF       0.59      0.95      0.73        39
     CONTEXT       0.65      1.00      0.79        48

   micro avg       0.66      0.99      0.79       141
   macro avg       0.66      0.98      0.79       141
weighted avg       0.67      0.99      0.79       141
 samples avg       0.66      0.99      0.76       141

