In [39]:
import nltk  # Bibliothèque NLP (traitement du langage naturel)
import matplotlib.pyplot as plt
import numpy as np
import re  # Expressions régulières pour le nettoyage de texte
import string  # Manipulation de la ponctuation (ex : suppression des signes de ponctuation)

from nltk.corpus import stopwords, twitter_samples  # Liste des mots vides (ex : "le", "de", "the") à retirer
from nltk.stem import PorterStemmer  # Algorithme de stemming pour réduire les mots à leur racine
from nltk.stem import WordNetLemmatizer  # Lemmatisation : transformation des mots à leur forme canonique
from nltk.tokenize import TweetTokenizer  # Tokenizer adapté aux tweets (hashtags, mentions, emojis)
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("wordnet")  # Téléchargement de WordNet, nécessaire pour la lemmatisation
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gpres\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gpres\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\gpres\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [18]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [20]:
#Fussion des positifs et negatif
tweets = pos_tweets + neg_tweets

In [21]:
print("nombre de tweets:", len(tweets))
print("nombre de tweets positifs : ", len(pos_tweets))
print("nombre de tweets negatifs : ", len(neg_tweets))

nombre de tweets: 10000
nombre de tweets positifs :  5000
nombre de tweets negatifs :  5000


In [23]:
#Illustration de la segmentation
tweet = pos_tweets[1510]

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
tweet_tokens = tokenizer.tokenize(tweet)

In [24]:
print("Tweet")
print(tweet)

Tweet
@TheBookwormBaby Thank you! that's very kind.  I just finished working on more Little Explorers books btw which will be out next year.. :)


In [25]:
print("Tweet segement")
print(tweet_tokens)

Tweet segement
['thank', 'you', '!', "that's", 'very', 'kind', '.', 'i', 'just', 'finished', 'working', 'on', 'more', 'little', 'explorers', 'books', 'btw', 'which', 'will', 'be', 'out', 'next', 'year', '..', ':)']


In [26]:
#Suppression des stopwords et ponctuation
tweet_clean = []

stopwords_english = stopwords.words("english")

for word in tweet_tokens:
    if (word not in stopwords_english and 
        word not in string.punctuation):
        tweet_clean.append(word)
        
print (tweet_clean)
        
        

['thank', "that's", 'kind', 'finished', 'working', 'little', 'explorers', 'books', 'btw', 'next', 'year', '..', ':)']


In [27]:
lemmatizer = WordNetLemmatizer()
tweet_clean_and_lem = [lemmatizer.lemmatize(word) for word in tweet_clean]

print(tweet_clean_and_lem)

['thank', "that's", 'kind', 'finished', 'working', 'little', 'explorer', 'book', 'btw', 'next', 'year', '..', ':)']


In [29]:
#Fonction de preprocessing
def traiter_tweet (tweet):
    """
    Input:
        tweet
    Output:
        une liste des mots du tweet après pré-traitement
    """
    
    lemmatizer = WordNetLemmatizer()
    stopwords_english = stopwords.words("english")
    
    #suppression du #
    tweet = re.sub(r'#', '', tweet)
    
    #Segmentation
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweet_clen = []
    
    for word in tweet_tokens :
        if (word not in stopwords_english and
            word not in string.punctuation):
            
            lem_word = lemmatizer.lemmatize(word)
            tweet_clean.append(lem_word)
            
    return tweet_clean

In [34]:
def freq_words (tweets, labelArray):
    """ compte de fréquences.
    Input:
        tweets
        labelArray: un tableau avec le label pour chaque tweet (1/0)
    Output:
        freqs: un dictionnaire de paires (mot, label) et leurs fréquences
    """
    
    labelList = np.squeeze(labelArray).tolist()
    freqs = {}
    for y, tweet in zip(labelList, tweets):
        for word in traiter_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs    

In [35]:
# Création des labels correspondants
labels_pos = [1]*len(pos_tweets)
labels_neg = [0]*len(neg_tweets)

# Concaténation des tweets et des labels
labels = labels_pos + labels_neg

In [36]:
#Extraire les fréquences
freqs = freq_words(tweets, labels)

In [37]:
freqs

{('thank', 1): 589522,
 ("that's", 1): 162098,
 ('kind', 1): 71148,
 ('finished', 1): 40395,
 ('working', 1): 97705,
 ('little', 1): 89334,
 ('explorers', 1): 5000,
 ('books', 1): 5000,
 ('btw', 1): 29237,
 ('next', 1): 124284,
 ('year', 1): 120851,
 ('..', 1): 314813,
 (':)', 1): 9280270,
 ('followfriday', 1): 114897,
 ('top', 1): 136418,
 ('engaged', 1): 35433,
 ('member', 1): 55341,
 ('community', 1): 126275,
 ('week', 1): 249909,
 ('hey', 1): 179844,
 ('james', 1): 23769,
 ('odd', 1): 4999,
 (':/', 1): 18040,
 ('please', 1): 249365,
 ('call', 1): 70099,
 ('contact', 1): 17304,
 ('centre', 1): 5996,
 ('02392441234', 1): 4999,
 ('able', 1): 22355,
 ('assist', 1): 4999,
 ('many', 1): 88061,
 ('thanks', 1): 993760,
 ('listen', 1): 34720,
 ('last', 1): 121329,
 ('night', 1): 180428,
 ('bleed', 1): 8155,
 ('amazing', 1): 127153,
 ('track', 1): 20497,
 ('scotland', 1): 9856,
 ('congrats', 1): 49029,
 ('yeaaah', 1): 4996,
 ('yipppy', 1): 4996,
 ('accnt', 1): 9885,
 ('verified', 1): 4996,
 

In [38]:
len(freqs)

21748

In [42]:
#extraction de features
def extract_features(tweet, freqs):
    tweet_words = traiter_tweet(tweet)
    
    pos = 0
    neg = 0
    
    for word in tweet_words:
        pos += freqs.get((word, 1), 0)
        neg += freqs.get((word, 0), 0)
        
    return [1, pos, neg]

In [44]:
#Création de la matrice BoW
test = extract_features(tweet, freqs)
print(test)

[1, 84709522292, 234081764394]


In [45]:
#X et y pour entraînement du modele
X = np.array([extract_features(tweet, freqs) for tweet in tweets])
y = np.array(labels)

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
model = LogisticRegression()

In [50]:
model.fit(X, y)

In [66]:
tweet_test = "I am comming"

In [67]:
features_test = np.array([extract_features(tweet_test, freqs)])

In [68]:
prediction = model.predict(features_test)

In [69]:
proba = model.predict_proba(features_test)

In [70]:
print("Sentiment :", "Positif" if prediction[0] == 1 else "Négatif")
print("Probabilité :", proba)

Sentiment : Négatif
Probabilité : [[0.97776226 0.02223774]]
