In [11]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import re                                  # library for regular expression operations
import string                              # for string operations
import numpy as np
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming 
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# 1 - preparations de nos données

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\stive\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [54]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

x_train = train_pos + train_neg 
x_test = test_pos + test_neg

In [8]:
# combine positive and negative labels
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [9]:
# Print the shape train and test sets
print("y_train.shape = " + str(y_train.shape))
print("y_test.shape = " + str(y_test.shape))

y_train.shape = (8000, 1)
y_test.shape = (2000, 1)


# 2 - definissons nos fonctions

In [12]:
def process_tweet(tweet):
    """
        Input : 
            tweet : est une liste de tweet brut
        Output:
            array of word : tokenizes the tweet into individual words, removes stop words and applies stemming
    """
    # remove old style retweet text "RT"
    tweet2 = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet2 = re.sub(r'#', '', tweet2)
    
    # instantiate tokenizer class
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                   reduce_len=True)

    # tokenize tweets
    tweet_tokens = tokenizer.tokenize(tweet2)
    
    stopwords_english = stopwords.words('english') 
    
    tweets_clean = []

    for word in tweet_tokens: # Go through every word in your tokens list
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            tweets_clean.append(word)
            
    # Instantiate stemming class
    stemmer = PorterStemmer()

    # Create an empty list to store the stems
    tweets_stem = [] 

    for word in tweets_clean:
        stem_word = stemmer.stem(word)  # stemming word
        tweets_stem.append(stem_word)  # append to the list
        
    return tweets_stem


In [40]:
def build_freqs(text,label):
    """
        cette fonction va nous permettre de compter la frequence de chaque mot dans nos tweet 
        selon qu'il est utilisé dans un tweet positif ou dans un tweet négatif
    """
    freqs = {}
    
    for y,tweet in zip(label, text):
        
        for word in process_tweet(tweet):
            pair = (word, y)
           
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [14]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: un vecteur de taille(1,3)
            qui va compter le nombre de mot positif/négatif du tweet biensur en tenant compte de leur poids
            dans le dictionnaire freqs
        
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word,1),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word,0),0)
        

    return x

In [15]:
def sigmoid(z): 
    return 1/(1+np.exp(-1*z))


In [16]:

def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: final weight vector
    '''

    # get 'm', the number of rows in matrix x
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = x@theta
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1.0/m*((y.T)@np.log(h) + ((1-y).T)@np.log(1-h))
        print("epochs {}/{} ======> loss : {}\n".format(i,num_iters,J[0,0]))

        # update the weights theta
        theta = theta - (alpha/m)*(x.T@(h-y))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta

In [17]:
def predict_tweet(tweet, freqs, theta):
    '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''

    # extract the features of the tweet and store it into x
    x = extract_features(tweet,freqs)
    
    # make the prediction using x and theta
    y_pred = sigmoid(x@theta)
    
    return y_pred

# 3 - testons nos fonctions

In [22]:
#example

tweet = x_train[0]
result = process_tweet(tweet)
print("Notre tweet initial:\n{}\n\nNotre tweet après traitement:\n{}".format(tweet,result))

Notre tweet initial:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Notre tweet après traitement:
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [23]:
np.random.seed(1)
# X c'est une matrice 10*3 avec la premiere colone qui est le biais
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")





























































































































































































































































































































































































































































































































































































































































































































The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


In [41]:
# create frequency dictionary
freqs = build_freqs(x_train,y_train.reshape(8000,))

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11339


## 3.1 - extractions des features

In [47]:
# x va contenir nos différentes features
# cad pour chaque tweet nous alons compter
x = np.zeros((len(x_train), 3))
for i in range(len(x_train)):
    x[i, :]= extract_features(x_train[i], freqs)


y = y_train

# 4 - entrainons notre modèles

In [48]:

# Apply gradient descent
J, theta = gradientDescent(x, y, np.zeros((3, 1)), 1e-9,3000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































The cost after training is 0.17547371.
The resulting vector of weights is [1.4e-07, 0.00077055, -0.00074612]


# 5 - faisons quelques prédictions

In [65]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great','bad','great great', 'great great great', 'great great great bad']:
    prediction = predict_tweet(tweet, freqs, theta)
    if prediction > 0.5:
        print("{} --> {:.3f} --> positif".format(tweet,prediction[0,0]))
    else:
        print("{} --> {:.3f} --> negatif".format(tweet,prediction[0,0]))

I am happy --> 0.528 --> positif
I am bad --> 0.493 --> negatif
this movie should have been great. --> 0.523 --> positif
great --> 0.523 --> positif
bad --> 0.493 --> negatif
great great --> 0.546 --> positif
great great great --> 0.569 --> positif
great great great bad --> 0.561 --> positif


# 6 - evaluons notre modèle

In [52]:
def test_logistic_regression(x_test, y_test, freqs, theta):
    """
    Input: 
        x_test: a list of tweets
        y_test: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    
    # liste des prédictions de notre modèle
    y_hat = []
    
    for tweet in x_test:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1.0)
        else:
            # append 0 to the list
            y_hat.append(0)

    #
    accuracy = sum((np.array(y_hat) == test_y.reshape(test_y.shape[0],)).astype(int))/test_y.shape[0]
    
    return accuracy

In [55]:
accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {accuracy:.4f}")

Logistic regression model's accuracy = 0.9945


# nous avons une accuracy de 0.9945 ce qui est très bien 