In [1]:
# importing all the relevant libraries
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator

import numpy as np
import pandas as pd
import re
import string

In [2]:
# Download the NLTK resources this notebook reads: the English stopword list
# (process_tweet calls stopwords.words('english')) and the sample tweet corpus.
# The twitter_samples call stays last so the cell's displayed value is its result.
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\MOHAMMED
[nltk_data]     USAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
##Setting up all functions

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order: remove stock tickers such as $GE, a leading old-style
    "RT" marker, http(s) hyperlinks and '#' signs (the hashtag word itself is
    kept); tokenize with TweetTokenizer configured with preserve_case=False,
    strip_handles=True and reduce_len=True; drop English stopwords and
    punctuation tokens; Porter-stem every remaining token.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # Keep the stopwords in a set: membership is tested once per token, and a
    # set lookup is constant-time where the original list was scanned linearly.
    stopwords_english = frozenset(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove only the hash sign; the tagged word stays in the text
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word in string.punctuation` is a substring test against the punctuation
    # string, so a token is dropped only when it occurs verbatim inside that
    # string (every single punctuation mark does). Emoticons such as ":)" do
    # not occur in it and are therefore kept.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

 
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array (or a flat list/array) with the sentiment label of
            each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency

    Tweets and labels are paired with zip, so pairing stops at the shorter of
    the two inputs.
    """
    # Flatten the labels to a plain list. np.ravel is used instead of
    # np.squeeze because squeeze turns a single label into a 0-d array whose
    # .tolist() is a bare scalar, which zip cannot iterate. For an m x 1 array
    # or a flat list the result is the same as before.
    yslist = np.ravel(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

# Functions to convert data into suitable form
def extract_features(tweet, freqs_dict=None):
    """Sum the positive and negative frequencies of the words in a tweet.

    Input:
        tweet: a list of already-processed words (for example the output of
            process_tweet)
        freqs_dict: optional dictionary mapping (word, sentiment) pairs to
            their frequency; when omitted, the notebook-level `freqs`
            dictionary is used, so it must already be defined
    Output:
        a numpy array [positive_sum, negative_sum], where each entry adds up
        the frequency of every word of the tweet under label 1 and label 0
        respectively (words missing from the dictionary count as 0)
    """
    # Fall back to the global table only when no table was passed explicitly,
    # so existing one-argument calls (e.g. Series.apply) keep working.
    table = freqs if freqs_dict is None else freqs_dict

    # Float accumulators keep the result dtype float64 even for integer counts.
    positive_sum = 0.0
    negative_sum = 0.0

    # loop through each word in the list of words
    for word in tweet:
        # add the word's count under the positive label 1
        positive_sum += table.get((word, 1.0), 0)
        # add the word's count under the negative label 0
        negative_sum += table.get((word, 0.0), 0)

    return np.array([positive_sum, negative_sum])



def token_splitter_pos(a):
    """Return the first entry of `a`.

    Used on the [positive, negative] pairs produced by extract_features to
    pull out the positive-frequency sum.
    """
    positive_part = a[0]
    return positive_part

def token_splitter_neg(a):
    """Return the second entry of `a`.

    Used on the [positive, negative] pairs produced by extract_features to
    pull out the negative-frequency sum.
    """
    negative_part = a[1]
    return negative_part

In [4]:
# load the labelled positive and negative tweets from the NLTK sample corpus
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# the first 4000 tweets of each class are used for training,
# everything after that is held out for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# inputs: positive tweets first, negative tweets after them
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# labels in the same order: 1 per positive tweet, then 0 per negative tweet
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

print("Number of tweets: ", len(train_x))
print("Number of tweets: ", len(test_x))

Number of tweets:  8000
Number of tweets:  2000


In [5]:
# Count how often each (processed word, label) pair occurs in the training
# tweets. Run this while train_x is still the plain list of tweet strings:
# a later cell rebinds train_x to a DataFrame.
freqs = build_freqs(tweets=train_x, ys=train_y)

In [6]:
# Wrap the training tweets in a frame with a 'Tweet' column and add the
# processed token list of every tweet as 'Tweet_arrayed'.
train_x = (
    pd.DataFrame(train_x)
    .rename(columns={0: 'Tweet'})
    .assign(Tweet_arrayed=lambda frame: frame['Tweet'].apply(process_tweet))
)

# Same preparation for the held-out tweets.
test_x = (
    pd.DataFrame(test_x)
    .rename(columns={0: 'Tweet'})
    .assign(Tweet_arrayed=lambda frame: frame['Tweet'].apply(process_tweet))
)

In [7]:
# Add the model inputs for the training tweets: the [positive, negative]
# frequency pair per tweet, a constant bias of 1, and the pair split into
# two scalar columns. assign evaluates the columns in the order written,
# so the later lambdas can read 'Tweet_tokenised'.
train_x = train_x.assign(
    Tweet_tokenised=lambda frame: frame['Tweet_arrayed'].apply(extract_features),
    bias=1,
    postive=lambda frame: frame['Tweet_tokenised'].apply(token_splitter_pos),
    negative=lambda frame: frame['Tweet_tokenised'].apply(token_splitter_neg),
)
train_x

Unnamed: 0,Tweet,Tweet_arrayed,Tweet_tokenised,bias,postive,negative
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,"[followfriday, top, engag, member, commun, wee...","[3133.0, 61.0]",1,3133.0,61.0
1,@Lamb2ja Hey James! How odd :/ Please call our...,"[hey, jame, odd, :/, pleas, call, contact, cen...","[3705.0, 444.0]",1,3705.0,444.0
2,@DespiteOfficial we had a listen last night :)...,"[listen, last, night, :), bleed, amaz, track, ...","[3119.0, 116.0]",1,3119.0,116.0
3,@97sides CONGRATS :),"[congrat, :)]","[2975.0, 4.0]",1,2975.0,4.0
4,yeaaaah yippppy!!! my accnt verified rqst has...,"[yeaaah, yipppi, accnt, verifi, rqst, succeed,...","[3232.0, 226.0]",1,3232.0,226.0
...,...,...,...,...,...,...
7995,Amelia didnt stalk my twitter :(,"[amelia, didnt, stalk, twitter, :(]","[29.0, 3718.0]",1,29.0,3718.0
7996,"oh, i missed the broadcast. : (","[oh, miss, broadcast]","[62.0, 323.0]",1,62.0,323.0
7997,i really can't stream on melon i feel useless :-(,"[realli, can't, stream, melon, feel, useless, ...","[144.0, 793.0]",1,144.0,793.0
7998,I need to stop looking at old soccer pictures :(,"[need, stop, look, old, soccer, pictur, :(]","[207.0, 3902.0]",1,207.0,3902.0


In [8]:
# Add the model inputs for the held-out tweets: the [positive, negative]
# frequency pair per tweet, a constant bias of 1, and the pair split into
# two scalar columns. assign evaluates the columns in the order written,
# so the later lambdas can read 'Tweet_tokenised'.
test_x = test_x.assign(
    Tweet_tokenised=lambda frame: frame['Tweet_arrayed'].apply(extract_features),
    bias=1,
    postive=lambda frame: frame['Tweet_tokenised'].apply(token_splitter_pos),
    negative=lambda frame: frame['Tweet_tokenised'].apply(token_splitter_neg),
)
test_x

Unnamed: 0,Tweet,Tweet_arrayed,Tweet_tokenised,bias,postive,negative
0,"Bro:U wan cut hair anot,ur hair long Liao bo\n...","[bro, u, wan, cut, hair, anot, ur, hair, long,...","[3397.0, 414.0]",1,3397.0,414.0
1,@heyclaireee is back! thnx God!!! i'm so happy :),"[back, thnx, god, i'm, happi, :)]","[3415.0, 397.0]",1,3415.0,397.0
2,@BBCRadio3 thought it was my ears which were m...,"[thought, ear, malfunct, thank, good, clear, o...","[1392.0, 329.0]",1,1392.0,329.0
3,@HumayAG 'Stuck in the centre right with you. ...,"[stuck, centr, right, clown, right, joker, lef...","[3280.0, 401.0]",1,3280.0,401.0
4,Happy Friday :-) http://t.co/iymPIlWXFY,"[happi, friday, :-)]","[805.0, 27.0]",1,805.0,27.0
...,...,...,...,...,...,...
1995,I wanna change my avi but uSanele :(,"[wanna, chang, avi, usanel, :(]","[45.0, 3776.0]",1,45.0,3776.0
1996,MY PUPPY BROKE HER FOOT :(,"[puppi, broke, foot, :(]","[3.0, 3686.0]",1,3.0,3686.0
1997,where's all the jaebum baby pictures :((,"[where', jaebum, babi, pictur, :(]","[26.0, 3729.0]",1,26.0,3729.0
1998,But but Mr Ahmad Maslan cooks too :( https://t...,"[mr, ahmad, maslan, cook, :(]","[7.0, 3684.0]",1,7.0,3684.0


In [9]:
# Feature matrices: the constant bias column plus the two frequency sums.
X_train = train_x.loc[:, ['bias', 'postive', 'negative']].to_numpy()
X_test = test_x.loc[:, ['bias', 'postive', 'negative']].to_numpy()

# Targets are the 1/0 label vectors built together with the train/test split.
y_train, y_test = train_y, test_y

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Fit a default-configured logistic regression on the training features
# (fit returns the estimator itself, so it can be bound directly)
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out tweets
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report accuracy to two decimals, then the confusion matrix and the
# per-class metrics, each under its own heading line
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

Accuracy: 0.99
Confusion Matrix:
[[993   7]
 [  5 995]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1000
         1.0       0.99      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

