In [1]:
import numpy as np
import re
import nltk
from nltk import download

from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


download(['twitter_samples', 'averaged_perceptron_tagger', 'wordnet'])

[nltk_data] Downloading package twitter_samples to

[nltk_data]     C:\Users\MEHR\AppData\Roaming\nltk_data...

[nltk_data]   Package twitter_samples is already up-to-date!

[nltk_data] Downloading package averaged_perceptron_tagger to

[nltk_data]     C:\Users\MEHR\AppData\Roaming\nltk_data...

[nltk_data]   Package averaged_perceptron_tagger is already up-to-

[nltk_data]       date!

[nltk_data] Downloading package wordnet to

[nltk_data]     C:\Users\MEHR\AppData\Roaming\nltk_data...

[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# prepare set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# putting all tweets together
all_tweets = all_positive_tweets + all_negative_tweets   

In [3]:
# creating y

# assigning 1 to positive tweets
# assigning 0 to nagative tweets
y = [ 1 for _ in range(len(all_positive_tweets)) ] + [ 0 for _ in range(len(all_negative_tweets)) ]



print('y : ')
print(y)
print(len(y))
print(type(y))

y : 

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [4]:
# creating X

In [5]:
# step1 : preprocessing the tweets

# normalizing

def normalizer(text):
    
    # Convert to lowercase
    normalized_text = text.lower()
    
    # Remove links, tags, and hashtags using regular expressions
    normalized_text = re.sub(r"http\S+|www\S+|https\S+|@\w+|#\w+", "", normalized_text)
    
    # Remove special characters and punctuations
    normalized_text = re.sub(r'[^a-zA-Z0-9\s]', '', normalized_text)
    
    # Remove extra whitespaces
    normalized_text = re.sub(r'\s+', ' ', normalized_text).strip()
    
    return normalized_text


test_text = normalizer(all_tweets[0])
test_text

'for being top engaged members in my community this week'

In [6]:
# customizing stop words
default_stopwords = set(stopwords.words('english'))
customized_stopwords = default_stopwords - set(['no', 'not', 'ain','aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"])
        

In [7]:
def preprocessor(all_tweets):
    
    processed_tweets = []
    
    for tweet in all_tweets:
        
        # normalizing
        normalized_tweet = normalizer(tweet)
    
        # tokenize into words
        tokenised_tweet = word_tokenize(normalized_tweet)
    
        # deleting stop words
        filtered_tokens = [token for token in tokenised_tweet if token not in customized_stopwords]
            
        # pos tagging
        tagged_tokens = nltk.pos_tag(filtered_tokens)
        
        # lemmatizing
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        root_tokens = []
        for token in tagged_tokens:
            if token[1] == 'JJ':
                root = stemmer.stem(token[0])
            else:
                root = lemmatizer.lemmatize(token[0])
                
            root_tokens.append(root)
            
        # joining tokens together 
        joined_tweet = ' '.join(root_tokens)
        
        
        processed_tweets.append(joined_tweet)
        
    return processed_tweets

        

processed_tweets = preprocessor(all_tweets)


In [8]:
processed_tweets

['top engaged member community week',
 'hey james odd please call contact centre 02392441234 abl assist mani thanks',
 'listen last night bleed amaz track scotland',
 'congrats',
 'yeaaaah yippppi accnt verified rqst succeed got blue tick mark fb profile 15 day',
 'one irresist',
 'dont like keep love customer waiting long hope enjoy happi friday lwwf',
 'second thought there not enough time dd new short entering system sheep must buying',
 'jgh go bayan bye',
 'act mischievousness calling etl layer inhouse warehousing app katamari well name implies p',
 'top influencers community week',
 'wouldnt love bigjuicyselfies',
 'follow amp follow u back',
 'perfect already know whats waiting',
 'great new opportunity junior triathletes aged 12 13 gatorade series get entry',
 'laying greeting card range print today love job',
 'friend lunch yummmm',
 'id conflict thanks help here screenshot working',
 'hi liv',
 'hello need know something u fm twitter sure thing dm x',
 'top new follower commu

In [9]:
# Step2: convert the text data to numerical type (TF_IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_tweets)

In [10]:
print(X)
print(X.shape)
print(type(X))

  (0, 10181)	0.3469001392630045

  (0, 2103)	0.43962977237963613

  (0, 5962)	0.4632690043353587

  (0, 3089)	0.5324195618774463

  (0, 9551)	0.4339322485931311

  (1, 9325)	0.15990735295854877

  (1, 5801)	0.24143353406276638

  (1, 871)	0.374666574301084

  (1, 362)	0.2655182799110355

  (1, 12)	0.374666574301084

  (1, 1787)	0.3473795007035719

  (1, 2180)	0.30097947696652305

  (1, 1667)	0.247169760115827

  (1, 7226)	0.1761224321497117

  (1, 6648)	0.3473795007035719

  (1, 4890)	0.3041305122970968

  (1, 4320)	0.2198826865183149

  (2, 8172)	0.44061068818851695

  (2, 9587)	0.4165334223549634

  (2, 628)	0.3523570944672394

  (2, 1352)	0.4667791762015617

  (2, 6480)	0.28179968864867055

  (2, 5350)	0.2791028644398163

  (2, 5505)	0.3647113032214677

  (3, 2151)	1.0

  :	:

  (9994, 10072)	0.6471432634295546

  (9994, 7460)	0.5528800060771873

  (9994, 8678)	0.3875519316230356

  (9994, 4604)	0.35402372204852206

  (9995, 9908)	0.5768400885931938

  (9995, 929)	0.5329592034836329

In [11]:
# creating the model

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [12]:
# predicting the test set
y_pred = model.predict(X_test)

# Evaluating the model
print(f1_score(y_test, y_pred))


0.74012474012474


In [18]:
# test self_made tweet

X = ['I hate posts about this', 'Im happy I saw this']
y = [0, 1]

X = preprocessor(X)
X = vectorizer.transform(X)
y_pred = model.predict(X)

print('y_pred :')
print(y_pred)
print('----------------------------------')
print('f1_score')
print(f1_score(y, y_pred))

y_pred :

[0 1]

----------------------------------

f1_score

1.0
