In [1]:
## Imports:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator

In [2]:
# Download NLTK's sample Twitter dataset; nltk.download returns True on success
# (here the package was already present, as the log below shows).
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Display the corpus reader object to confirm which directory the dataset is read from.
twitter_samples

<TwitterCorpusReader in '/root/nltk_data/corpora/twitter_samples'>

In [4]:
# Read the text of every tweet from the positive and the negative sample file.
allPositiveTweets, allNegativeTweets = (
    twitter_samples.strings('positive_tweets.json'),
    twitter_samples.strings('negative_tweets.json'),
)

In [5]:
##REPORT: size and container type of each tweet collection.
reportRows = (
    ('Number of Positive Tweets: ', len(allPositiveTweets)),
    ('Number of Negative Tweets: ', len(allNegativeTweets)),
    ('Type of Positive Tweets: ', type(allPositiveTweets)),
    ('Type of Negative Tweets: ', type(allNegativeTweets)),
)
for caption, value in reportRows:
    print(caption, value)

Number of Positive Tweets:  5000
Number of Negative Tweets:  5000
Type of Positive Tweets:  <class 'list'>
Type of Negative Tweets:  <class 'list'>


In [6]:
##Prefixing a string with an ANSI color code colors it when printed (92 = green, 91 = red).
##random.choice is used rather than indexing with random.randint(0, 5000): randint's upper
##bound is inclusive, so it can return 5000, which is out of range for a 5000-element list.
print('\033[92m' + random.choice(allPositiveTweets))
print('\033[91m' + random.choice(allNegativeTweets))

[92m@taylorswift13 @Caradelevingne @halstonsage @natandalex I CANT WAIT :)
[91mMovie marathon anyonneeee :(((( loner af


Note the presence of @ account handles and emoticons such as :) and :( in the sampled tweets.

## Pre-Processing:

In [7]:
##Sampling:
## Pick one fixed positive tweet (index 2277) as the running example for the
## pre-processing steps; the bare name on the last line displays it.

tweet = allPositiveTweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [8]:
##Downloading the NLTK stop-word corpus (the import of nltk.corpus.stopwords happens in a later cell):
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

### Remove the retweet marker ("RT"), hyperlinks and hash signs

In [10]:
print(tweet)

## Clean-up rules, applied in the order listed. Each pair is (pattern, replacement).
cleanupRules = (
    (r'^RT[\s]+', ''),        ## a leading "RT" followed by one or more whitespace characters
    (r'https?://[\S]+', ''),  ## http:// or https:// plus everything up to the next whitespace
    (r'#', ''),               ## the hash sign only; the tag's text is kept
)

tweetWithoutExtra = tweet
for pattern, replacement in cleanupRules:
    tweetWithoutExtra = re.sub(pattern, replacement, tweetWithoutExtra)

print(tweetWithoutExtra)

My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i
My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 


In [11]:
## Tokenizer class: one TweetTokenizer instance, reused by the tokenizing cell that follows.
tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
## preserve_case: defaults to True. When False, the tokenizer lower-cases everything except emoticons.
## strip_handles: when True, the tokenizer is documented to remove Twitter handles (@username) from the text.
## reduce_len: defaults to False. When True, runs of a repeated character of length 3 or more are replaced with runs of length 3.

In [12]:
## Split the cleaned tweet text into a list of tokens and show the result.
tweet_tokens = tokenizer.tokenize(tweetWithoutExtra)
print(tweet_tokens)

['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


In [13]:
## stopwords.words('english') returns the English stop words; string.punctuation is a string of punctuation characters.
## The stop words are fetched once and stored in a set, instead of calling stopwords.words('english') again for every token.
english_stopwords = set(stopwords.words('english'))

## Keep a token only if it is not a stop word and does not occur inside string.punctuation
## (note: `in` on a string is a substring test).
tweets_clean = [
    word
    for word in tweet_tokens
    if word not in english_stopwords and word not in string.punctuation
]

print(tweets_clean)

['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


In [14]:
##STEMMING:

##PorterStemmer provides stem(word); it is applied to each cleaned token in turn.

stemmer = PorterStemmer()

stemmed_tokens = [stemmer.stem(token) for token in tweets_clean]

print(stemmed_tokens)

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


In [16]:
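# pip install utils   # NOTE: left commented out; process_tweet is defined directly in this notebook (see the cell starting with `def process_tweet`)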

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1


All of the steps above can be combined into a single function, `process_tweet`, which is defined in the next cell.

In [22]:
##Preprocess with a single function:

def process_tweet(tweet):
  """Clean, tokenize and stem a single tweet.

  Steps, in order:
    1. remove tokens that start with ``$`` (for example stock tickers),
    2. remove a leading "RT" marker,
    3. remove hyperlinks,
    4. remove ``#`` signs (the tag's text is kept),
    5. tokenize with TweetTokenizer (lower-cased, handles stripped,
       long character runs shortened),
    6. drop English stop words and punctuation tokens,
    7. stem the remaining tokens with PorterStemmer.

  Args:
    tweet: The raw tweet text (str).

  Returns:
    A list of stemmed tokens (str), in the order they appear in the tweet.
  """
  stemmer = PorterStemmer()
  # Build the set once so every membership test in the filter is a hash lookup.
  stopwords_english = set(stopwords.words('english'))

  tweet = re.sub(r'\$\w*', '', tweet)
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # Match only the link itself, up to the next whitespace. A greedy `.*` here
  # would also delete every word that follows the link on the same line.
  tweet = re.sub(r'https?://\S+', '', tweet)
  tweet = re.sub(r'#', '', tweet)

  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # `word not in string.punctuation` is a substring test against the
  # punctuation string, kept exactly as in the step-by-step cells.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]

# reuse the same example tweet as in the step-by-step cells
tweet = allPositiveTweets[2277]

# blank line, green color code, the raw tweet, then the blue color code, one per line
print('', '\033[92m', tweet, '\033[94m', sep='\n')

# run the process_tweet function defined in this notebook on the example
tweets_stem = process_tweet(tweet)

# heading followed by the resulting token list
print('preprocessed tweet:', tweets_stem, sep='\n')


[92m
My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i
[94m
preprocessed tweet:
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']
