In [46]:
# imports
import json
import re
import string
from pprint import pprint as print

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [47]:
# load data

with open('data/tweets.json', encoding='utf-8') as file_in:
    tweets = json.load(file_in)['tweets']

print(tweets[0])

{'conversation_id': '913599329369280513',
 'created_at': '2017-09-29',
 'data.public_metrics.like_count': 0,
 'data.public_metrics.quote_count': 0,
 'data.public_metrics.reply_count': 0,
 'data.public_metrics.retweet_count': 3,
 'day': 29,
 'in_reply_to_user_id': '',
 'lang': 'en',
 'month': 9,
 'possibly_sensitive': False,
 'reply_settings': 'everyone',
 'source': 'Twitter for iPhone',
 'text': 'RT @forty3north: And congratulations to our runners-up @CooltureUSA '
         'and Pneumaglide! #THEPITCH',
 'tweet_id': '913599329369280513',
 'user_id': '418101032',
 'year': 2017}


In [48]:
# clean unwanted fields
tweets_cleaned = []

for tweet in tweets:
    tweets_cleaned.append(
        dict(
            text=tweet['text'],
            tweet_id=tweet['tweet_id'],
            conversation_id=tweet['conversation_id'],
        )
    )

print(tweets_cleaned[:3])

[{'conversation_id': '913599329369280513',
  'text': 'RT @forty3north: And congratulations to our runners-up @CooltureUSA '
          'and Pneumaglide! #THEPITCH',
  'tweet_id': '913599329369280513'},
 {'conversation_id': '913599235077083136',
  'text': 'RT @MarnieLaVigne2: @Launch_NY client @CooltureUSA wins $5k! '
          'https://t.co/Sx38wpPw7l',
  'tweet_id': '913599235077083136'},
 {'conversation_id': '913599211425419265',
  'text': 'RT @AshleyroweWKBW: Congratulations to @ZandraBeauty for winning '
          'top prize at #ThePitch @forty3north ! A rising star. Our full story '
          'on Zan…',
  'tweet_id': '913599211425419265'}]


In [49]:
# remove duplicates
unique_tweet_ids = set()
duplicates = []

for tweet in tweets_cleaned:
    if tweet['tweet_id'] in unique_tweet_ids:
        print(f'Duplicate key: {tweet["tweet_id"]}')
        duplicates.append(tweet['tweet_id'])
        continue

    unique_tweet_ids.add(tweet['tweet_id'])

print(f'Original number of Tweets: {len(tweets_cleaned)}')
print(f'Number of unique TweetIDs: {len(unique_tweet_ids)}')
print(f'Number of duplicate TweetIDs: {len(duplicates)}')
print(f'Duplicate ratio: {len(duplicates) / len(tweets_cleaned) * 100}%')

'Original number of Tweets: 37'
'Number of unique TweetIDs: 37'
'Number of duplicate TweetIDs: 0'
'Duplicate ratio: 0.0%'


In [50]:
# word tokenization
tokenized_docs = [word_tokenize(tweet['text']) for tweet in tweets_cleaned]
print(tokenized_docs[0])

['RT',
 '@',
 'forty3north',
 ':',
 'And',
 'congratulations',
 'to',
 'our',
 'runners-up',
 '@',
 'CooltureUSA',
 'and',
 'Pneumaglide',
 '!',
 '#',
 'THEPITCH']


In [51]:
# sentence tokenization
sent_token = [sent_tokenize(tweet['text']) for tweet in tweets_cleaned]
print(sent_token[:3])

[['RT @forty3north: And congratulations to our runners-up @CooltureUSA and '
  'Pneumaglide!',
  '#THEPITCH'],
 ['RT @MarnieLaVigne2: @Launch_NY client @CooltureUSA wins $5k!',
  'https://t.co/Sx38wpPw7l'],
 ['RT @AshleyroweWKBW: Congratulations to @ZandraBeauty for winning top prize '
  'at #ThePitch @forty3north !',
  'A rising star.',
  'Our full story on Zan…']]


In [52]:
# lowercase
for tweet in tweets_cleaned:
    tweet['text'] = tweet['text'].lower()

In [53]:
# punctuation
punctuation_re = re.compile(f'[{re.escape(string.punctuation)}]')
tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = punctuation_re.sub('', token)
        if new_token != '':
            new_review.append(new_token)

    tokenized_docs_no_punctuation.append(new_review)

print(tokenized_docs_no_punctuation[0])

['RT',
 'forty3north',
 'And',
 'congratulations',
 'to',
 'our',
 'runnersup',
 'CooltureUSA',
 'and',
 'Pneumaglide',
 'THEPITCH']


In [54]:
# stopwords
tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if word not in stopwords.words('english'):
            new_term_vector.append(word)

    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords[0])

['RT',
 'forty3north',
 'And',
 'congratulations',
 'runnersup',
 'CooltureUSA',
 'Pneumaglide',
 'THEPITCH']


In [56]:
# stemming & lemmatization
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        final_doc.append(porter.stem(word))
        final_doc.append(wordnet.lemmatize(word))

    preprocessed_docs.append(final_doc)

print(preprocessed_docs[:3])

AttributeError: 'WordNetLemmatizer' object has no attribute 'Lemmatize'