In [7]:
import string
import copy

import pandas as pd
import tweepy
import nltk
import re

import IPython
from IPython.display import clear_output

# Emoji library (for demojization)
import emoji
emojis = list(emoji.EMOJI_DATA.keys())

# Language Detection
import spacy
import spacy_fastlang # is used
from spacy_langdetect import LanguageDetector

# Stopwords to remove
from nltk.corpus import stopwords as sw
stopwords = sw.words('English')
stopwords.remove('not')

In [8]:
## Private keys & secrets to authorize Tweepy client
acc = open("access.txt", "r")

# consumer key & secret
api_key = acc.readline()
api_secret = acc.readline()

# access token/key & secret
access_key = acc.readline()
access_secret = acc.readline()

# bearer token
bearer = acc.readline()

In [9]:
def handle_emojis(lst):
    res = []
    for sentence in lst:
        clean = emoji.demojize(sentence)
        res.append(clean)
    return res
        

def clean_sentences(lst):
    ''' Cleans up a list of tweets.
    Removes: Links, tags, retweets, emojis'''
    res = []
    for sentence in lst:
        sentence = sentence.lower()
        sentence = emoji.demojize(sentence)
        
        words = sentence.split(' ')
        new_words = copy.deepcopy(words)
        
        for word in words:
            if len(word) == 0:
                new_words.remove(word)
            elif word[:4] == "http":
                new_words.remove(word)
            elif word[0] == "@":
                new_words.remove(word)
            elif word in stopwords:
                new_words.remove(word)
            elif word[:2] == "rt":
                new_words.remove(word)
            elif word[:2] == "\n":
                new_words.remove(word)
        
        sentence = " ".join(new_words)
        res.append(sentence)
    return res

def clean_words(lst):
    ''' Removes: punctuation and unwanted characters'''
    res = []
    for sentence in lst:
        words = sentence.split(' ')
        new_words = []
        
        for word in words:
            for char in word:
                if char in string.punctuation:
                    word.replace(char, "")
                if char == " ":
                    word.replace(char, "")
            new_words.append(word)
            
        sentence = " ".join(new_words)
        res.append(sentence) 
    return res
    
def remove_all_non_en(lst):
    ''' Removes all content that is NOT english from a list of tweets'''
    langs = []
    res = copy.deepcopy(lst)
    
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("language_detector")
    
    for item in lst:
        doc = nlp(item)
        lang = doc._.language
        if lang != 'en':
            res.remove(item)
        langs.append(lang)
    lang_labels['language'] = langs
    return res

In [10]:
client = tweepy.Client(
    bearer_token=bearer,
    access_token=access_key,
    access_token_secret=access_secret,
    consumer_key=api_key,
    consumer_secret=api_secret
)

In [11]:
GET_NEW_TWEETS = False

if GET_NEW_TWEETS:
    tweets = set()
    recent = client.search_recent_tweets(query="leclerc", max_results=100)
    
    for item in recent[0]:
        tweets.add(item.text)
    data = list(tweets)
    pd.Series(data).to_csv('raw.csv', index=False)
else:
    tweets = pd.read_csv('raw.csv')
    data = list(tweets['0'])

In [12]:
data = handle_emojis(data)
data = clean_words(data)
data = clean_sentences(data)
# TODO: dont forget stemming

lang_labels = pd.DataFrame(data, columns=['tweet content'])
data = remove_all_non_en(data)



In [13]:
pd.Series(data).to_csv('clean.csv', index=False)

In [14]:
# Driver / Team sentiment
# Final pipeline before actual NLP

# TODO: how to handle negation?

labels = []
data_copy = copy.deepcopy(data)
for sentence in data_copy:
    if sentence == "": # TODO: this should be handled elsewhere....
        data.remove(sentence)
        continue
    inputValid = False
    
    print("="*24)
    print("Label the sentiment of the following tweets about an f1 driver / team.")
    print("1 = negative sentiment\n2 = positive sentiment\n0 = neutral / no sentiment\nX = erase this tweet\nSTOP = stop program")
    print(sentence)
    print("= "*24)
    
    while (not inputValid):
        inp = input("Input: ")
        if inp in ["0", "1", "2"]:
            inputValid = True
            labels.append(int(inp))
        elif inp.lower() == "x":
            inputValid = True
            data.remove(sentence)
        elif inp == "STOP":
            inputValid = True
            IPython.sys.exit()
        else:
            continue
    clear_output()

Label the sentiment of the following tweets about an f1 driver / team.
1 = negative sentiment
2 = positive sentiment
0 = neutral / no sentiment
X = erase this tweet
STOP = stop program

= = = = = = = = = = = = = = = = = = = = = = = = 


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
final = pd.DataFrame(data=[data, labels]).transpose()
final.columns = ['tweet content', 'sentiment']