# Part II: Cleaning

This Jupyter Notebook cleans the tweets to prepare for the third phase.

## Importing Libraries

In [65]:
import string
import copy

import pandas as pd
import nltk
import re

import IPython
from IPython.display import clear_output

# Emoji library (for demojization)
import emoji
emojis = list(emoji.EMOJI_DATA.keys())

# Language Detection
import spacy
import spacy_fastlang # is used
from spacy_langdetect import LanguageDetector

# Stopwords to remove
from nltk.corpus import stopwords as sw
stopwords = sw.words('English')
stopwords.remove('not')

## Pipeline Functions

In [66]:
def handle_emojis(lst):
    res = []
    for sentence in lst:
        words = sentence.split(' ')
        new_words = []
        for word in words:
            for char in word:
                if char in emojis:
                    word.replace(char, "")
            new_words.append(word)
        sentence = ' '.join(new_words)
        res.append(sentence)
    return res
        

def clean_sentences(lst):
    ''' Cleans up a list of tweets.
    Removes: Links, tags, retweets, emojis'''
    res = []
    for sentence in lst:
        sentence = sentence.lower()     # lower-cases sentence
        
        words = sentence.split(' ')
        new_words = copy.deepcopy(words)
        
        for word in words: # Iterates through each word in the tweet
            if len(word) == 0:
                new_words.remove(word)
            elif word[:4] == "http":    # Removes links
                new_words.remove(word)
            elif word[0] == "@":        # Removes tags
                new_words.remove(word)
            elif word in stopwords:     # Removes stopwords
                new_words.remove(word)
            elif word[:2] == "rt":      # Removes retweets
                new_words.remove(word)
            elif word[:2] == "\n":      # Removes line breaks
                new_words.remove(word)
                                        
        sentence = " ".join(new_words)
        # sentence = re.sub(string.punctuation, '', sentence) # Removes punctuation
        for chr in string.punctuation:
            sentence = sentence.replace(chr, "")
        
        res.append(sentence)
    return res

def clean_words(lst):
    ''' Removes: empty spaces'''
    res = []
    for sentence in lst:
        words = sentence.split(' ')
        new_words = []
        
        for word in words:
            for char in word: # iterates through each character in the tweet
                if char == " ":
                    word.replace(char, "")
            new_words.append(word)
            
        sentence = " ".join(new_words)
        res.append(sentence) 
    return res
    
def remove_non_english(lst):
    ''' Removes all content that is NOT english from a list of tweets'''
    langs = []
    res = copy.deepcopy(lst)
    
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("language_detector")
    
    for item in lst: # iterates through each sentence
        doc = nlp(item)
        lang = doc._.language
        if lang != 'en':
            res.remove(item)        # removes non-english sentences
        langs.append(lang)
    lang_labels['language'] = langs # secondary side-effect
    return res

def stem(lst):
    '''Stems words to use basic stem word (e.g turn instead of turning)'''
    ps = nltk.stem.PorterStemmer()
    res = []
    for sentence in lst:
        new_sentence = []
        for word in sentence.split():
            new_word = ps.stem(word)
            new_sentence.append(new_word)
        new_sentence = " ".join(new_sentence)
        res.append(new_sentence)
    return res

## Cleaning the Data

In [67]:
f = open('raw copy.txt')
raw = f.readlines()

data = pd.Series(raw)

In [68]:
lang_labels = pd.DataFrame(data, columns=['tweet content'])

data = handle_emojis(data)
data = clean_sentences(data)
# data = clean_words(data) # no need
data = remove_non_english(data)
data = stem(data)



In [69]:
# pd.Series(data).to_csv('clean.csv', index=False)

## Preparing Training Data (labeling sentiment)

This requires user interaction to label whether a tweet is positive, negative or neutral.
<br> (and alternatively, if a tweet is simply gibberish and should be removed.)

In [70]:
# TODO: how to handle negation?

labels = []
data_copy = copy.deepcopy(data)
for sentence in data_copy:
    if sentence == "": # TODO: this should be handled elsewhere....
        data.remove(sentence)
        continue
    inputValid = False
    
    print("===== "*8)
    print(f"Label the sentiment of the following tweets about an f1 driver / team: leclerc")
    print("1 : negative sentiment\n2 : positive sentiment\n0 : neutral / no sentiment\nX : erase this tweet\nSTOP : stop program")
    print("\nTweet:")
    print(sentence)
    print("===== "*8)
    
    while (not inputValid):
        inp = input("Input: ")
        if inp in ["0", "1", "2"]:
            inputValid = True
            labels.append(int(inp))
        elif inp.lower() == "x":
            inputValid = True
            data.remove(sentence)
        elif inp == "STOP":
            inputValid = True
            IPython.sys.exit()
        else:
            continue
    clear_output()

SyntaxError: f-string: expecting '}' (1469499442.py, line 12)

## Exporting

In [None]:
final = pd.DataFrame(data=[data, labels]).transpose()
final.columns = ['tweet content', 'sentiment']

final.to_excel('out.xlsx', index=False)