# Create Emotion label

This notebook will create a label on the full "trumptweets.csv" data set. 
The text of Donald Trump´s tweets is going to be emotionally labeled. 
Therefore the code of the notebook "Label Test.ipynb" will be used. 
This notebook was been used to create a test set of data, which I used for some testing of visualization and classification. 


## Imports

In [1]:
# libraries Im going to use
import pandas as pd
from IPython.display import Image
import os
from langdetect import detect
import time
from ibm_watson import ApiException

## Functions 

In [2]:
### Emotino classification function
# function to classify text (tweet, one tweet)
def emotion_classification(Text):
    """
    This function runs the Watson Natural Language Understanding API
    
    Input:
    - Text, which is going to be classifed into specific emotion
    
    Output:
    - list
    -- most important - emotion of the document
    """
    response = natural_language_understanding.analyze(
        text= Text, # inpute of the function
        features=Features(
            emotion=EmotionOptions(document=True),
            entities=EntitiesOptions(emotion=True, sentiment=True, limit=2),
            keywords=KeywordsOptions(emotion=True, sentiment=True,
                                     limit=2))
    ).get_result()
    return(response)

In [3]:
### get max emotion
def get_max_emotion(emotions_dict):
    """
    This function runs the classification/emotion with it´s given probability
    
    Input:
    - emotions_dict
    
    Output:
    - emotion_max_probability
    -- just returns the emotion with the highest probability
    - max_probability
    -- the probability the emotion come along with
    """
    
    # find max value and label
    max_probability = max(emotions_dict.values())  

    # getting all emotions containing the "max_probability" as string
    emotion_max_probability = [k for k, v in emotions_dict.items() if v == max_probability][0] 
    
    # return
    return(emotion_max_probability,max_probability)


In [4]:
### drop rows with pictures and links at the beginning of the tweet
def drop_pictures_links(df):
    """
    This function does removes the tweets (whole row), which start with a picture or link.
    Reason: Often it´s just the picture or the link, this causes a problem by analyzing the "text"
    ALSO: check the language of the tweet, if it´s not englisch the tweet will be removed
    
    Input:
    - df: the data frame with all unprocessed data
    
    Output:
    - df: without the rows which start with a picture or link
    - drop_list: list of the row index which got dropped
    """
    # init list of all the rows which are going to be dropped
    drop_list = []
    
    # init values for the loop
    # check if tweets starts with picture or link
    start_picture_tweet = "pic.twitter.com"
    start_picture_tweet_length = len(start_picture_tweet)
    start_link_tweet = "https://"
    start_link_tweet_length = len(start_link_tweet)
    start_hashtag = "#"
    start_hashtag_length = len(start_hashtag)
    
    # run throguh every row and check which rows start with a picture or link
    for i in range(df.shape[0]):
        
        ## Flag for the tweet 1398
        maually = [284, # 284 = "USA! USA! USA!"
                   1398, # 1398 = "..."
                   306, # CONGRATULATIONS!
                   498, # Tremendous Spirit!
                   1382, # THANK YOU #IACP2019 !
                   1577, #Congratulations @StLouisBlues
                   1602, # America First!
                   1699, #Sleepy Joe Biden!
                   2047] #THANK YOU!
        if(i in maually): 
            drop_list.append(i)
            print(i, "manually")
            
        # check language
        #if detect(df.text[i][0:start_link_tweet_length]) != "en": 
        elif(detect(data.text[i]) == "ar" or detect(data.text[i]) == "he" or detect(data.text[i]) == "fa"):
            drop_list.append(i)
            print(i, "lang", detect(data.text[i]), data.text[i])
        # check picture
        if df.text[i][0:start_picture_tweet_length] == start_picture_tweet:
            drop_list.append(i)
            print(i, "pic")
            
        # check for link
        if df.text[i][0:start_link_tweet_length] == start_link_tweet:
            drop_list.append(i)
            print(i, "link")
        # check for #
        if df.text[i][0:start_hashtag_length] == start_hashtag:
            drop_list.append(i)
            print(i, "#")
        
        
     
    # drop rows which start with pic or link
    df_new = df.drop(drop_list)
    print(df_new)
    # return list of rows to drop
    return(df_new, drop_list)

### Remove links at the end of the tweet

In [5]:
def remove_link(tweet):
    # remove the link at the end of the text
    if "https://" in tweet:
        # setting the maxsplit parameter to 1, will return a list with 2 elements!
        # access just the first element
        new_string = tweet.split("https://",1)[0]
        return(new_string)
    else:
        return(tweet)
def remove_pic(tweet):
    if "pic.twitter." in tweet:
        # setting the maxsplit parameter to 1, will return a list with 2 elements!
        # access just the first element
        new_string = tweet.split("pic.twitter",1)[0]
        return(new_string)
    else:
        return(tweet)
def remove_hashtag(tweet):
    if "#" in tweet:
        # setting the maxsplit parameter to 1, will return a list with 2 elements!
        # access just the first element
        new_string = tweet.split("#",1)[0]
        return(new_string)
    else:
        return(tweet)

### Main function

In [6]:
def create_emotion(df):
    """
    The function does transform the data frame, so that just important information are going to be used. 
    It also adds the emotion and probability of the emotion as column.
    
    Input:
    - df: this is going to be the data frame as we get it from the GitHub function
    
    Output:
    - df_new: data frame we important information, plus enmotion and emotion probability
    """
    
    # tranformation of the data frame
    # data frame just with relevant informantion (username, date, retweets, favorites)
    trump_tweets = df.iloc[:,0:5]

    # drop rows of the data frame which start with picture or link
    trump_tweets, rowindex_dropped = drop_pictures_links(trump_tweets) # --- function
    # reindex data frame - easier to run the loop
    trump_tweets = trump_tweets.reset_index()
    # init emotion and its probabilites as list - save the results  - going to be return values
    emotions_list = []
    emotions_prob_list = []
    
    # loop over every sentence in the data frame
    ##########################################################################################
    # Here happens the magic
    ##########################################################################################
    # This loop will take very long for the whole data set
    for i in range(trump_tweets.shape[0]):
        
        # print after every 100 classification to see process
        if i in range(0,trump_tweets.shape[0],10):
            print(i, "emotional classifications done!")

        # run the function with Watson API 
        
        # clean the tweet, remove link, picture and hashtag at the end 
        clean_tweet = remove_pic(remove_link(remove_hashtag(trump_tweets.text[i])))
        
        # the main function - create emotions of the cleaned text
        emotions = emotion_classification(clean_tweet) # --- function

        ### --- flag - start ---
        # check if there does a emotion exist
        # else classify with "emotionless, 0,0%"
        if("emotion" in emotions.keys()):
        ### --- flag - end ---

            # access just the emotion of the document (of the tweet)
            emotions_dict = emotions["emotion"]["document"]["emotion"]

            # get the max emotion value
            # get_max_emotion(emotions_dict) 
            emotion_max_probability,max_probability = get_max_emotion(emotions_dict) # --- function 

            # save results for each step
            emotions_list.append(emotion_max_probability)
            emotions_prob_list.append(max_probability)

        # classify emotion with "emotionsless"
        else:
            emotions_list.append("emotionless")
            emotions_prob_list.append(0.0)
    ##########################################################################################       
    # assign emotions and its probabilites to data frame
    trump_tweets = trump_tweets.assign(emotion = emotions_list)
    trump_tweets = trump_tweets.assign(emotion_probability = emotions_prob_list)
    
    # return new data frame
    return(trump_tweets)

## Set up of the API

In [7]:
### Liberaries
# libraries I need to import
import json

# if import does not work
# pip install --upgrade "ibm-watson>=4.2.1"
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, EntitiesOptions, KeywordsOptions, EmotionOptions

In [8]:
### API set up
# I saved all my important data into a json file
# this file I open here to save the apikey and url in a variable
# since other people should not see my keys
with open('watson_api.json') as json_file:
    # save data in dict
    api_access = json.load(json_file)

# init variables neeeded 
apikey = api_access["apikey"]
url = api_access["url"]

In [9]:
### API settings
# settings for the api 
authenticator = IAMAuthenticator(apikey)
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator)

natural_language_understanding.set_service_url(url)

## The data

In [10]:
# load the tweets
data = pd.read_csv("trumptweets.csv", sep=';')

### Check the data


In [11]:
data.head()

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,realDonaldTrump,09.02.20 00:47,13459,72445,A great coach and a fantastic guy. His endorse...,,,,"1,22629E+18",https://twitter.com/realDonaldTrump/status/122...
1,realDonaldTrump,08.02.20 22:08,47880,215503,Pete Rose played Major League Baseball for 24 ...,,,,"1,22625E+18",https://twitter.com/realDonaldTrump/status/122...
2,realDonaldTrump,08.02.20 20:48,9452,37402,Total and complete Endorsement for Debbie Lesk...,,#NAME?,,"1,22623E+18",https://twitter.com/realDonaldTrump/status/122...
3,realDonaldTrump,08.02.20 20:40,17545,62484,Governor Cuomo wanted to see me this weekend. ...,,,,"1,22623E+18",https://twitter.com/realDonaldTrump/status/122...
4,realDonaldTrump,08.02.20 20:01,27437,120598,We will not be touching your Social Security o...,,,,"1,22622E+18",https://twitter.com/realDonaldTrump/status/122...


In [12]:
data.tail()

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
10245,realDonaldTrump,07.01.17 16:02,24681,87739,Having a good relationship with Russia is a go...,,,,"8,17748E+17",https://twitter.com/realDonaldTrump/status/817...
10246,realDonaldTrump,07.01.17 13:03,16601,73661,Only reason the hacking of the poorly defended...,,,,"8,17703E+17",https://twitter.com/realDonaldTrump/status/817...
10247,realDonaldTrump,07.01.17 12:56,15401,60280,Intelligence stated very strongly there was ab...,,,,"8,17701E+17",https://twitter.com/realDonaldTrump/status/817...
10248,realDonaldTrump,07.01.17 04:53,13961,59218,Gross negligence by the Democratic National Co...,,,,"8,1758E+17",https://twitter.com/realDonaldTrump/status/817...
10249,realDonaldTrump,07.01.17 01:07,6657,42476,Happy Birthday @EricTrump ! https://www. faceb...,,#NAME?,,"8,17523E+17",https://twitter.com/realDonaldTrump/status/817...


In [13]:
data.shape

(10250, 10)

### Test data

one more small test before I run the whole data set!

In [14]:
# one more small test 
# create a test set of 200 tweets
data_test = data.iloc[0:400,:]
data_test

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,realDonaldTrump,09.02.20 00:47,13459,72445,A great coach and a fantastic guy. His endorse...,,,,"1,22629E+18",https://twitter.com/realDonaldTrump/status/122...
1,realDonaldTrump,08.02.20 22:08,47880,215503,Pete Rose played Major League Baseball for 24 ...,,,,"1,22625E+18",https://twitter.com/realDonaldTrump/status/122...
2,realDonaldTrump,08.02.20 20:48,9452,37402,Total and complete Endorsement for Debbie Lesk...,,#NAME?,,"1,22623E+18",https://twitter.com/realDonaldTrump/status/122...
3,realDonaldTrump,08.02.20 20:40,17545,62484,Governor Cuomo wanted to see me this weekend. ...,,,,"1,22623E+18",https://twitter.com/realDonaldTrump/status/122...
4,realDonaldTrump,08.02.20 20:01,27437,120598,We will not be touching your Social Security o...,,,,"1,22622E+18",https://twitter.com/realDonaldTrump/status/122...
...,...,...,...,...,...,...,...,...,...,...
395,realDonaldTrump,11.01.20 00:32,15163,90211,Will be interviewed tonight by Laura @Ingraham...,,#NAME?,,"1,21578E+18",https://twitter.com/realDonaldTrump/status/121...
396,realDonaldTrump,10.01.20 16:37,24488,120462,She will go down as perhaps the least successf...,,,,"1,21566E+18",https://twitter.com/realDonaldTrump/status/121...
397,realDonaldTrump,10.01.20 16:31,18077,78224,I love constantly proving them wrong. It’s eas...,,,,"1,21566E+18",https://twitter.com/realDonaldTrump/status/121...
398,realDonaldTrump,10.01.20 16:11,16815,86594,“I’ve been doing this for 40 years and I’ve ne...,,#NAME?,,"1,21565E+18",https://twitter.com/realDonaldTrump/status/121...


# Check function *create_emotion(df)*

In [52]:
test_text = data_test["text"][280:310]
test_text

280    I will NEVER allow our great Second Amendment ...
281    It was exactly three years ago today, January ...
282    ...And they say you can add 7% to 10% to all T...
283    Two stone cold losers from Amazon WP. Almost e...
284                                       USA! USA! USA!
285    Cryin’ Chuck Schumer is now asking for “fairne...
286    They didn’t want John Bolton and others in the...
287    The Democrat Party in the Great Commonwealth o...
288    A great show! Check it out tonight at 9pm. @Fo...
289    “In the House, the President got less due proc...
290    I was thrilled to be back in the Great State o...
291    Now Mini Mike Bloomberg is critical of Jack Wi...
292    I have never seen the Republican Party as Stro...
293    I will be going to Austin, Texas. Leaving soon...
294                           pic.twitter.com/Efot9QLGDj
295    “Nancy Pelosi said, it’s not a question of pro...
296    If you listened to the flawed advice of @paulk...
297                     ....BUT

In [62]:
fail_index = []
fail_text = []
for i in test_text.index:
    try:
        print(i)
        emotion_classification(test_text[i])
    except ApiException as ex:
        print(i)
        print(test_text[i])
        fail_index.append(i)
        fail_text.append(test_text[i])
        #print ("Method failed with status code " + str(ex.code) + ": " + ex.message)

280
281
282
283
284


ERROR:root:not enough text for language id
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: 51395ac2-8963-4ed8-a23b-0c8775aae223


284
USA! USA! USA!
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306


ERROR:root:unsupported text language: unknown
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: unsupported text language: unknown, Code: 400 , X-global-transaction-id: b731dae5-df86-498a-a409-1c907c0da6a0


306
CONGRATULATIONS! #GeauxTigers pic.twitter.com/Axc9ezAdar
307


ERROR:root:unsupported text language: fa
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: unsupported text language: fa, Code: 400 , X-global-transaction-id: 028867f7-ac5a-4745-b3b5-503d020d995a


307
مردم نجیب ایران، که آمریکا را دوست می دارند، سزاوار دولتی هستند که بیش از تمرکز بر کشتن آنها به جرم احترام خواهی، به آنها کمک کند تا به رؤیاهایشان دست یابند. رهبران ایران به جای آن که ایران را به سمت ویرانی بکشانند، باید هراس افکنی را کنار بنهند و ایران را دوباره باعظمت کنند! https:// twitter.com/khamenei_ir/st atus/1218141834842660864 …
308
309


# Check Tweets - picture, link and non-english

**Links** - just the beginning

In [15]:
def check_tweets_function(data):
    # init values for the loop
    # check if tweets starts with picture or link
    start_picture_tweet = "pic.twitter.com"
    start_picture_tweet_length = len(start_picture_tweet)
    start_link_tweet = "https://"
    start_link_tweet_length = len(start_link_tweet)

    # init variables for picture and link case
    index_pictures = []
    index_link = []

    # init variables for the non-english case
    amount_english_tweets = 0
    amount_non_english_tweets = 0
    index_non_english_tweets = []

    for i in range(len(data.text)):
        if(i == 1398): # very ugly way, don´t case
            i = i +1
        # check picture
        if data.text[i][0:start_picture_tweet_length] == start_picture_tweet:
            index_pictures.append(i)
        # check for link
        if data.text[i][0:start_link_tweet_length] == start_link_tweet:
            index_link.append(i)
        # remove if case tweet is Arabic, Hebrew or Persian
        if(detect(data.text[i]) == "ar" or detect(data.text[i]) == "he" or detect(data.text[i]) == "fa"): 
            index_non_english_tweets.append(i)
    
    # show results
    print("Amount pictures:", len(index_pictures))
    print("Amount links:", len(index_link))
    print("Non-english:", len(index_non_english_tweets))
    # results
    return({"index_picture": index_pictures,
           "index_links": index_link,
           "index_non_english":index_non_english_tweets})

In [17]:
check_tweet = check_tweets_function(data_test)

Amount pictures: 26
Amount links: 7
Non-english: 8


In [18]:
print("Totel items:", sum(map(len, check_tweet.values())))

Totel items: 41


In [19]:
#for i in index_pictures:
#    print(i,data.text[i])  
#for i in index_link:
#    print(i, data.text[i])
for i in index_non_english_tweets:
    print(i, detect(data.text[i]), data.text[i])

NameError: name 'index_non_english_tweets' is not defined

# Check Tweets test - picture, link and non-english

**Links** - just the beginning

In [536]:
check_tweet_test = check_tweets_function(data_test)

Amount pictures: 26
Amount links: 7
Non-english: 8


In [537]:
print("Totel items:", sum(map(len, check_tweet_test.values())))

Totel items: 41


# Make emotional classification - Test set

In [538]:
# create data frame 
start = time.time()
data_emotion = create_emotion(data_test)
runtime_test = time.time()
print(runtime_test)

13 pic
23 pic
34 pic
38 link
41 pic
43 pic
44 pic
48 #
49 pic
58 pic
105 pic
106 pic
107 pic
108 pic
111 pic
112 pic
117 pic
139 link
142 lang ar هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعاصمة في أجزاء من القدس الشرقية. pic.twitter.com/CFuYwwjSso
144 lang he תמיד אעמוד לצד מדינת ישראל והעם היהודי. אני תומך בחוזקה בבטיחותם ובטחונם ובזכותם לחיות במולדתם ההיסטורית. הגיע הזמן לשלום! pic.twitter.com/AtNnQtnGZs
145 link
156 link
157 pic
167 pic
187 pic
189 lang fa وزیر امور خارجه ایران می گوید ایران خواستار مذاکره با ایالات متحده است اما می خواهد که تحریم ها برداشته شود. @FoxNews @OANN نه، مرسی! https:// twitter.com/realDonaldTrum p/status/1221225245220265985 …
205 link
208 #
222 pic
223 pic
225 pic
278 link
284 manually
294 pic
307 lang fa مردم نجیب ایران، که آمریکا را دوست می دارند، سزاوار دولتی هستند که بیش از تمرکز بر کشتن آنها به جرم احترام خواهی، به آنها کمک کند تا به رؤیاهایشان دست یابند. رهبران ایران به جای آن که ایران را به سمت ویرانی بکشانند، باید هراس افکنی را کنار بنهند و ایرا

ERROR:root:unsupported text language: unknown
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: unsupported text language: unknown, Code: 400 , X-global-transaction-id: 06f3f0f367cb04a4fb06f47b48493add


ApiException: Error: unsupported text language: unknown, Code: 400 , X-global-transaction-id: 06f3f0f367cb04a4fb06f47b48493add

In [505]:
data_emotion

Unnamed: 0,index,username,date,retweets,favorites,text,emotion,emotion_probability
0,0,realDonaldTrump,09.02.20 00:47,13459,72445,A great coach and a fantastic guy. His endorse...,joy,0.601324
1,1,realDonaldTrump,08.02.20 22:08,47880,215503,Pete Rose played Major League Baseball for 24 ...,sadness,0.320811
2,2,realDonaldTrump,08.02.20 20:48,9452,37402,Total and complete Endorsement for Debbie Lesk...,joy,0.453622
3,3,realDonaldTrump,08.02.20 20:40,17545,62484,Governor Cuomo wanted to see me this weekend. ...,sadness,0.462204
4,4,realDonaldTrump,08.02.20 20:01,27437,120598,We will not be touching your Social Security o...,anger,0.648076
...,...,...,...,...,...,...,...,...
261,295,realDonaldTrump,19.01.20 03:12,38035,163633,"“Nancy Pelosi said, it’s not a question of pro...",disgust,0.699125
262,296,realDonaldTrump,19.01.20 01:52,18457,77993,If you listened to the flawed advice of @paulk...,sadness,0.563830
263,297,realDonaldTrump,19.01.20 01:52,37245,191859,....BUT THE BEST IS YET TO COME!,joy,0.730271
264,298,realDonaldTrump,19.01.20 00:18,26120,128836,"A massive 200 Billion Dollar Sea Wall, built a...",sadness,0.524536


In [506]:
data_emotion.shape

(266, 8)

In [507]:
for i in range(data_emotion.shape[0]):
    print("---")
    print(remove_pic(remove_link(data_emotion.text[i])))

---
A great coach and a fantastic guy. His endorsement of me in Indiana was a very big deal! 
---
Pete Rose played Major League Baseball for 24 seasons, from 1963-1986, and had more hits, 4,256, than any other player (by a wide margin). He gambled, but only on his own team winning, and paid a decades long price. GET PETE ROSE INTO THE BASEBALL HALL OF FAME. It’s Time!
---
Total and complete Endorsement for Debbie Lesko! @RepDLesko Love Arizona. 
---
Governor Cuomo wanted to see me this weekend. He just canceled. Very hard to work with New York - So stupid. All they do is sue me all the time! 
---
We will not be touching your Social Security or Medicare in Fiscal 2021 Budget. Only the Democrats will destroy them by destroying our Country’s greatest ever Economy!
---
...the worst weeks ever.” She could have had a much better week if Crazy Nancy, who is the most overrated person in politics (going to lose the House a second time), didn’t bring the phony & corrupt Impeachment Hoax. Dems ca

In [508]:
test_test = data_emotion

In [509]:
for i in range(data_emotion.shape[0]):
    test_test.at[i , "text"] = remove_pic(remove_link(data_emotion.text[i]))

In [510]:
test_test

Unnamed: 0,index,username,date,retweets,favorites,text,emotion,emotion_probability
0,0,realDonaldTrump,09.02.20 00:47,13459,72445,A great coach and a fantastic guy. His endorse...,joy,0.601324
1,1,realDonaldTrump,08.02.20 22:08,47880,215503,Pete Rose played Major League Baseball for 24 ...,sadness,0.320811
2,2,realDonaldTrump,08.02.20 20:48,9452,37402,Total and complete Endorsement for Debbie Lesk...,joy,0.453622
3,3,realDonaldTrump,08.02.20 20:40,17545,62484,Governor Cuomo wanted to see me this weekend. ...,sadness,0.462204
4,4,realDonaldTrump,08.02.20 20:01,27437,120598,We will not be touching your Social Security o...,anger,0.648076
...,...,...,...,...,...,...,...,...
261,295,realDonaldTrump,19.01.20 03:12,38035,163633,"“Nancy Pelosi said, it’s not a question of pro...",disgust,0.699125
262,296,realDonaldTrump,19.01.20 01:52,18457,77993,If you listened to the flawed advice of @paulk...,sadness,0.563830
263,297,realDonaldTrump,19.01.20 01:52,37245,191859,....BUT THE BEST IS YET TO COME!,joy,0.730271
264,298,realDonaldTrump,19.01.20 00:18,26120,128836,"A massive 200 Billion Dollar Sea Wall, built a...",sadness,0.524536


# More tests

Here I wi
ll go and look through every tweet, and check manually if this will occure an error!

In [659]:
test_df = data.iloc[0:3000,:]
# drop rows of the data frame which start with picture or link
trump_tweets_test, rowindex_dropped_test = drop_pictures_links(test_df) # --- function
# reindex data frame - easier to run the loop
trump_tweets_test = trump_tweets_test.reset_index()

13 pic
23 pic
34 pic
38 link
41 pic
43 pic
44 pic
48 #
49 pic
58 pic
105 pic
106 pic
107 pic
108 pic
111 pic
112 pic
117 pic
139 link
142 lang ar هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعاصمة في أجزاء من القدس الشرقية. pic.twitter.com/CFuYwwjSso
144 lang he תמיד אעמוד לצד מדינת ישראל והעם היהודי. אני תומך בחוזקה בבטיחותם ובטחונם ובזכותם לחיות במולדתם ההיסטורית. הגיע הזמן לשלום! pic.twitter.com/AtNnQtnGZs
145 link
156 link
157 pic
167 pic
187 pic
189 lang fa وزیر امور خارجه ایران می گوید ایران خواستار مذاکره با ایالات متحده است اما می خواهد که تحریم ها برداشته شود. @FoxNews @OANN نه، مرسی! https:// twitter.com/realDonaldTrum p/status/1221225245220265985 …
205 link
208 #
222 pic
223 pic
225 pic
278 link
284 manually
294 pic
306 manually
307 lang fa مردم نجیب ایران، که آمریکا را دوست می دارند، سزاوار دولتی هستند که بیش از تمرکز بر کشتن آنها به جرم احترام خواهی، به آنها کمک کند تا به رؤیاهایشان دست یابند. رهبران ایران به جای آن که ایران را به سمت ویرانی بکشانند، باید هراس افکنی را کنار

In [660]:
for i in range(1840,2986):
    print("---")
    clean_tweet = remove_pic(remove_link(remove_hashtag(trump_tweets_test.text[i])))
    print(i, clean_tweet)
    emotion_classification(clean_tweet)

---
1840 They never even saw the transcript of the call. A total Witch Hunt!
---
1841 Pelosi, Nadler, Schiff and, of course, Maxine Waters! Can you believe this?
---
1842 Such an important day at the United Nations, so much work and so much success, and the Democrats purposely had to ruin and demean it with more breaking news Witch Hunt garbage. So bad for our Country!
---
1843 The Democrats are so focused on hurting the Republican Party and the President that they are unable to get anything done because of it, including legislation on gun safety, lowering of prescription drug prices, infrastructure, etc. So bad for our Country!
---
1844 ....You will see it was a very friendly and totally appropriate call. No pressure and, unlike Joe Biden and his son, NO quid pro quo! This is nothing more than a continuation of the Greatest and most Destructive Witch Hunt of all time!
---
1845 I am currently at the United Nations representing our Country, but have authorized the release tomorrow of th

---
1885 Oh no, really big political news, perhaps the biggest story in years! Part time Mayor of New York City, @BilldeBlasio , who was polling at a solid ZERO but had tremendous room for growth, has shocking dropped out of the Presidential race. NYC is devastated, he’s coming home!
---
1886 “The U.S. Economy is the envy of the world, as Europe and Asia slide ever toward recession. But the Left is trying to avoid talking about the Trump Economy.” @IngrahamAngle The Best Is Yet To Come.
---
1887 Great news. @MariaBartiromo just renewed her deal with Fox. I don’t care how much they paid her, they got a beautiful bargain. Congratulations to both!
---
1888 Nice meeting with Mark Zuckerberg of @Facebook in the Oval Office today. 
---
1889 Because of my Administration, drug prices are down for the first time in almost 50 years — but the American people need Congress to help. I like Sen. Grassley’s drug pricing bill very much, and it’s great to see Speaker Pelosi’s bill today. Let’s get it d

ERROR:root:not enough text for language id
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: 43cf1c72fa0f3989befcbc671f1fa657


ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: 43cf1c72fa0f3989befcbc671f1fa657

In [661]:
trump_tweets_test.text[1895]

'Thank you! pic.twitter.com/hScbURTzMJ'

In [657]:
emotion_classification("thank you")

ERROR:root:not enough text for language id
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: bad7baef68472df6d9921367c5a913bb


ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: bad7baef68472df6d9921367c5a913bb

In [663]:
for i in range(0,10000):
    if (data.text[i] == trump_tweets_test.text[1895]):
        print(i, data.text[i])

2111 Thank you! pic.twitter.com/hScbURTzMJ


In [655]:
for i in range(2040,2050):
    print("--")
    print(i, data.text[i])

--
2040 Secretary of State Pompeo recieved permission from Ukraine Government to release the transcript of the telephone call I had with their President. They don’t know either what the big deal is. A total Witch Hunt Scam by the Democrats!
--
2041 pic.twitter.com/1KOSnHguW2
--
2042 PRESIDENTIAL HARASSMENT!
--
2043 They never even saw the transcript of the call. A total Witch Hunt!
--
2044 Pelosi, Nadler, Schiff and, of course, Maxine Waters! Can you believe this?
--
2045 Such an important day at the United Nations, so much work and so much success, and the Democrats purposely had to ruin and demean it with more breaking news Witch Hunt garbage. So bad for our Country!
--
2046 The Democrats are so focused on hurting the Republican Party and the President that they are unable to get anything done because of it, including legislation on gun safety, lowering of prescription drug prices, infrastructure, etc. So bad for our Country!
--
2047 THANK YOU! pic.twitter.com/Ne2LOSAWpX
--
2048 ....

# Make emotional classification - Full data set

### Check how long the cell runs to make all classifications

In [567]:
# Make emotional classification
# create data frame 
start = time.time()
new_data = create_emotion(data)
end = time.time()
runtime = round((end - start),2)

13 pic
23 pic
34 pic
38 link
41 pic
43 pic
44 pic
48 #
49 pic
58 pic
105 pic
106 pic
107 pic
108 pic
111 pic
112 pic
117 pic
139 link
142 lang ar هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعاصمة في أجزاء من القدس الشرقية. pic.twitter.com/CFuYwwjSso
144 lang he תמיד אעמוד לצד מדינת ישראל והעם היהודי. אני תומך בחוזקה בבטיחותם ובטחונם ובזכותם לחיות במולדתם ההיסטורית. הגיע הזמן לשלום! pic.twitter.com/AtNnQtnGZs
145 link
156 link
157 pic
167 pic
187 pic
189 lang fa وزیر امور خارجه ایران می گوید ایران خواستار مذاکره با ایالات متحده است اما می خواهد که تحریم ها برداشته شود. @FoxNews @OANN نه، مرسی! https:// twitter.com/realDonaldTrum p/status/1221225245220265985 …
205 link
208 #
222 pic
223 pic
225 pic
278 link
284 manually
294 pic
306 manually
307 lang fa مردم نجیب ایران، که آمریکا را دوست می دارند، سزاوار دولتی هستند که بیش از تمرکز بر کشتن آنها به جرم احترام خواهی، به آنها کمک کند تا به رؤیاهایشان دست یابند. رهبران ایران به جای آن که ایران را به سمت ویرانی بکشانند، باید هراس افکنی را کنار

ERROR:root:unsupported text language: unknown
Traceback (most recent call last):
  File "/Users/phillipholscher/opt/anaconda3/lib/python3.7/site-packages/ibm_cloud_sdk_core/base_service.py", line 229, in send
    response.status_code, error_message, http_response=response)
ibm_cloud_sdk_core.api_exception.ApiException: Error: unsupported text language: unknown, Code: 400 , X-global-transaction-id: 93784257f554a0f181ea2ccb32fda650


ApiException: Error: unsupported text language: unknown, Code: 400 , X-global-transaction-id: 93784257f554a0f181ea2ccb32fda650

## Test data 2 

In [450]:
# one more small test 
# create a test set of 200 tweets
data_test2 = data.iloc[0:1400,:]
data_test2.head(3)

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,realDonaldTrump,09.02.20 00:47,13459,72445,A great coach and a fantastic guy. His endorse...,,,,"1,22629E+18",https://twitter.com/realDonaldTrump/status/122...
1,realDonaldTrump,08.02.20 22:08,47880,215503,Pete Rose played Major League Baseball for 24 ...,,,,"1,22625E+18",https://twitter.com/realDonaldTrump/status/122...
2,realDonaldTrump,08.02.20 20:48,9452,37402,Total and complete Endorsement for Debbie Lesk...,,#NAME?,,"1,22623E+18",https://twitter.com/realDonaldTrump/status/122...


In [451]:
check_tweet_test2 = check_tweets_function(data_test2)
print("Totel items:", sum(map(len, check_tweet_test2.values())))

Amount pictures: 92
Amount links: 19
Non-english: 8
Totel items: 119




## Problem - Language
He post stuff in:
- Arabic
- Hebräisch

In [18]:
data_test.iloc[142,:]

username                                       realDonaldTrump
date                                            28.01.20 19:26
retweets                                                 11161
favorites                                                43754
text         هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعا...
geo                                                        NaN
mentions                                                   NaN
hashtags                                                   NaN
id                                                 1,22222E+18
permalink    https://twitter.com/realDonaldTrump/status/122...
Name: 142, dtype: object

## Solution - Problem - Language

### langdetect API

https://pypi.org/project/langdetect/

In [22]:
# import
from langdetect import detect

In [30]:
language_test_string1 = data_test.iloc[142,:].text
language_test_string2 = data_test.iloc[144,:].text
print(language_test_string1)
print(detect(language_test_string1))
print(language_test_string2)
print(detect(language_test_string2))

if detect(language_test_string1) != "en":
    print("Problem solved")

هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعاصمة في أجزاء من القدس الشرقية. pic.twitter.com/CFuYwwjSso
ar
תמיד אעמוד לצד מדינת ישראל והעם היהודי. אני תומך בחוזקה בבטיחותם ובטחונם ובזכותם לחיות במולדתם ההיסטורית. הגיע הזמן לשלום! pic.twitter.com/AtNnQtnGZs
he
Problem solved


### Amount of not english tweets


In [67]:
amount_english_tweets = 0
amount_non_english_tweets = 0
index_non_english_tweets = []

for index in range(len(data_test.text)):
    if(detect(data_test.text[index]) != "en"):
        amount_non_english_tweets = amount_non_english_tweets +1
        index_non_english_tweets.append(index)
    else:
        amount_english_tweets = amount_english_tweets +1 

English: 194
Other: 6


In [75]:
print("English:", amount_english_tweets)
print("Other:", amount_non_english_tweets)
print("Index other:", index_non_english_tweets)
for i in index_non_english_tweets:
    print(i, detect(data_test.text[i]), data_test.text[i])
    print("---")

English: 194
Other: 6
Index other: [60, 61, 142, 144, 145, 189]
60 de KEEP AMERICA GREAT!
---
61 so MAKE AMERICA GREAT AGAIN!
---
142 ar هذا ما قد تبدو عليه دولة فلسطين المستقبلية بعاصمة في أجزاء من القدس الشرقية. pic.twitter.com/CFuYwwjSso
---
144 he תמיד אעמוד לצד מדינת ישראל והעם היהודי. אני תומך בחוזקה בבטיחותם ובטחונם ובזכותם לחיות במולדתם ההיסטורית. הגיע הזמן לשלום! pic.twitter.com/AtNnQtnGZs
---
145 pl https://www. pscp.tv/w/cQBvvDFvTlFs TFJub1dwUXd8MW1ueGVReUVyVk54WA1zKet_7z8vzWFt8Te_Oj1SHWia5RzDd-o1zJWDHKC7 …
---
189 fa وزیر امور خارجه ایران می گوید ایران خواستار مذاکره با ایالات متحده است اما می خواهد که تحریم ها برداشته شود. @FoxNews @OANN نه، مرسی! https:// twitter.com/realDonaldTrum p/status/1221225245220265985 …
---


In [81]:
detect("make america grade again")

'pt'

## Preprocessing


Clean the text data

- Languages other than englisch
- Links 
-- at the *beginning* of the tweet *or* at the *end*
-- removed at the beginning
- Pictures

Don´t clean

- Hashtags
- tokenizing
- stop words
- lemma
- non-alphabetical characters

In [28]:
text = data.text

In [29]:
text[0]

'A great coach and a fantastic guy. His endorsement of me in Indiana was a very big deal! https:// twitter.com/kyle__boone/st atus/1226234981808250880 …'

In [20]:
for text in data_test.text:
    print(text)
    print("--")

A great coach and a fantastic guy. His endorsement of me in Indiana was a very big deal! https:// twitter.com/kyle__boone/st atus/1226234981808250880 …
--
Pete Rose played Major League Baseball for 24 seasons, from 1963-1986, and had more hits, 4,256, than any other player (by a wide margin). He gambled, but only on his own team winning, and paid a decades long price. GET PETE ROSE INTO THE BASEBALL HALL OF FAME. It’s Time!
--
Total and complete Endorsement for Debbie Lesko! @RepDLesko Love Arizona. https:// twitter.com/repdlesko/stat us/1225484090754899969 …
--
Governor Cuomo wanted to see me this weekend. He just canceled. Very hard to work with New York - So stupid. All they do is sue me all the time! https:// twitter.com/RepStefanik/st atus/1225494053913079808 …
--
We will not be touching your Social Security or Medicare in Fiscal 2021 Budget. Only the Democrats will destroy them by destroying our Country’s greatest ever Economy!
--
...the worst weeks ever.” She could have had a mu

In [21]:
# pre processing
# remove:
# - pictures (pic.twitter.com/KSVkKL76NM) - all (beginning and end)
# - link (https://...) - all  (beginning and end)
# - not english tweets (Arabic & Hebräisch)

In [22]:
import spacy

In [23]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "entityrecognizer"])

In [None]:
def preprocess(text):
    doc = nlp(text)
    
    # init an list to save the return
    result = []

    # is alpha: 
    # - Is the token an alpha character?
    
    for token in doc:
        # take just token without stop words & just alpha character 
        if token.is_stop == False and token.is_alpha == True: 
                result.append(token.lemma_)
    return result

In [38]:
# import library which is for cleaning 
import preprocessor as p
import re

In [54]:
text[2]

'Total and complete Endorsement for Debbie Lesko! @RepDLesko Love Arizona. https:// twitter.com/repdlesko/stat us/1225484090754899969 …'

In [55]:
clean_test = p.clean(text[2])
clean_test

'Total and complete Endorsement for Debbie Lesko! Love Arizona. https:// us/1225484090754899969'

In [56]:
clean_test2 = p.parse(text[2])
clean_test2.urls

[(83:109) => twitter.com/repdlesko/stat]

In [59]:
# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [60]:
re.sub(TEXT_CLEANING_RE, ' ', str(text[2]).lower())

'total and complete endorsement for debbie lesko repdlesko love arizona   twitter com repdlesko stat us 1225484090754899969 '

In [None]:
regex_str = [
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
]

In [49]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [46]:
str(text[0])

'A great coach and a fantastic guy. His endorsement of me in Indiana was a very big deal! https:// twitter.com/kyle__boone/st atus/1226234981808250880 …'

In [48]:
test = re.sub('https?://\S+|www\.\S+', '', text[0])
print(test)

A great coach and a fantastic guy. His endorsement of me in Indiana was a very big deal! https:// twitter.com/kyle__boone/st atus/1226234981808250880 …


In [50]:
clean_text(text[0])

NameError: name 'string' is not defined

## Make emotional classification

In [None]:
# create data frame 
data_new = create_emotion(data)