In [79]:
import pandas as pd
import spacy
import en_core_web_sm # Language model to be used for nlp()
nlp = en_core_web_sm.load()

In [80]:
# Load both follower exports the same way; each file is decoded with the
# "unicode_escape" codec that is passed through to pandas.
fox, cnn = (
    pd.read_csv(csv_path, encoding="unicode_escape")
    for csv_path in ("fox_followers.csv", "cnn_followers.csv")
)

In [90]:
# Preview the first rows of the Fox followers frame to check the loaded columns.
fox.head()

Unnamed: 0.1,Unnamed: 0,user_id,description,place_full_name,country,friends_count,account_lang,lang,created_at
0,1,948997266,Iowa State University,,,500,en,en,2019-02-01 10:58:26
1,2,2762605662,Long live Britain and America and all the true...,,,795,en,en,2019-01-31 21:29:32
2,3,1074363473080000512,"Intelligent, patients,loyal ,kind ,love ,compa...",,,380,en,en,2019-02-01 09:49:38
3,4,560692070,#LongLiveBigBen <U+0001F64F><U+0001F3FF>,,,842,en,en,2019-01-23 04:23:34
4,5,1088956155500490753,IM A SURVIVOR AKA A WALKING MIRACLE AKA A PHEN...,,,974,en,en,2019-02-01 09:57:34


In [89]:
# Preview the first rows of the CNN followers frame to check the loaded columns.
cnn.head()

Unnamed: 0.1,Unnamed: 0,user_id,description,place_full_name,country,friends_count,account_lang,lang,created_at
0,1,1089863600246849536,optimistic,,,341,en,en,2019-01-31 10:34:58
1,2,791969492427341824,fav YouTuber: @moesargi\r\nfav music artist: @...,,,730,en,en,2019-01-31 10:39:38
2,3,957862952261926913,Follower of Living god @donnypangilinan and th...,,,634,en,en,2019-01-31 10:24:58
3,4,816924548796792833,I love to help anyone that's Serious about Mak...,,,4790,en,en,2019-01-27 18:54:23
4,5,1052194171795243008,praising and worshiping my God makes me happy,,,258,en,en,2019-01-22 10:17:57


In [92]:
def return_NER(df, nlp_model=None):
    """Run Named Entity Recognition on every user description of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Follower table. The ``user_id`` and ``description`` columns are looked
        up by name; if a name is absent the function falls back to the
        historical positional layout
        ['Unnamed: 0', 'user_id', 'description', 'place_full_name', 'country',
        'friends_count', 'account_lang', 'lang', 'created_at'],
        i.e. column 1 for the id and column 2 for the description.
    nlp_model : callable, optional
        Object that turns a string into a document exposing ``.ents`` (each
        entity having ``.text`` and ``.label_``). Defaults to the module-level
        spaCy pipeline ``nlp``. If the object has a ``pipe`` method it is used
        to process all descriptions as one stream.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed 0..len(df)-1, with object-dtype columns
        ["user_id", "Description", "NER", "GPE", "LOC", "ORG", "NORP",
        "WORK_OF_ART", "Other"]:

        * "NER" holds every (text, label) pair in the order the model
          reported them;
        * "GPE", "LOC", "ORG", "NORP", "WORK_OF_ART" hold the distinct entity
          texts carrying that label;
        * "Other" holds the distinct (text, label) pairs of all other labels.

        A cell is left as NaN when nothing was found, holds the bare value
        when exactly one item was found, and holds the whole list/set
        otherwise.
    """
    model = nlp if nlp_model is None else nlp_model

    result_columns = ["user_id", "Description", "NER",
                      "GPE", "LOC", "ORG", "NORP", "WORK_OF_ART", "Other"]
    # Labels that get their own column:
    #   GPE         - countries, cities, states
    #   LOC         - locations other than GPEs
    #   ORG         - organizations
    #   NORP        - nationalities, religious or political groups
    #   WORK_OF_ART - titles of books, songs, etc. (can help determining hobbies)
    dedicated_labels = ("GPE", "LOC", "ORG", "NORP", "WORK_OF_ART")

    # Prefer column names so a CSV saved with or without a leading index
    # column cannot silently shift which field is read.
    user_ids = df["user_id"] if "user_id" in df.columns else df.iloc[:, 1]
    descriptions = (df["description"] if "description" in df.columns
                    else df.iloc[:, 2])

    # A missing description must not reach the model as the literal text "nan".
    texts = ["" if pd.isna(raw) else str(raw) for raw in descriptions]

    # Batch through the pipeline when it supports it; otherwise call per text.
    docs = model.pipe(texts) if hasattr(model, "pipe") else map(model, texts)

    records = []
    for user_id, description, doc in zip(user_ids, descriptions, docs):
        found = {"NER": []}  # raw NER keeps order and duplicates
        found.update((label, set()) for label in dedicated_labels)  # sets avoid duplicates
        found["Other"] = set()

        for ent in doc.ents:
            found["NER"].append((ent.text, ent.label_))
            if ent.label_ in dedicated_labels:
                found[ent.label_].add(ent.text)
            else:
                found["Other"].add((ent.text, ent.label_))

        record = {"user_id": user_id, "Description": description}
        for column, values in found.items():
            if len(values) == 1:
                # A lone result is stored as the bare value, not a container.
                record[column] = next(iter(values))
            elif values:
                record[column] = values
            # Empty results are omitted so the cell stays NaN.
        records.append(record)

    # Build the frame once instead of writing cell by cell; object dtype keeps
    # the ids, tuples, lists and sets exactly as stored.
    return pd.DataFrame(records, columns=result_columns,
                        index=range(len(df)), dtype=object)

In [91]:
# Extract entities for every Fox follower description and persist the result.
NER_fox = return_NER(fox)
# NOTE(review): to_csv also writes the row index as a leading unnamed column
# by default; confirm downstream readers expect that extra column.
NER_fox.to_csv("NER_fox.csv")

In [93]:
# Extract entities for every CNN follower description and persist the result.
NER_cnn = return_NER(cnn)
# NOTE(review): to_csv also writes the row index as a leading unnamed column
# by default; confirm downstream readers expect that extra column.
NER_cnn.to_csv("NER_cnn.csv")

In [94]:
# Display the CNN entity table to eyeball which labels the model assigned.
NER_cnn

Unnamed: 0,user_id,Description,NER,GPE,LOC,ORG,NORP,WORK_OF_ART,Other
0,1089863600246849536,optimistic,,,,,,,
1,791969492427341824,fav YouTuber: @moesargi\r\nfav music artist: @...,"[(YouTuber, PERSON), (@5sos, ORG), (@sarah J.,...",,,@5sos,,,"{(@sarah J., PERSON), (YouTuber, PERSON)}"
2,957862952261926913,Follower of Living god @donnypangilinan and th...,"[(@donnypangilinan, GPE), (@KissesDelavin, PER...",@donnypangilinan,,,,,"{(DonKiss, MONEY), (@KissesDelavin, PERSON)}"
3,816924548796792833,I love to help anyone that's Serious about Mak...,"[(Serious, GPE), (#socialmedia #, MONEY)]",Serious,,,,,"(#socialmedia #, MONEY)"
4,1052194171795243008,praising and worshiping my God makes me happy,,,,,,,
5,1054963261098078209,Dnnmndnjnj<U+0923><U+0927><U+0927><U+0927><U+0...,"(Dnnmndnjnj, ORG)",,,Dnnmndnjnj,,,
6,420585058,"Everton following, movie watching, book loving...","(Everton, ORG)",,,Everton,,,
7,3133777988,utes 22 <U+0392>T<U+03A0>,,,,,,,
8,1046770645823033344,A new and permanent fixture on the professiona...,"[(the 9th - 11th August 2019, DATE), (dc@zeuse...",,,dc@zeusevents.co.uk,,,"(the 9th - 11th August 2019, DATE)"
9,1085016029560897537,BORN IN 01.09.2000\r\nLIVES IN UDUVIL :JAFFNA:...,"[(01.09.2000, CARDINAL), (UDUVIL, GPE), (JAFFN...","{SRI LANKA, JAFFNA, UDUVIL}",,,,,"{(01.09.2000, CARDINAL), (J/KOKUVIL HINDU COLL..."
