# Dissertation code base (Version 1.1)

## Title: Natural Language Processing and Security - Scraping

### Created by: Samuel William Mason
### Student no: 210418060

# Import libraries

In [1]:
# Primary libraries
import pandas as pd
import numpy as np
import string 
import timeit
import snscrape.modules.twitter as sntwitter
import nest_asyncio
nest_asyncio.apply()
import twint

# Scraping tweets - search for educational tweets from related individuals

In [2]:
def scrape_individual_normal(): # Search for educational tweets from related individuals
    """Scrape tweets that mention UK universities and save them to CSV.

    One search is run per university name through
    ``sntwitter.TwitterSearchScraper`` with the query filters
    ``near:"Leeds" within:700km since:2017-10-01 until:2022-07-01``.
    For each name, the results with index 0..2000 are kept. Every row is
    given ``Status`` 0 (presumably the non-phishing label, going by the
    output file name). Duplicate tweet ids are counted, reported and
    dropped, and the frame is written to
    ``non_phishing_tweets_individual.csv`` in the working directory.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    # Highest result index kept per search (i.e. at most 2001 tweets per tag)
    last_index_kept = 2000

    # University names used as search terms
    education_tags = ['University of Oxford', 'University of Cambridge', 'Imperial College London',
                      'University of Edinburgh', 'University of Manchester', 'King’s College London',
                      'London School of Economics', 'University of Bristol', 'The University of Warwick',
                      'Newcastle University', 'University of Glasgow']

    # Creating list to append scraped tweet data to
    twitter_tweets_list = []

    # The tweet counter has its own name, distinct from the tag loop variable
    for tag in education_tags:
        query = '%s near:"Leeds" within:700km since:2017-10-01 until:2022-07-01' % tag
        # Using TwitterSearchScraper to scrape data and append tweets to list
        for tweet_index, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if tweet_index > last_index_kept:
                break
            # One row per tweet; the trailing 0 is the Status label
            twitter_tweets_list.append([tweet.date, tweet.id, tweet.user.username, tweet.content,
                                        tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
                                        tweet.replyCount, tweet.media, tweet.lang, tweet.outlinks,
                                        tweet.source, 0])

    # Creating a dataframe from the list above
    df = pd.DataFrame(twitter_tweets_list, columns=['Datetime', 'Tweet Id', 'Username', 'Text', 'Retweet',
                                                    'Like', 'Quote', 'Reply', 'Media', 'Language',
                                                    'Outlinks', 'Source', 'Status'])

    # Print dataframe credentials
    print("\n Higher education dataset")

    # Report how many rows share a tweet id, then keep the first occurrence of each
    print("\n Total no of duplicate tweets: " + str(df.duplicated(subset='Tweet Id').sum()) + "\n")
    df = df.drop_duplicates(subset=['Tweet Id'])

    # Print head of dataframe
    print(df.head())

    # Final check for NaN values: printed so the result is visible when run inside a function
    print("\n Columns containing NaN values: \n" + str(df.isna().any()) + "\n")

    # Save data to CSV
    df.to_csv('non_phishing_tweets_individual.csv')

# Scraping tweets - search for educational tweets from verified university accounts

In [3]:
def scrape_university_normal(): # Search for educational tweets from universities
    """Scrape tweets posted by a fixed list of university accounts and save them to CSV.

    One search is run per account handle through
    ``sntwitter.TwitterSearchScraper`` with the query
    ``from:<handle> since:2017-10-01 until:2022-07-01``. For each handle,
    the results with index 0..2000 are kept. Every row is given ``Status`` 0
    (presumably the non-phishing label, going by the output file name).
    Duplicate tweet ids are counted, reported and dropped, and the frame is
    written to ``non_phishing_tweets_university.csv`` in the working
    directory.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    # Highest result index kept per search (i.e. at most 2001 tweets per handle)
    last_index_kept = 2000

    # Twitter handles of the university accounts to scrape
    education_tags = ['UniofOxford', 'Cambridge_Uni', 'imperialcollege', 'EdinburghUni', 'OfficialUoM',
                      'KingsCollegeLon', 'LSEnews', 'BristolUni', 'warwickuni', 'UniofNewcastle', 'UofGlasgow']

    # Creating list to append scraped tweet data to
    twitter_tweets_list = []

    # The tweet counter has its own name, distinct from the handle loop variable
    for handle in education_tags:
        query = 'from:%s since:2017-10-01 until:2022-07-01' % handle
        # Using TwitterSearchScraper to scrape data and append tweets to list
        for tweet_index, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if tweet_index > last_index_kept:
                break
            # One row per tweet; the trailing 0 is the Status label
            twitter_tweets_list.append([tweet.date, tweet.id, tweet.user.username, tweet.content,
                                        tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
                                        tweet.replyCount, tweet.media, tweet.lang, tweet.outlinks,
                                        tweet.source, 0])

    # Creating a dataframe from the list above
    df = pd.DataFrame(twitter_tweets_list, columns=['Datetime', 'Tweet Id', 'Username', 'Text', 'Retweet',
                                                    'Like', 'Quote', 'Reply', 'Media', 'Language',
                                                    'Outlinks', 'Source', 'Status'])

    # Print dataframe credentials
    print("\n Higher education dataset")

    # Report how many rows share a tweet id, then keep the first occurrence of each
    print("\n Total no of duplicate tweets: " + str(df.duplicated(subset='Tweet Id').sum()) + "\n")
    df = df.drop_duplicates(subset=['Tweet Id'])

    # Print head of dataframe
    print(df.head())

    # Final check for NaN values: printed so the result is visible when run inside a function
    print("\n Columns containing NaN values: \n" + str(df.isna().any()) + "\n")

    # Save data to CSV
    df.to_csv('non_phishing_tweets_university.csv')

# Scraping tweets - search for educational phishing tweets based upon known phishing language and suspicious domains

In [4]:
def scrape_domain_phishing(): # Search for educational phishing tweets based upon known phishing language and suspicious domains
    """Scrape tweets that pair education keywords with listed domains and save them to CSV.

    For each keyword prefix (``university`` first, then ``higher education``)
    and each domain in the list, the query ``"<prefix> <domain>"`` is run
    through ``sntwitter.TwitterSearchScraper``. For each query, the results
    with index 0..80 are kept. Every row is given ``Status`` 1 (presumably
    the phishing label, going by the output file name). Duplicate tweet ids
    are counted, reported and dropped, and the frame is written to
    ``phishing_tweets_domain.csv`` in the working directory.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    # Highest result index kept per search (i.e. at most 81 tweets per query)
    last_index_kept = 80

    # Keyword prefixes, in the order the searches are run; the order matters
    # because de-duplication below keeps the first occurrence of a tweet id
    keyword_prefixes = ('university', 'higher education')

    # Education-related domains treated as phishing indicators.
    # NOTE(review): 'academia.edu' is included here, so tweets matching it are
    # labelled Status 1 as well - confirm that this is intended.
    phishing_tags = [
        'academia-marcial.cu.ma', 'academia.edu', 'academiacge.com.br', 'academiadederechonotarial.org',
        'academiadoacucar.com.br', 'academiadovolei.org', 'academiaencore.com', 'academialtacostura.com.ve',
        'academiapatriciadias.com.br', 'academiasion.com', 'academiatech.club', 'academic.ie',
        'academichighkundal.com', 'academiclogo.wademcdonald.com', 'academicmiracles.com',
        'academicoptionsireland.com', 'academy.prosv.ru', 'academyforgirls.com', 'academymediaworks.com',
        'academy.mdbrasil.com.br', 'universitylanguageschool.com', 'educationalplanet.cu.ma',
        'educationbdinfo.com', 'educationequalityalliance.org.au', 'educationgrantapproval.com',
        'educationjobbd.com', 'educationpremise.com', 'hilltopschools.com', 'scholarship.ps',
        'school-my-class.ru', 'school.cyfrovychok.ua', 'school37-vlg.ru', 'school6serp.ru',
        'school8.kvz.kubannet.ru', 'schoolbellsystems.com', 'schoolboyish-apport.000webhostapp.com',
        'schoolbricks.in', 'schoolfoodshare.com', 'schoolkutti.com', 'schoolnotes.com',
        'schooloflyceum.com', 'schoolofmastery.org', 'schoolofselfawareness.com', 'schoolofskills.pro',
        'schoolscap.com', 'schoolspamers.com', 'schoolstore.co.kr', 'owa-university.website',
        'sanfranciscostateuniversity1.godaddysites.com',
        'universityofthephilippines-onlineservices.weebly.com',
    ]

    # Creating list to append scraped tweet data to
    twitter_tweets_list = []

    for prefix in keyword_prefixes:
        for tag in phishing_tags:
            query = '%s %s' % (prefix, tag)
            # Using TwitterSearchScraper to scrape data and append tweets to list
            for tweet_index, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
                if tweet_index > last_index_kept:
                    break
                # One row per tweet; the trailing 1 is the Status label
                twitter_tweets_list.append([tweet.date, tweet.id, tweet.user.username, tweet.content,
                                            tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
                                            tweet.replyCount, tweet.media, tweet.lang, tweet.outlinks,
                                            tweet.source, 1])

    # Creating a dataframe from the list above
    df = pd.DataFrame(twitter_tweets_list, columns=['Datetime', 'Tweet Id', 'Username', 'Text', 'Retweet',
                                                    'Like', 'Quote', 'Reply', 'Media', 'Language',
                                                    'Outlinks', 'Source', 'Status'])

    # Print dataframe credentials
    print("\n Phishing dataset")

    # Report how many rows share a tweet id, then keep the first occurrence of each
    print("\n Total no of duplicate tweets: " + str(df.duplicated(subset='Tweet Id').sum()) + "\n")
    df = df.drop_duplicates(subset=['Tweet Id'])

    # Print head of dataframe
    print(df.head())

    # Final check for NaN values: printed so the result is visible when run inside a function
    print("\n Columns containing NaN values: \n" + str(df.isna().any()) + "\n")

    # Save data to CSV
    df.to_csv('phishing_tweets_domain.csv')

# Scraping tweets - search for educational phishing tweets based upon known phishing language and suspicious links

In [5]:
def scrape_URL_phishing(): # Search for educational phishing tweets based upon known phishing language and suspicious links
    """Scrape tweets that pair education keywords with listed domains/URLs and save them to CSV.

    For each keyword prefix (``university`` first, then ``higher education``)
    and each entry in the domain/URL list, the query ``"<prefix> <entry>"``
    is run through ``sntwitter.TwitterSearchScraper``. For each query, the
    results with index 0..80 are kept. Every row is given ``Status`` 1
    (presumably the phishing label, going by the output file name).
    Duplicate tweet ids are counted, reported and dropped, and the frame is
    written to ``phishing_tweets_url.csv`` in the working directory.

    Returns:
        None. Progress is printed and the result is written to disk.
    """
    # Highest result index kept per search (i.e. at most 81 tweets per query)
    last_index_kept = 80

    # Keyword prefixes, in the order the searches are run; the order matters
    # because de-duplication below keeps the first occurrence of a tweet id
    keyword_prefixes = ('university', 'higher education')

    # Education-related domains and full URLs treated as phishing indicators.
    # NOTE(review): 'academia.edu' is included here, so tweets matching it are
    # labelled Status 1 as well - confirm that this is intended.
    phishing_tags = [
        'academia-marcial.cu.ma', 'academia.edu', 'academiacge.com.br', 'academiadederechonotarial.org',
        'academiadoacucar.com.br', 'academiadovolei.org', 'academiaencore.com', 'academialtacostura.com.ve',
        'academiapatriciadias.com.br', 'academiasion.com', 'academiatech.club', 'academic.ie',
        'academichighkundal.com', 'academiclogo.wademcdonald.com', 'academicmiracles.com',
        'academicoptionsireland.com', 'academy.prosv.ru', 'academyforgirls.com', 'academymediaworks.com',
        'academy.mdbrasil.com.br', 'universitylanguageschool.com', 'educationalplanet.cu.ma',
        'educationbdinfo.com', 'educationequalityalliance.org.au', 'educationgrantapproval.com',
        'educationjobbd.com', 'educationpremise.com', 'hilltopschools.com', 'scholarship.ps',
        'school-my-class.ru', 'school.cyfrovychok.ua', 'school37-vlg.ru', 'school6serp.ru',
        'school8.kvz.kubannet.ru', 'schoolbellsystems.com', 'schoolboyish-apport.000webhostapp.com',
        'schoolbricks.in', 'schoolfoodshare.com', 'schoolkutti.com', 'schoolnotes.com',
        'schooloflyceum.com', 'schoolofmastery.org', 'schoolofselfawareness.com', 'schoolofskills.pro',
        'schoolscap.com', 'schoolspamers.com', 'schoolstore.co.kr', 'http://clickbankuniversity.cu.ma',
        'https://owa-university.website', 'https://sanfranciscostateuniversity1.godaddysites.com',
        'https://login-inc.is-a-student.com', 'http://allnineoverseaseducation.com',
        'http://confirm-your-account.6eeducation.com', 'http://education.calvaryhospital.org',
        'http://educationjobbd.com', 'http://ekatvameducation.com',
    ]

    # NOTE: the list above could additionally be extended with one stripped
    # line per entry read from "phishing-links-ACTIVE-TODAY.txt" (utf8);
    # that step is currently disabled.

    # Creating list to append scraped tweet data to
    twitter_tweets_list = []

    for prefix in keyword_prefixes:
        for tag in phishing_tags:
            query = '%s %s' % (prefix, tag)
            # Using TwitterSearchScraper to scrape data and append tweets to list
            for tweet_index, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
                if tweet_index > last_index_kept:
                    break
                # One row per tweet; the trailing 1 is the Status label
                twitter_tweets_list.append([tweet.date, tweet.id, tweet.user.username, tweet.content,
                                            tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
                                            tweet.replyCount, tweet.media, tweet.lang, tweet.outlinks,
                                            tweet.source, 1])

    # Creating a dataframe from the list above
    df = pd.DataFrame(twitter_tweets_list, columns=['Datetime', 'Tweet Id', 'Username', 'Text', 'Retweet',
                                                    'Like', 'Quote', 'Reply', 'Media', 'Language',
                                                    'Outlinks', 'Source', 'Status'])

    # Print dataframe credentials
    print("\n Phishing dataset")

    # Report how many rows share a tweet id, then keep the first occurrence of each
    print("\n Total no of duplicate tweets: " + str(df.duplicated(subset='Tweet Id').sum()) + "\n")
    df = df.drop_duplicates(subset=['Tweet Id'])

    # Print head of dataframe
    print(df.head())

    # Final check for NaN values: printed so the result is visible when run inside a function
    print("\n Columns containing NaN values: \n" + str(df.isna().any()) + "\n")

    # Save data to CSV
    df.to_csv('phishing_tweets_url.csv')

# Pipeline function (Run to scrape all Twitter data)

In [6]:
def pipeline(): # Run all batch processing of data in one pipeline
    """Run the four scraping batches in order and print how long each one took.

    The batches run in this order: ``scrape_individual_normal``,
    ``scrape_university_normal``, ``scrape_domain_phishing``,
    ``scrape_URL_phishing``. Each batch is timed on its own, so the
    "Time taken for batch N" lines report that batch's duration rather than
    the running total since the pipeline started. The overall elapsed time
    is printed last.

    Returns:
        None. All results are printed.
    """
    separator = "-----------------------------------------------------------------------------"

    # Batches in execution order: two normal datasets, then two phishing datasets
    batches = (scrape_individual_normal, scrape_university_normal,
               scrape_domain_phishing, scrape_URL_phishing)

    # Start time (a timer reading, only meaningful as a difference)
    start = timeit.default_timer()

    print("\n Scraping commencing \n")

    batch_durations = []
    for run_batch in batches:
        print(separator)
        batch_start = timeit.default_timer()
        run_batch()
        # Duration of this batch alone, not the time since the pipeline started
        batch_durations.append(timeit.default_timer() - batch_start)
    print(separator)

    print("\n Scraping finished \n")

    # Calculate total elapsed time
    elapsed_final = timeit.default_timer() - start

    # Print timings
    print("Start time: " + str(start) + " Seconds (S)")
    for batch_number, duration in enumerate(batch_durations, start=1):
        print("Time taken for batch " + str(batch_number) + ": " + str(duration) + " Seconds (S)")
    print("Total elasped time: " + str(elapsed_final) + " Seconds (S)")

# Call pipeline

In [7]:
# Run the full scrape only when executed directly (notebook cell or script),
# so importing these definitions does not start the network scrape.
if __name__ == "__main__":
    pipeline()


 Scraping commencing 

-----------------------------------------------------------------------------

 Higher education dataset

 Total no of duplicate tweets: 17

                   Datetime             Tweet Id         Username  \
0 2022-06-29 04:59:30+00:00  1542009841832677376        GuyOxford   
1 2022-06-22 07:10:37+00:00  1539506125988581376       CaroBarnes   
2 2022-06-22 06:06:21+00:00  1539489950286200833        GuyOxford   
3 2022-06-20 15:36:51+00:00  1538908748491673602  utkarsh_amitabh   
4 2022-06-19 14:23:53+00:00  1538527995685634054    KathrynMcGurk   

                                                Text  Retweet  Like  Quote  \
0  Just posted a photo @ Oriel College, Universit...        0     0      0   
1  Box fresh bow tie and bands ready for Encaenia...        0     2      0   
2  Just posted a photo @ Exeter College, Universi...        0     0      0   
3  Blessed to be sharing the @netcapglobal story ...        1     8      0   
4  Excited to be spending this