# Walkthrough

In [1]:
import os
import random
import re
import subprocess

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from transformers import GPT2Model, GPT2Config

# 1. Data Gathering Methods

### METHOD 1
We first need to acquire our data. We choose the name of the account and the number of most recent tweets to acquire, and then make the request with snscrape. The raw data is then saved as a CSV file.

In [2]:
# Scrape parameters; both are interpolated into the snscrape command below
# and into the names of the files that are written afterwards.
username = "elonmusk"  # account used in the `from:` search query
tweet_count = 10300    # value passed to snscrape's --max-results

In [3]:
# Note: runtime is around 5 minutes.
# Run the snscrape CLI and capture its JSON-lines output in user-tweets.json.
# The arguments are passed as a list (no shell involved), so the search query
# needs no quoting, and check=True raises CalledProcessError when snscrape
# exits with a non-zero status instead of silently continuing and reading a
# stale or empty file below.
with open("user-tweets.json", "wb") as scrape_out:
    subprocess.run(
        [
            "snscrape",
            "--jsonl",
            "--max-results", str(tweet_count),
            "twitter-search", "from:{}".format(username),
        ],
        stdout=scrape_out,
        check=True,
    )

# Reads the json generated from the CLI command above and creates a pandas dataframe
tweets_df1 = pd.read_json('user-tweets.json', lines=True)

# Displays first 5 entries from dataframe
tweets_df1.head()

In [20]:
# Persist the raw scrape as CSV; the file name encodes the account and the
# requested tweet count.
file_name = "data/individual_sets/tweets_{user}_{count}_m1".format(
    user=username,
    count=tweet_count,
)
tweets_df1.to_csv(file_name + ".csv", sep=',', index=False)

### Method 2
This method was tested as an alternative scraping approach for comparison, but was later discontinued.

In [None]:
# Run the pip install command below if you don't already have the library
# !pip install git+https://github.com/JustAnotherArchivist/snscrape.git

# Run the below command if you don't already have Pandas
# !pip install pandas

# Imports
import snscrape.modules.twitter as sntwitter
import pandas as pd


In [None]:
# Creating a dataframe from the tweets list above
# NOTE(review): `tweets_list1` is not defined in any cell shown in this
# notebook, so this cell raises NameError on a fresh kernel — confirm where the
# list is supposed to be built.
# NOTE(review): this rebinds `tweets_df1`, replacing the Method 1 dataframe
# read from user-tweets.json.
tweets_df1 = pd.DataFrame(tweets_list1, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])
# Same list again, but without explicit column names.
tweets_df2 = pd.DataFrame(tweets_list1)
# Display first 5 entries from dataframe
tweets_df2.head()

In [None]:
# Export dataframe into a CSV
# NOTE(review): `file_name` is computed here but not used by the to_csv call
# below, which writes to the fixed path 'user-tweets2.csv'.
# NOTE(review): `maxTweets` is not defined in any cell shown in this notebook,
# and the name carries the `_m1` suffix although this is Method 2 — confirm.
file_name = "data/"+"tweets_{user}_{count}_m1".format(user = username, count = maxTweets)
tweets_df2.to_csv('user-tweets2.csv', sep=',', index=False)

In [None]:
# Display the column at position 2 of tweets_df1 for every row
# (the 'Text' column when tweets_df1 is the Method 2 dataframe built above).
tweets_df1.iloc[:, 2]
# raw_html = tweets_df1.iloc[0, 2]

In [None]:
# Clean-text experiment: take the value at row 0, column 2 of tweets_df1, print
# it as-is, then print the text BeautifulSoup extracts from it.
first_text = tweets_df1.iloc[0, 2]
print("original\n", first_text)

clean_text = BeautifulSoup(first_text, "lxml").get_text(strip=True)
print("filtered\n", clean_text)

# 2. Cleaning dataset
1. Remove all retweets
Filter text
1. Remove all URLs and picture links
2. Remove short tweets
3. Remove extra spaces 
4. Special character encoding

Save File 
1. Save to csv file

In [2]:
# Import the project filtering API
from api.filter import filter_manual, gen_input
import pandas as pd

# Parameters for this cleaning run
# file_name = file_name+".csv" #Insert file here
file_name = 'data/individual_sets/tweets_elonmusk_20000_m1.csv'  # raw scrape to clean
lang_choice = 'en'
length_min = 4  # minimum tweet length
# NOTE(review): the input path above is hard-coded (elonmusk / 20000), while the
# output names below are built from `username` and `tweet_count` set in the
# scraping section — confirm they describe the same dataset.

# Read the raw scrape and keep only the columns handed to the filter
data = pd.read_csv(file_name)
df = data[['date','renderedContent', 'lang', 'sourceLabel', 'outlinks','media','retweetedTweet','quotedTweet','inReplyToUser','place']].copy()

# Filter, then save as txt and csv
data_new = filter_manual(df, length_min, lang_choice)
data_new_lst = [data_new]  # gen_input is given a list of datasets
epochs = 4  # Shuffle parameter, see gen_input
data_save = gen_input(data_new_lst, epochs)

# TXT: the `with` block closes the file on exit, so no explicit close is needed
with open("data/individual_sets/"+"cleaned_{user}_{count}".format(user = username, count = tweet_count)+'.txt', 'w',encoding='utf-8') as f:
    f.write(data_save)

# CSV: one "Tweets" column holding the filtered tweets
# NOTE(review): this CSV goes to data/ while the txt above goes to
# data/individual_sets/ — confirm the directory downstream cells expect.
df_clean = pd.DataFrame(data_new,columns=["Tweets"])
file_name = "data/"+"cleaned_{user}_{count}_m1".format(user = username, count = tweet_count)
df_clean.to_csv(file_name+".csv", sep=',', index=False)

# Read the cleaned CSV back in. `file_name` has no extension at this point, so
# the ".csv" suffix used when writing must be appended here as well.
data = pd.read_csv(file_name + ".csv")


# 3.1 Politeness filtering: Unconstrained Model
Here, we can experiment with politeness filtering by reading in specific files and selecting the top x% of impolite or polite texts. This also lets us create specific politeness subsets of the data, which are then used in the unconstrained models.

In [None]:
# Import the project filtering and politeness APIs
from api.filter import filter_manual, gen_input, gen_input_special_tokens
from api.politeness import generate_politeness
import pandas as pd
# Stuff to run the first time (in a shell):
# python -m spacy download en_core_web_sm
# pip uninstall emoji
# pip install emoji==1.7

#Read in data 
############################################################
#PARAMETERS:
username = 'paddingtonbear'
# NOTE(review): tweet_count is 103000 here but file_name below says 10300;
# tweet_count only feeds the output file name — confirm which value is intended.
tweet_count = 103000
file_name = 'data/individual_sets/cleaned_paddingtonbear_10300_m1.csv'
lang_choice = 'en'
length_min = 4 #minimum tweet length, unused here but good to have in mind
#politeness params
corpus_train = 'wikipedia'
percentage_top_tweets = 0.5
polite_or_impolite = 'impolite'
############################################################



# Load the cleaned tweets and select a subset by politeness
df = pd.read_csv(file_name)
df_politeness = generate_politeness(df, corpus_train, percentage_top_tweets,polite_or_impolite) #choose by politeness
print("number of data points",len(df_politeness))
data_new = df_politeness["Tweets"]
data_new_lst = [data_new]  # gen_input is given a list of datasets
epochs = 4 #Shuffle parameter, see gen_input

data_save = gen_input(data_new_lst,epochs)
# The output name records the selection settings and the number of rows kept.
# The `with` block closes the file on exit, so no explicit close is needed.
with open("data/individual_sets/"+"{polite}_{user}_{count}_{perc}_finalnum{total}".format(polite = polite_or_impolite, user = username, count = tweet_count,perc=percentage_top_tweets,total=len(df_politeness))+'.txt', 'w',encoding='utf-8') as f:
    f.write(data_save)


We now have all the data for the unconstrained models. The next step is to generate the data for the constrained models, which will take the cleaned data we generated in Section 2. This will be done in the file "Full_Model_Processing_Constrained.ipynb".