## Import Packages and Read Data

In [1]:
import re
import demoji
import emoji
import warnings
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew
from datetime import datetime
from datetime import date
from collections import OrderedDict
from sklearn.base import BaseEstimator, TransformerMixin
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# the libraries used below -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed instead of eliding the middle.
pd.set_option("display.max_columns", None)

In [2]:
# Load the combined account-metadata CSV (path is relative to the notebook's
# working directory).
df_metadata = pd.read_csv(filepath_or_buffer="final_combined_dataset.csv")

In [3]:
# Show the freshly loaded frame as the cell output.
df_metadata

Unnamed: 0.1,Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,0,18042464,[日向の乙女]ミルミクス,MILMIX,2008-12-11T06:18:59.000Z,False,False,https://t.co/AoobuYGh5y,https://pbs.twimg.com/profile_images/788469686...,日本,テイルズオブシリーズが好きだったり。モバマス貧弱一般人。Splatoon中毒患者。最近はFN...,1959,2708,405158,157,human,True
1,1,60286377,Tahnee Trotter,tahneetrotter,2009-07-26T11:25:30.000Z,False,False,http://t.co/P7D9W9j5nT,https://pbs.twimg.com/profile_images/488235175...,,Get frucked,371,0,33172,10,human,True
2,2,373791732,Kassandra Garcia,Sassy_Kassy37,2011-09-15T05:49:37.000Z,False,False,,https://pbs.twimg.com/profile_images/290477377...,,"If you have Motivatoin, Dedication and Faith i...",36,0,548,0,bot,True
3,3,2374895658,TonieSabella,TonieSabella,2014-03-06T06:01:31.000Z,False,False,http://t.co/yakUEMw5xd,https://pbs.twimg.com/profile_images/473894199...,Bulgaria,Ambient/Post Rock music created by Ben Leopard.,30,148,93,1,bot,True
4,4,17537004,Mihai Todor 🇺🇦,MihaiTodor,2008-11-21T14:28:51.000Z,False,False,https://t.co/Ab2jd2vhvV,https://pbs.twimg.com/profile_images/781079151...,"Dublin, Ireland",Principal Software Engineer interested in comp...,1089,2283,9098,34,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20284,20289,15367428,Paul Urmston,paulurmston,2008-07-09T16:38:23.000Z,False,False,,https://pbs.twimg.com/profile_images/161391553...,United Kingdom,,72,76,4161,1,human,True
20285,20290,17006395,Laura Zambelli,laura_trouble,2008-10-27T20:27:57.000Z,False,False,https://t.co/lFTff06wEg,https://pbs.twimg.com/profile_images/126498263...,,You heard that i was trouble but you couldn't ...,314,107,20915,9,human,True
20286,20291,22130752,The Stag Company,TheStagCompany,2009-02-27T11:31:33.000Z,False,False,http://t.co/GiEeJEGx48,https://pbs.twimg.com/profile_images/793413210...,"Brighton, UK","#Stagweekends, #stagdos and #stagnights. We do...",2981,2523,4991,46,human,True
20287,20292,15166188,kelsey lee 🖤,kelseyofzen,2008-06-19T04:57:53.000Z,False,False,,https://pbs.twimg.com/profile_images/147715462...,,cap ou pas cap?,170,180,7820,4,human,True


## ETL

In [4]:
def url(df):
    """Flag which rows have a profile URL.

    Adds a boolean ``has_url`` column that is True wherever ``url`` is not
    missing. The frame is modified in place and the same object is returned.
    """
    url_present = df["url"].notna()
    df["has_url"] = url_present
    return df

def username(df):
    """Add length and special-character features derived from ``username``.

    Two columns are added to ``df`` in place, and ``df`` is returned:

    * ``un_no_of_char`` -- number of characters in the username.
    * ``un_special_char`` -- True when the username contains at least one of
      ``@ _ ! # $ % ^ & * ( ) < > ? / | } { ~ :``.

    Missing usernames are treated as empty text (length 0, no special
    character) rather than being measured as the literal string "nan".
    """
    # Raw string: in a plain literal "\|" is an invalid escape sequence.
    special_char = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    # str() keeps non-string values (e.g. all-digit handles parsed as numbers) usable.
    handles = df["username"].apply(lambda value: "" if pd.isna(value) else str(value))
    df["un_no_of_char"] = handles.apply(len)
    df["un_special_char"] = handles.apply(lambda handle: special_char.search(handle) is not None)
    return df

def name(df):
    """Add length and special-character features derived from ``name``.

    Two columns are added to ``df`` in place, and ``df`` is returned:

    * ``name_no_of_char`` -- number of characters in the display name.
    * ``name_special_char`` -- True when the name contains at least one of
      ``@ _ ! # $ % ^ & * ( ) < > ? / | } { ~ :``.

    Missing names are treated as empty text (length 0, no special character)
    rather than being measured as the literal string "nan".
    """
    # Raw string: in a plain literal "\|" is an invalid escape sequence.
    special_char = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    # str() keeps non-string values usable, matching how present values were handled before.
    display_names = df["name"].apply(lambda value: "" if pd.isna(value) else str(value))
    df["name_no_of_char"] = display_names.apply(len)
    df["name_special_char"] = display_names.apply(lambda text: special_char.search(text) is not None)
    return df

def description(df):
    """Add features derived from the free-text ``description`` column.

    Four columns are added to ``df`` in place, and ``df`` is returned:

    * ``des_no_of_usertags`` -- number of ``@mention`` tokens.
    * ``des_no_of_hashtags`` -- number of ``#hashtag`` tokens that contain at
      least one ASCII letter.
    * ``des_external_links`` -- True when the text contains an http(s) link.
    * ``has_description`` -- True when ``description`` is not missing.

    Missing descriptions are scanned as empty text, so they contribute zero
    tags and no link.
    """
    # "\w+" rather than the previous all-optional tail: a bare "@" (as in
    # "mail me @ home") must not be counted as a mention.
    user_tags = re.compile(r'\B@\w+')
    hashtags = re.compile(r'\B#\w*[a-zA-Z]+\w*')
    links = re.compile(r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)')

    bios = df["description"].apply(lambda value: "" if pd.isna(value) else str(value))
    df["des_no_of_usertags"] = bios.apply(lambda bio: len(user_tags.findall(bio)))
    df["des_no_of_hashtags"] = bios.apply(lambda bio: len(hashtags.findall(bio)))
    # search() stops at the first hit; only existence matters here.
    df["des_external_links"] = bios.apply(lambda bio: links.search(bio) is not None)
    df["has_description"] = df["description"].notna()
    return df

def location(df):
    """Replace the raw ``location`` text with a has-location flag.

    After the call ``df["location"]`` is True where a location was present and
    False where it was missing. The frame is modified in place and returned.

    The function overwrites its own input column, so it is made safe to call
    again: a column that already holds flags (boolean dtype, or integers that
    are all 0/1) is left untouched. Without this guard a second call would see
    no missing values and mark every row True.
    """
    current = df["location"]
    already_flagged = pd.api.types.is_bool_dtype(current) or (
        pd.api.types.is_integer_dtype(current) and current.isin([0, 1]).all()
    )
    if already_flagged:
        return df
    df["location"] = current.notna()  # False = location missing; True = has location
    return df
  
def time(df, extraction_date=date(2022, 10, 8)):
    """Add account-age and tweet-rate features.

    Two columns are added to ``df`` in place, and ``df`` is returned:

    * ``account_age_in_days`` -- days between the calendar date in
      ``created_at`` and ``extraction_date``.
    * ``average_tweets_per_day`` -- ``tweet_count`` divided by that age.

    Parameters
    ----------
    df : DataFrame with ``created_at`` strings formatted like
        ``2008-12-11T06:18:59.000Z`` and a numeric ``tweet_count`` column.
    extraction_date : datetime.date, optional
        Reference date for the age calculation. Defaults to 2022-10-08, the
        date the data was extracted.

    Raises
    ------
    ValueError
        If a ``created_at`` value does not match the expected format.
    """
    def _age_in_days(created_at):
        created_on = datetime.strptime(created_at, '%Y-%m-%dT%H:%M:%S.%f%z').date()
        return (extraction_date - created_on).days

    df["account_age_in_days"] = df["created_at"].apply(_age_in_days)
    # An account created on the extraction date has age 0; dividing by it
    # would give inf/NaN, so such accounts are rated over one day instead.
    days_active = df["account_age_in_days"].where(df["account_age_in_days"] != 0, 1)
    df["average_tweets_per_day"] = df["tweet_count"] / days_active
    return df

def follow_count(df):
    """Add the followers x following interaction feature.

    Stores the element-wise product of ``followers_count`` and
    ``following_count`` in ``followers_following_count``. The frame is
    modified in place and returned.
    """
    followers = df["followers_count"]
    following = df["following_count"]
    df["followers_following_count"] = followers.mul(following)
    return df

def account_type(df):
    """Encode the label column as a 0/1 target.

    Adds ``isBot``: 1 where ``account_type`` equals the string 'bot', 0 for
    every other value (including missing ones). The frame is modified in
    place and returned.
    """
    is_bot = df["account_type"].eq('bot')
    df["isBot"] = is_bot.astype("int64")
    return df

def convert_PV(variable):
    """Turn the text forms of a flag into a real boolean.

    'True' and '1.0' become True, 'False' and '0.0' become False. Any other
    value is returned unchanged.
    """
    if variable in ('True', '1.0'):
        return True
    if variable in ('False', '0.0'):
        return False
    return variable

def convert_TF(variable):
    """Encode a flag as an integer: 1 if it equals True, otherwise 0."""
    # Equality rather than identity on purpose: 1 and 1.0 also compare equal
    # to True and therefore map to 1.
    if variable == True:
        return 1
    return 0

In [5]:
# Column groups used by the type clean-up steps below.
pv_var = ["verified", "protected"]
boolean_var = [
    "verified",
    "location",
    "un_special_char",
    "name_special_char",
    "des_external_links",
    "has_description",
    "protected",
    "has_url",
    "has_profile_image",
]
int_var = ["followers_count", "following_count", "tweet_count", "listed_count"]

# Each helper adds its columns to the frame and returns that same frame, so
# the feature steps can be chained in their original order.
df_metadata = (
    df_metadata
    .pipe(account_type)
    .pipe(username)
    .pipe(name)
    .pipe(description)
    .pipe(location)
    .pipe(url)
    .pipe(time)
    .pipe(follow_count)
)

# Step 1: map the text forms 'True'/'False'/'1.0'/'0.0' to booleans.
for var in pv_var:
    df_metadata[var] = df_metadata[var].apply(convert_PV)

# Step 2: encode every flag column as 1/0 (runs after step 1 so that
# "verified" and "protected" are already booleans).
for var in boolean_var:
    df_metadata[var] = df_metadata[var].apply(convert_TF)

# Step 3: cast the count columns to integers.
for var in int_var:
    df_metadata[var] = df_metadata[var].astype(int)

In [6]:
# Columns left out of the exported dataset.
dropped_columns = [
    "Unnamed: 0",
    "url",
    "id",
    "created_at",
    "name",
    "username",
    "description",
    "profile_image_url",
    "account_type",
]
new_df_metatdata = df_metadata.drop(columns=dropped_columns)

In [7]:
# Show the frame that remains after the column drop as the cell output.
new_df_metatdata

Unnamed: 0,protected,verified,location,followers_count,following_count,tweet_count,listed_count,has_profile_image,isBot,un_no_of_char,un_special_char,name_no_of_char,name_special_char,des_no_of_usertags,des_no_of_hashtags,des_external_links,has_description,has_url,account_age_in_days,average_tweets_per_day,followers_following_count
0,0,0,1,1959,2708,405158,157,1,0,6,0,12,0,0,0,0,1,1,5049,80.245197,5304972
1,0,0,0,371,0,33172,10,1,0,13,0,14,0,0,0,0,1,1,4822,6.879303,0
2,0,0,0,36,0,548,0,1,1,13,1,16,0,0,0,0,1,0,4041,0.135610,0
3,0,0,1,30,148,93,1,1,1,12,0,12,0,0,0,0,1,1,3138,0.029637,4440
4,0,0,1,1089,2283,9098,34,1,0,10,0,14,0,0,0,1,1,1,5069,1.794831,2486187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20284,0,0,1,72,76,4161,1,1,0,11,0,12,0,0,0,0,0,0,5204,0.799577,5472
20285,0,0,0,314,107,20915,9,1,0,13,1,14,0,0,0,1,1,1,5094,4.105811,33598
20286,0,0,1,2981,2523,4991,46,1,0,14,0,16,0,0,3,1,1,1,4971,1.004023,7521063
20287,0,0,0,170,180,7820,4,1,0,11,0,12,0,0,0,0,1,0,5224,1.496937,30600


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
new_df_metatdata.describe()

Unnamed: 0,protected,verified,location,followers_count,following_count,tweet_count,listed_count,has_profile_image,isBot,un_no_of_char,un_special_char,name_no_of_char,name_special_char,des_no_of_usertags,des_no_of_hashtags,des_external_links,has_description,has_url,account_age_in_days,average_tweets_per_day,followers_following_count
count,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0,20289.0
mean,0.121839,0.052787,0.669575,164447.5,3901.567,35548.24,408.099068,0.975307,0.276012,10.738035,0.142343,11.472966,0.044803,0.159545,0.214599,0.058258,0.8443,0.524126,4431.379762,7.65291,9765040000.0
std,0.327108,0.223614,0.470378,2742200.0,45832.97,91771.57,6016.241166,0.155192,0.447034,2.704169,0.34941,5.468766,0.206875,0.606166,0.938193,0.234237,0.36258,0.49943,737.401272,20.561953,601309500000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2438.0,0.0,0.0
25%,0.0,0.0,0.0,75.0,147.0,538.0,1.0,1.0,0.0,9.0,0.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,3854.0,0.1154,5920.0
50%,0.0,0.0,1.0,372.0,321.0,9133.0,6.0,1.0,0.0,11.0,0.0,12.0,0.0,0.0,0.0,0.0,1.0,1.0,4782.0,1.944585,115902.0
75%,0.0,0.0,1.0,1381.0,924.0,32706.0,36.0,1.0,1.0,13.0,0.0,14.0,0.0,0.0,0.0,0.0,1.0,1.0,4924.0,7.007623,1152480.0
max,1.0,1.0,1.0,133175700.0,4040264.0,2778611.0,533679.0,1.0,1.0,15.0,1.0,50.0,1.0,9.0,19.0,1.0,1.0,1.0,5931.0,819.407549,75084440000000.0


In [9]:
# Write the prepared frame to disk; index=False keeps the row index out of the file.
new_df_metatdata.to_csv(path_or_buf="pre_metadata_dataset.csv", index=False)