In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
import math
import utils
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from operator import index
from collections import defaultdict
from scipy.stats import pearsonr
from datetime import datetime

In [None]:
# Load the raw tweets and the users table (the latter from "users_dataset_cleaned.csv").
tweets_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/tweets.csv", "dataset/users_dataset_cleaned.csv")
)

## Tweets Data Understanding and Preparation



In tweets.csv each row contains information about a single tweet. There are 10 columns, and the variables are:

1. ID: a unique identifier for the tweet

2. User Id: a unique identifier for the user who wrote the tweet

3. Retweet count: number of retweets for the tweet in analysis

4. Reply count: number of replies to the tweet in analysis

5. Favorite count: number of favorites (likes) received by the tweet

6. Num hashtags: number of hashtags used in the tweet

7. Num urls: number of urls in the tweet

8. Num mentions: number of mentions in the tweet

9. Created at: when the tweet was created

10. Text: the text of the tweet


### Attribute type and quality

In [None]:
# Column dtypes, non-null counts and the deep memory footprint of the raw tweets.
tweets_df.info(memory_usage="deep", show_counts=True, verbose=True)

## 1. Tweet_id & User_id Columns

We keep only the tweets whose user_id appears in the users dataset, as these are the ones we would like to study and the only ones whose validity we have the data to verify.

In [None]:
# Rename the tweet identifier column from "id" to "tweet_id" (modifies the frame in place).
tweets_df.rename(columns=dict(id="tweet_id"), inplace=True)

In [None]:
# Keep only the tweets whose author appears in the users table.
rows_before_author_filter = len(tweets_df.index)

# Ids that are not valid numbers are coerced to NaN before being matched against the users table.
author_ids = pd.to_numeric(tweets_df["user_id"], errors="coerce")
has_known_author = author_ids.isin(users_df["user_id"])
tweets_df.drop(index=author_ids.index[~has_known_author], inplace=True)

# Store the numeric version of the ids of the tweets that were kept.
tweets_df["user_id"] = pd.to_numeric(tweets_df["user_id"], errors="coerce")

rows_after_author_filter = len(tweets_df.index)
dropped_author_share = (100*(rows_before_author_filter-rows_after_author_filter))/(rows_before_author_filter)

print(f"Percentage of tweets whose author id isn't inside the users dataframe: {dropped_author_share}")

These tweets are dropped from the tweets dataframe.

Clean the tweet_id field by casting it to a numeric type (invalid values become NaN).

In [None]:
# Parse tweet ids as numbers; values that cannot be parsed become NaN.
# NOTE(review): if any id is coerced to NaN the column becomes float64, so ids
# above 2**53 may be rounded -- confirm this is acceptable for this dataset.
parsed_tweet_ids = pd.to_numeric(tweets_df["tweet_id"], errors="coerce")
tweets_df["tweet_id"] = parsed_tweet_ids

removing tweets which are duplicates on every attribute

In [None]:
# Remove rows that repeat an earlier row on every column; the first occurrence is kept.
rows_before_dedup = len(tweets_df.index)

is_full_duplicate = tweets_df.duplicated(keep="first")
all_columns_duplicated_df = tweets_df.loc[is_full_duplicate]
full_duplicate_count = len(all_columns_duplicated_df.index)
full_duplicate_share = (100*(full_duplicate_count))/rows_before_dedup

print(f"Percentage of tweets duplicated along all the columns that we are deleting(after keeping the first instance): {full_duplicate_share}")

tweets_df.drop(index=all_columns_duplicated_df.index, inplace=True)

Dropping tweets whose id is an invalid value (i.e. NaN, duplicate, inf)

In [None]:
# Drop the tweets whose id is NaN.
# DataFrame.size counts cells (rows * columns); the number of columns does not
# change here, so the ratio below is also the share of dropped rows.
cells_before = tweets_df.size
has_tweet_id = tweets_df["tweet_id"].notna()
tweets_df = tweets_df.loc[has_tweet_id]
cells_after = tweets_df.size
dropped_share = (100*(cells_before - cells_after))/cells_before
print(f"Percentage of tweets with nan tweet_id dropped: {dropped_share}")

In [None]:
# Keep only the first tweet for every tweet_id.
# Rows are counted with len(): DataFrame.size is rows * columns, so using it
# here would overstate the number of dropped tweets by the number of columns.
rows_before = len(tweets_df)
tweets_df = tweets_df.drop_duplicates(subset="tweet_id")
rows_after = len(tweets_df)
print(f"Num of tweets with same tweet_id dropped: {(rows_before - rows_after)}")

In [None]:
# Drop the tweets whose id is infinite.
# np.isinf catches both +inf and -inf; either one would make a later cast of
# the column to int64 fail. NaN ids are not affected by this filter.
# Rows are counted with len(): DataFrame.size is rows * columns and would
# overstate the number of dropped tweets.
rows_before = len(tweets_df)
tweets_df = tweets_df[~np.isinf(tweets_df["tweet_id"])]
rows_after = len(tweets_df)
print(f"Num of tweets with inf tweet_id dropped: {(rows_before - rows_after)}")

We have decided to drop all tweets with NaN values as their ID, as it is difficult for us to ensure their validity. We have also decided to drop all tweets with duplicated IDs, keeping only the first instance of each.

As we can see from the data above, pandas treats tweets whose fields contain NaN values as different from each other, at least when it comes to the ID.

Let's now keep only the tweets with non-NaN user_id values.

In [None]:
# Drop the tweets whose author id is NaN.
# DataFrame.size counts cells (rows * columns); the number of columns does not
# change here, so the ratio below is also the share of dropped rows.
cells_before = tweets_df.size
has_user_id = tweets_df["user_id"].notna()
tweets_df = tweets_df.loc[has_user_id]
cells_after = tweets_df.size
dropped_share = (100*(cells_before - cells_after))/cells_before
print(f"Percentage of tweets with nan user_id dropped: {dropped_share}")

Convert tweet_id attribute to int64

In [None]:
# Store the tweet ids as 64-bit integers (the cells above removed NaN and inf ids).
tweet_ids_as_int = tweets_df["tweet_id"].astype("int64")
tweets_df["tweet_id"] = tweet_ids_as_int
tweets_df.info()

## 3. Numerical columns

- Retweet_count
- Reply_count
- favorite_count
- Num_hashtags
- Num_urls
- num_mentions

In [None]:
# Numeric counters analysed in this section.
columns = [
    "retweet_count",
    "reply_count",
    "favorite_count",
    "num_hashtags",
    "num_urls",
    "num_mentions",
]
tweets_df.loc[:, columns].describe()

The following thresholds are based on the most retweeted and most "liked" tweets on the Twitter platform.

In [None]:
# Largest value accepted for each numeric column; anything above is treated as noise.
# The accompanying text bases these on the most retweeted and most liked tweets;
# NOTE(review): the source of the remaining limits is not stated -- confirm.
thresholds = {
    "retweet_count": 3738380,
    "favorite_count": 7114892,
    "reply_count": 4000000,
    "num_hashtags": 93,
    "num_urls": 23,
    "num_mentions": 93,
}

print("Number of invalid values for the following columns:")
for column_name in columns:
    # Cast the column to numeric (invalid values become NaN). float64 is forced
    # so that NaN can always be written into the column below.
    tweets_df[column_name] = pd.to_numeric(tweets_df[column_name], errors="coerce").astype("float64")

    print(f"{column_name}")

    # Negative values are invalid and are replaced by NaN.
    # All writes use DataFrame.loc with the column name: assigning through
    # tweets_df[column_name].loc[...] may modify a temporary copy instead of the frame.
    negative_mask = tweets_df[column_name] < 0
    print(f"\tnegative: {int(negative_mask.sum())}")
    tweets_df.loc[negative_mask, column_name] = np.nan

    # Inf values are replaced by NaN (rows are taken from the index returned by the project helper).
    inf_series = utils.get_inf_elements(tweets_df[column_name])
    print(f"\tinf: {len(inf_series)}")
    tweets_df.loc[inf_series.index, column_name] = np.nan

    # Finite values with a fractional part are only counted; they are left unchanged.
    column_values = tweets_df[column_name]
    decimal_mask = np.isfinite(column_values) & (column_values % 1 != 0)
    print(f"\tdecimals: {int(decimal_mask.sum())}")

    # Values above the column's threshold (if one is specified) are replaced by NaN.
    if column_name in thresholds:
        threshold = thresholds[column_name]
        above_threshold_mask = tweets_df[column_name] > threshold
        print(f"\tvalues above threshold ({threshold}): {int(above_threshold_mask.sum())}")
        tweets_df.loc[above_threshold_mask, column_name] = np.nan



The first observation that can be made is that the \*_count and num_\* fields should contain only non-negative integers, which they do.
We find some inf values in the following columns:
- retweet_count
- reply_count

We also find values that are above our threshold range:
- 36 tweets above the retweet_count threshold
- 34 tweets above the favorite_count threshold


All these values and the infinite values are set to NaN.


### Parsing columns to integer

Substituting NaN values with median

In [None]:
tweets_df[columns].mean()

# float(np.iinfo(np.int64).max) rounds up to 2**63, which does not fit in int64,
# so values equal to it must be rejected as well (hence ">=").
int64_limit = float(np.iinfo(np.int64).max)

for column in columns:
    # Values that cannot be stored as int64 (probably noise) become NaN.
    # The write goes through DataFrame.loc: assigning to tweets_df[mask][column]
    # only changes a temporary copy and leaves the frame untouched.
    tweets_df.loc[tweets_df[column] >= int64_limit, column] = np.nan

    # Replace each NaN with the median of the same user's tweets, truncated to an
    # integer. transform() returns one value per row, aligned with tweets_df.
    user_median = tweets_df.groupby("user_id")[column].transform("median")
    # Users with no valid value at all fall back to the median of the whole column.
    # If the whole column is NaN the cast below still fails, which is intended.
    fill_values = np.trunc(user_median.fillna(tweets_df[column].median()))

    # Cast column to int
    tweets_df[column] = tweets_df[column].fillna(fill_values).astype(np.int64)

In [None]:
# Quantiles (with a fine-grained upper tail), mean, median and describe() of the numeric columns.
quantile_levels = [0.25, 0.5, 0.75, 0.87, 0.95, 0.99, 0.999, 0.9999, 0.99999, 0.999999]
tweets_df.loc[:, columns].quantile(quantile_levels)
tweets_df.loc[:, columns].mean()
tweets_df.loc[:, columns].median()
tweets_df.loc[:, columns].describe()

## 4. Created_at Column

created_at should be datetime

Check that all the tweets were created after the first tweet ever published on Twitter and before the dataset release (so that we don't keep something strange like a tweet created on 01-01-1990).

In [None]:
# Parse the timestamps; values that cannot be parsed become NaT.
tweets_df["created_at"] = pd.to_datetime(tweets_df["created_at"], errors="coerce")

first_tweet_time = datetime(2006,3,21,12,50,0)      # twitter first tweet
dataset_release_time = datetime(2022,9,29,11,0,0)   # dataset release

# Tweets dated before the first tweet or after the dataset release.
before_time_tweets_df = tweets_df.loc[tweets_df["created_at"] < first_tweet_time]
after_time_tweets_df = tweets_df.loc[tweets_df["created_at"] > dataset_release_time]

# Drop both groups of out-of-range tweets in one pass.
out_of_range_index = before_time_tweets_df.index.union(after_time_tweets_df.index)
tweets_df = tweets_df.drop(index=out_of_range_index)

## 5. Text Column

In [None]:
# State of the frame before the text column is cleaned.
tweets_df.info(memory_usage="deep", show_counts=True, verbose=True)
tweets_df.describe()

Removing tweets with null text field, float text field or only spaces, because these are not allowed by twitter

In [None]:
# Drop the tweets without text, store the text as str, then drop the tweets
# whose text consists only of whitespace.
tweets_df.dropna(subset=["text"], inplace=True)
tweets_df["text"] = tweets_df["text"].astype(str)
tweets_df = tweets_df[~tweets_df["text"].str.isspace()]

# TODO: memory usage is a problem here; find a more efficient way to remove
#       the tweets that contain only spaces.
# TODO: also drop the tweets containing only the U+3164 HANGUL FILLER or other
#       invisible characters. Source: https://invisible-characters.com/
# A row-by-row iterrows() loop was tried and abandoned. It used to sit here
# inside a triple-quoted string, which the notebook echoed as cell output.
tweets_df.info(verbose=True, show_counts=True, memory_usage="deep")
tweets_df.describe()

### Distribution of variables and statistics
Let's study them!

Histograms for numerical fields

In [None]:
columns = [
    "retweet_count",
    "reply_count",
    "favorite_count",
    "num_hashtags",
    "num_urls",
    "num_mentions",
]

# One histogram per numeric column, drawn with log=True.
tweets_df[columns].hist(
    figsize=(10,8),
    log=True,
    #bins=utils.get_sturges_bins(tweets_df.size)
)

Boxplots for the numerical fields.

In [None]:
# Box plots of all numeric columns on one logarithmic y axis.
# To get one figure per column instead, loop over `columns` and call
# tweets_df.plot(kind="box", column=column, logy=True). That alternative used to
# sit here inside a triple-quoted string, which the notebook printed as cell output.
tweets_df[columns].plot(
    kind="box",
    figsize=(10,8),
    logy=True,
)

In [None]:
# #tweets_df[columns].quantile([0.05,0.25,0.5,0.75,0.95, 0.97,0.99])
# #tweets_df[tweets_df[columns] != np.nan][columns].mean()

# tweets_df[tweets_df["retweet_count"] == "NaN"]
# tweets_df[tweets_df["reply_count"] == np.nan]
# tweets_df[tweets_df["favorite_count"] == np.nan]
# tweets_df[tweets_df["num_hashtags"] == np.nan]#["retweet_count"]
# tweets_df[tweets_df["num_urls"] == np.nan]
# tweets_df[tweets_df["num_mentions"] == np.nan]
# #tweets_df[columns].loc[:, tweets_df[columns].isna().any()]

# tweets_df.isna()

# Mean and median of every numeric column.
tweets_df.loc[:, columns].mean()
tweets_df.loc[:, columns].median()

In [None]:
# Display the rows with a negative reply_count.
tweets_df.loc[tweets_df["reply_count"].lt(0)]

In [None]:
# Preview the first five rows of the frame.
tweets_df.head(n=5)

By looking at the boxplots we can see that the data has a very high variance (an observation also supported by the previous histograms). Hence, we compute the upper and lower whisker bounds analytically for each column.

In [None]:
# Outlier detection: for every numeric column, compute the whiskers with the
# project helper and collect the values above the second bound it returns.
outlier_thresholds = {}
for column in columns:
    outlier_thresholds[column] = utils.compute_whiskers(tweets_df[column])

outlier_dataframes = {}
for column in columns:
    upper_bound = outlier_thresholds[column][1]
    outlier_dataframes[column] = tweets_df.loc[tweets_df[column] > upper_bound, column]

for column, outlier_values in outlier_dataframes.items():
    print(f"{column}: {len(outlier_values)} outliers")

# TODO: also report the percentage of the dataframe these outliers make up, to
# get a feeling of how many outliers the cleaned dataset contains.


These outliers may be removed from the dataframe by uncommenting and running the code block below.

In [None]:
# # removal of outliers
# outlier_indexes_list = [outlier_dataframe.index for outlier_dataframe in outlier_dataframes.values()]
# for indexes in outlier_indexes_list:
#     tweets_df.drop(indexes, inplace=True, errors="ignore")

## Tweet Data Quality Summary

In [None]:
# Column dtypes, non-null counts and deep memory footprint of the cleaned tweets.
tweets_df.info(memory_usage="deep", show_counts=True, verbose=True)

In [None]:
# This codeblock should compute the number of tweets belonging to human and bot accounts
# It should also compute the percentage of this data, to come up with a number for balance

# We also have to decide if we should remove the remaining tweets that include NaN values.
# These are around 400 thousand of our 10 million four hundred thousand remaining tweets.
# So basically less than 4%

# Number_of_users_in_dataset_that_are_bots = users_df.groupby("bot").get_group(1)
# Number_of_users_in_dataset_that_are_humans = users_df.groupby("bot").get_group(0)
# total = len(Number_of_users_in_dataset_that_are_bots) + len(Number_of_users_in_dataset_that_are_humans)
# print(f"Out of the {total} users in our dataset. {len(Number_of_users_in_dataset_that_are_humans)} are humans, and {len(Number_of_users_in_dataset_that_are_bots)} are bots.")
# print(f"The dataset consists of {round(100*(len(Number_of_users_in_dataset_that_are_humans)/total),3)}% humans and {round(100*(len(Number_of_users_in_dataset_that_are_bots)/total),3)}% bots respectively.")
# print(f"399 of our users are missing their statuses_count values. These humans consist of {round(100*(len(list_of_humans)/total),3)}% of our dataset.")
# print(f"These users will be removed.")

This block should explain the state of the cleaned and generalized dataset, also stating how balanced it is.

From the users cleaning notebook
.......................................
After cleaning we are left with a "fairly" balanced and generalized dataset, that is ready for further use. The dataset contains approx. 45% human and 55% bot users. 
.......................................

In [None]:
# Write the cleaned tweets to disk without the index column.
cleaned_tweets_path = "./dataset/tweets_dataset_cleaned.csv"
tweets_df.to_csv(cleaned_tweets_path, index=False)