In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # to print multiple outputs from the same cell

import math
import utils
import shutil as shl
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict
from scipy.stats import pearsonr

from datetime import datetime

In [None]:
#shl.unpack_archive("dataset/users.zip", "dataset") # unpacks the users.zip into the datasets folder (The users zip is small enough to be commited to github if we would like)
#shl.unpack_archive("dataset/tweets.zip", "dataset") # unpacks the tweets.zip into the datasets folder

In [None]:
# Read only the "id" and "user_id" columns of the tweets CSV.
# The commented-out calls are alternative loads kept for reference.
#tweets_df = pd.read_csv("dataset/tweets_small.csv")
#tweets_df = pd.read_csv("dataset/tweets.csv")
tweets_df = pd.read_csv(
    "dataset/tweets.csv",
    usecols=["id", "user_id"],
)
tweets_df.head()

In [None]:
# NOTE(review): `operator.index` does not appear to be called in the visible
# cells (looks like an editor auto-import) — confirm before removing.
from operator import index

# Load the users table from CSV with pandas' default parsing.
users_df = pd.read_csv("dataset/users.csv")
#users_df.reset_index()
#users_df.head()

# Data Understanding

As you can see by comparing the table above with the function results below, some values are already not what we expect. For example, all the attributes below are reported as objects, but we expect many of them to be numbers. This suggests that we will have to prepare and clean the data thoroughly before it can be used.

## Users Data

In [None]:
# Preview the first rows of the users table.
users_df.head()

### Assessing and fixing data quality

In [None]:
# Summary statistics of the users table (pandas' default column selection).
users_df.describe()

In [None]:
# Per-column overview of the users table: dtype, non-null count and
# "deep" memory usage.
users_df.info(
    verbose=True,
    show_counts=True,
    memory_usage="deep",
)

We observe that the created_at column is recognized by pandas as an object, and not as a datetime as we would expect for this attribute. Next, we check the columns for null values.

In [None]:
# For every column of the users table: does it contain at least one missing value?
users_df.isna().any()

In [None]:
# TODO: how to handle null values in statuses count? Does it make sense to use mean/median in a power law distribution?

Clean the created_at field by converting the strings to datetime, then check that every created_at value in the users table is later than the first tweet published on Twitter (so we don't have something strange like a creation date of 01-01-1990).

In [None]:
# Parse the created_at strings into pandas datetimes; the column is overwritten.
users_df["created_at"] = pd.to_datetime(users_df["created_at"])

# Sanity check: every created_at value in the users table must be later than
# the first tweet published on Twitter (so we don't have something strange
# like a creation date of 01-01-1990).
twitter_first_tweet_datetime = datetime(2006,3,21,12,50,0)
#string_to_datetime = lambda string: datetime.strptime(string, expected_format)
# Scalar predicate kept for reuse on single values.
published_after_twitter_first_tweet = lambda x: x > twitter_first_tweet_datetime
# Vectorized comparison instead of calling the lambda once per row; missing
# dates (NaT) compare as False, so they make the check fail.
bool((users_df["created_at"] > twitter_first_tweet_datetime).all())


Let's compare the number of unique ids with the total number of ids...

In [None]:
# Compare the number of distinct ids with the number of rows; if the two
# differ, some id occurs more than once.
id_column = users_df["id"]
num_ids = len(id_column)
num_unique_ids = len(id_column.unique())
print(f"number of unique IDs: {num_unique_ids} and number of IDs: {num_ids}")

We could do a similar check for the names, but several people sharing the same name is not an error. For names it is more interesting to know how many values are missing in total.

In [None]:
def my_isnan(a):
    """Tell whether ``a`` is NaN by testing that it is not equal to itself.

    Returns the result of ``a != a``: True for a float NaN, False for
    ordinary values such as strings, numbers or None.
    """
    differs_from_itself = a != a
    return differs_from_itself

def my_isempty(a):
    """Return True when ``a`` is the empty string, False otherwise.

    Always returns a real bool: non-empty strings and non-string values
    (NaN, None, numbers) give False rather than an implicit None.
    """
    return isinstance(a, str) and a == ""


# Audit of the "name" column: every value ends up in exactly one of
# `not_empty_or_missing_names` / `empty_or_missing_names`, so the two counts
# printed below add up to the total number of names.
number_of_total_names = len(users_df["name"])
not_empty_or_missing_names = []
empty_or_missing_names = []
names_with_only_spaces = []

# iterate over all names looking for errors
for value in users_df["name"]:
    # str() keeps the strip() test safe for non-string values: NaN becomes
    # the text "nan", which is not blank.
    is_blank = str(value).strip() == ""
    if is_blank:
        # "" and whitespace-only names are additionally tracked on their own
        names_with_only_spaces.append(value)
    if my_isnan(value) or my_isempty(value) or is_blank:
        # NaN, "" and whitespace-only names all count as missing
        empty_or_missing_names.append(value)
    else:
        not_empty_or_missing_names.append(value)

print(f" Number of total names = {number_of_total_names} vs total name values that are not NA or empty = {len(not_empty_or_missing_names)}")
print(f" Number of total names = {number_of_total_names} vs total name values that are NA or empty = {len(empty_or_missing_names)}")


The "lang" field is composed of [IETF language codes](https://en.wikipedia.org/wiki/IETF_language_tag). By selecting only the unique values, it is possible to see that there are some erroneous values (e.g. "Select Language...", "xx-lc"); there are also some values that are valid but not correctly formatted (e.g. "zh-cn" instead of "zh-CN").

Since the wrong values make up just 0.02% of the rows, those rows are simply dropped, while the other values are mapped to the correct ones.

In [None]:
# Values of "lang" that are not language codes; rows carrying them are dropped.
wrong_fields = ["Select Language...", "xx-lc"] # only 3 elements
# Language codes that only need rewriting into their corrected spelling.
to_map_fields = {
    "en-gb": "en-GB",
    "zh-tw": "zh-TW",
    "zh-cn": "zh-CN",
    "fil": "fil-PH"
}

# dropping wrong fields
# DataFrame.drop(index=...) expects index *labels*, so the rows are selected
# through the frame's own index rather than through positional counters; this
# stays correct when the index is no longer 0..n-1 (e.g. when the cell is
# re-run after rows have already been dropped).
wrong_indexes = users_df.index[users_df["lang"].isin(wrong_fields)].tolist()
users_df.drop(index=wrong_indexes, inplace=True)

# mapping incorrect values to fixed ones
# A single vectorized replace; values that are not keys of `to_map_fields`
# (including missing values) are left untouched.
users_df["lang"] = users_df["lang"].replace(to_map_fields)

bot: should be yes or no; maybe there are wrong values?

In [None]:
# Distinct values found in the "bot" column.
users_df["bot"].unique()

"created at" should be a time; check whether there are type errors or whether any time value is strange (e.g. a tweet made before the Twitter release, which was March 21, 2006).

In [None]:
# checks if all the datetime strings are in the valid format (YY-mm-dd H:M:S)
#expected_format = "%Y-%m-%d %H:%M:%S"
#is_datetime_format_correct = lambda x: utils.is_datetime_format_correct(x, expected_format)
#all(map(is_datetime_format_correct, users_df["created_at"]))



### Distribution of variables and statistics
Let's study them!

In [None]:
# Histogram of statuses_count over all users, with log-scaled counts. The
# number of bins is obtained from utils.get_sturges_bins using the row count.
users_df.hist(
    column=["statuses_count"],
    log=True,
    bins=utils.get_sturges_bins(len(users_df["statuses_count"]))
)

# One histogram per value of "bot". Each group gets a bin count computed from
# that group's own size instead of the size of the whole table.
bot_groups = list(users_df.groupby("bot"))
_, bot_axes = plt.subplots(1, len(bot_groups), squeeze=False)
for bot_ax, (bot_value, bot_group_df) in zip(bot_axes.ravel(), bot_groups):
    bot_group_df.hist(
        column=["statuses_count"],
        log=True,
        bins=utils.get_sturges_bins(len(bot_group_df)),
        ax=bot_ax,
    )
    # Label each panel with its group value.
    bot_ax.set_title(str(bot_value))

In [None]:
# For every language, count the rows with bot == 1 and the rows with bot == 0,
# then plot both counts side by side on a log-scaled y axis.
lang_column = users_df["lang"]
langs = pd.unique(lang_column)

# value_counts on the masked column counts all languages in a single pass per
# class and avoids building query strings out of the language values (which
# fails for a value containing a quote). reindex keeps the order of `langs`
# and fills languages absent from a class with 0.
bot_freqs = lang_column[users_df["bot"] == 1].value_counts().reindex(langs, fill_value=0).tolist()
user_freqs = lang_column[users_df["bot"] == 0].value_counts().reindex(langs, fill_value=0).tolist()

langs_df = pd.DataFrame({"lang": langs, "bot_freqs": bot_freqs, "user_freqs": user_freqs})
langs_df.plot.bar(x="lang", logy=True)

## Tweets data

### Assessing data quality

In [None]:
# Per-column overview of the tweets table: dtype, non-null count and
# "deep" memory usage.
tweets_df.info(
    verbose=True,
    show_counts=True,
    memory_usage="deep",
)

In [None]:
# For every column of the tweets table: does it contain at least one missing value?
tweets_df.isna().any()

Keeping only the tweets whose user_id appears in the users dataset.

In [None]:
# ids present in the users table, as a NumPy array
user_ids = users_df["id"].to_numpy()
user_ids.dtype
# user_id values of the tweets converted to numbers; entries that cannot be
# parsed become NaN because of errors="coerce"
parsed_user_ids = pd.to_numeric(tweets_df["user_id"], errors="coerce")
parsed_user_ids.dtype
# NOTE(review): the filtered frame is only displayed here, it is not assigned
# back, so tweets_df itself is left unchanged by this cell.
tweets_df.loc[parsed_user_ids.isin(user_ids)]

Clean the id field by first removing NaN values (just 2), then trying to cast to int and removing duplicates.

In [None]:
#tweets_df.dropna(subset=["id"], inplace=True)

# removing not numeric strings
#pd.to_numeric(tweets_df["id"])

#tweets_df["id"].isin(users)

From the method above we observe that all our attributes except for "created_at" have one or more elements with null values.

In [None]:
# Disabled check kept for reference: fraction of "id" values that are not
# numeric strings. Written as comments rather than a bare string literal so
# that the cell does not echo the text as its output.
#wrong_ids = []
#for (i,k) in enumerate(df["id"]):
#    if not isinstance(k, str) or not k.isnumeric():
#        wrong_ids.append(i)
#print(len(wrong_ids)/len(df["id"]))

### Distribution of variables and statistics
Let's study them!

In [None]:
# TODO: should we use Sturges' rule for the number of bins?

# gives error: ValueError: hist method requires numerical or datetime columns, nothing to plot.
#tweets_df.hist(column=["reply_count","retweet_count", "favorite_count", "num_hashtags", "num_urls", "num_mentions"])

In [None]:
# Summary statistics of the tweets table (pandas' default column selection).
tweets_df.describe()

### Variables transformations (?)

### Pairwise correlations