In [35]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # to print multiple outputs from the same cell
import math
import utils
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from operator import index
from collections import defaultdict
from scipy.stats import pearsonr
from datetime import datetime
#from lingua import Language, LanguageDetectorBuilder

In [36]:
tweets_df = pd.read_csv("dataset/tweets.csv")
users_df = pd.read_csv("dataset/users_dataset_cleaned.csv")

## Tweets Data Understanding and Preparation



In tweets.csv each row contains information about a single tweet. There are 10 columns and In this case the variables are:

1. ID: a unique identifier for the tweet

2. User Id: a unique identifier for the user who wrote the tweet

3. Retweet count: number of retweets for the tweet in analysis

4. Reply count: number of reply for the tweet in analysis

5. Favorite count: number of favorites (likes) received by the tweet

6. Num hashtags: number of hashtags used in the tweet

7. Num urls: number of urls in the tweet

8. Num mentions: number of mentions in the tweet

9. Created at: when the tweet was created

10. Text: the text of the tweet


### Attribute type and quality

In [37]:
tweets_df.info(verbose=True, show_counts=True, memory_usage= "deep") 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13664696 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype 
---  ------          --------------     ----- 
 0   id              13664694 non-null  object
 1   user_id         13447413 non-null  object
 2   retweet_count   13227562 non-null  object
 3   reply_count     13016818 non-null  object
 4   favorite_count  13017154 non-null  object
 5   num_hashtags    12607172 non-null  object
 6   num_urls        13016073 non-null  object
 7   num_mentions    12810531 non-null  object
 8   created_at      13664696 non-null  object
 9   text            13126975 non-null  object
dtypes: object(10)
memory usage: 8.9 GB


## 1. Tweet_id & User_id Columns

keeping only the tweets with user_id in user dataset. As these are the ones we would like to study, and have the data to verify the validity of.

In [38]:
tweets_df.rename(columns= {"id" : "tweet_id"}, inplace=True)

In [39]:
before_dropping_rows_number = len(tweets_df.index)

numeric_ids = pd.to_numeric(tweets_df["user_id"], errors="coerce")
ids_are_not_in_users_df = numeric_ids[numeric_ids.isin(users_df["user_id"]) == False]
tweets_df.drop(ids_are_not_in_users_df.index, inplace=True)

tweets_df["user_id"] = pd.to_numeric(tweets_df["user_id"], errors="coerce")

after_dropping_rows_number = len(tweets_df.index)

print(f"Percentage of tweets whose author id isn't inside the users dataframe: {(100*(before_dropping_rows_number-after_dropping_rows_number))/(before_dropping_rows_number)}")

Percentage of tweets whose author id isn't inside the users dataframe: 3.56021092602426


clean id field by casting to int

In [40]:

tweets_df["tweet_id"] = pd.to_numeric(tweets_df["tweet_id"], errors="coerce") # cast field to int and set invalid values to NaN

removing tweets which are duplicates on every attribute

In [41]:
original_number_rows = len(tweets_df.index)

all_columns_duplicated_df = tweets_df[tweets_df.duplicated(subset=None, keep="first")]
all_columns_duplicated_number = len(all_columns_duplicated_df.index)

print(f"Percentage of tweets duplicated along all the columns that we are deleting(after keeping the first instance): {(100*(all_columns_duplicated_number))/original_number_rows}")

tweets_df.drop(labels=all_columns_duplicated_df.index, inplace=True)

Percentage of tweets duplicated along all the columns that we are deleting(after keeping the first instance): 14.75545529572922


In [42]:
before = tweets_df.size
tweets_df = tweets_df[tweets_df['tweet_id'].notna()]
after= tweets_df.size
print(f"Percentage of tweets with nan id dropped: {(100*(before - after))/before}")


Percentage of tweets with nan id dropped: 3.333923818510375


In [43]:
before = tweets_df.size
tweets_df = tweets_df.drop_duplicates(subset="tweet_id")
after = tweets_df.size
print(f"Percentage of tweets with same id dropped: {(100*(before - after))/before}")

Percentage of tweets with same id dropped: 0.005046422947153362


We have decided to drop all tweets with NaN values as their ID. As it is difficult for us to ensure their validity. We have also decided to drop all tweets with duplicated IDs. Keeping only its first instance. 

As we can see from the data above. Pandas treats tweets with fields containing NaN values as different from each other. Atleast when it comes to ID. 

Lets see if we can find the tweets with real ID values.

In [44]:
before = tweets_df.size
tweets_df = tweets_df[tweets_df['user_id'].notna()]
after= tweets_df.size
print(f"Percentage of tweets with nan user_id dropped: {(100*(before - after))/before}")

Percentage of tweets with nan user_id dropped: 0.0


## 3. Numerical columns

- Retweet_count
- Reply_count
- favorite_count
- Num_hashtags
- Num_urls
- num_mentions

In [45]:
columns = ["retweet_count", "reply_count", "favorite_count", "num_hashtags", "num_urls", "num_mentions"]
tweets_df[columns].describe()

Unnamed: 0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions
count,10589137,10480176,10479682,10259406,10479098,10369958
unique,181213,117598,117707,77700,116554,97775
top,0,0,0,0,0,0
freq,7540373,9803698,7866508,8704828,8223780,6116213


The following thresholds are based on the most retweet and "liked" tweets on the platform

In [46]:
thresholds = {
   "retweet_count": 3738380,
   "favorite_count": 7114892
}

print("Number of invalid values for the following columns:")
for column_name in columns:
   # casting all the columns to numeric (and setting invalid value to null)
   tweets_df[column_name] = pd.to_numeric(tweets_df[column_name], errors="coerce")

   print(f"{column_name}")
   # evaluating the presence of negative values
   negative_series = tweets_df[tweets_df[column_name] < 0][column_name]
   print(f"\tnegative: {negative_series.size}")
   tweets_df[column_name].loc[negative_series.index] = np.NaN # setting negative values to NaN
   
   # evaluating the presence of inf values
   inf_series = utils.get_inf_elements(tweets_df[column_name])
   print(f"\tinf: {inf_series.size}")
   tweets_df[column_name].loc[inf_series.index] = np.NaN # setting inf values to NaN

   # evaluating the presence of decimal values
   is_float = lambda n: (not pd.isna(n)) and (not math.isinf(n)) and (not n.is_integer())
   decimal_values_series =  tweets_df[column_name].apply(is_float)
   print(f"\tdecimals: {tweets_df[column_name][decimal_values_series].size}")
   
   # finding values above given thresholds (if specified)
   if column_name in thresholds:
       threshold = thresholds[column_name]
       above_threshold_series = tweets_df[tweets_df[column_name] > threshold ][column_name]
       print(f"\tvalues above threshold ({threshold}): {above_threshold_series[above_threshold_series > threshold].size}") 
       tweets_df[column_name].loc[above_threshold_series.index] = np.NaN # setting values above threshold to nan



Number of invalid values for the following columns:
retweet_count
	negative: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df[column_name].loc[negative_series.index] = np.NaN # setting negative values to NaN


	inf: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df[column_name].loc[inf_series.index] = np.NaN # setting inf values to NaN


	decimals: 0
	values above threshold (3738380): 36


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df[column_name].loc[above_threshold_series.index] = np.NaN # setting values above threshold to nan


reply_count
	negative: 0
	inf: 1
	decimals: 0
favorite_count
	negative: 1
	inf: 0
	decimals: 0
	values above threshold (7114892): 34
num_hashtags
	negative: 0
	inf: 0
	decimals: 0
num_urls
	negative: 0
	inf: 0
	decimals: 0
num_mentions
	negative: 0
	inf: 0
	decimals: 0


The first observation that can be made is that \*_count and num_\* fields should contain only positive integers. Something that they do.
We find some inf values in the following columns:
- retweet_count
- reply_count

We also find values that are above our treshold range: 
- 36 tweets above the retweet_count treshold
- 34 tweets above the favorite_count treshold


All these values and the infinate values are set to NaN.


In [47]:

# tweets_df[columns] = tweets_df[columns].apply(pd.to_numeric, errors="coerce")
# temp = tweets_df._get_numeric_data()
# temp[temp<0].sum
# temp[temp < 0] = np.nan
# temp.head()
 

## 9. Created_at Column

created_at should be datetime

Checks if all the tweets were created after the first tweet published on twitter (so we don't have something strange like a tweet created in 01-01-1990)

In [48]:
tweets_df["created_at"] = pd.to_datetime(tweets_df["created_at"], errors="coerce")

# finding tweets created before twitter first tweet
before_time_tweets_df = tweets_df[tweets_df["created_at"] < datetime(2006,3,21,12,50,0)]
before_time_tweets_df.info()

# finding tweets created after dataset release
before_time_tweets_df = tweets_df[tweets_df["created_at"] > datetime(2022,9,29,11,0,0)]
before_time_tweets_df.info()

# what to do with these tweets? I suggest dropping/removing them.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48797 entries, 223 to 13664472
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   tweet_id        48797 non-null  float64       
 1   user_id         48797 non-null  int64         
 2   retweet_count   48797 non-null  float64       
 3   reply_count     48797 non-null  float64       
 4   favorite_count  48797 non-null  float64       
 5   num_hashtags    48797 non-null  float64       
 6   num_urls        48797 non-null  float64       
 7   num_mentions    48797 non-null  float64       
 8   created_at      48797 non-null  datetime64[ns]
 9   text            48594 non-null  object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 4.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 48801 entries, 342 to 13664679
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------    

In [49]:
tweets_df.info()
tweets_df = tweets_df[tweets_df["created_at"] > datetime(2006,3,21,12,50,0)]

# finding tweets created after dataset release
tweets_df = tweets_df[tweets_df["created_at"] < datetime(2022,9,29,11,0,0)]
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10858629 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   tweet_id        float64       
 1   user_id         int64         
 2   retweet_count   float64       
 3   reply_count     float64       
 4   favorite_count  float64       
 5   num_hashtags    float64       
 6   num_urls        float64       
 7   num_mentions    float64       
 8   created_at      datetime64[ns]
 9   text            object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 1.1+ GB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10761031 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   tweet_id        float64       
 1   user_id         int64         
 2   retweet_count   float64       
 3   reply_count     float64       
 4   favorite_count  float64       
 5   

dropped the tweets that were out of the specified time range.

## 10. Text Column

In [50]:
tweets_df.info(verbose=True, show_counts=True, memory_usage= "deep") 
tweets_df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10761031 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype         
---  ------          --------------     -----         
 0   tweet_id        10761031 non-null  float64       
 1   user_id         10761031 non-null  int64         
 2   retweet_count   10353250 non-null  float64       
 3   reply_count     10263418 non-null  float64       
 4   favorite_count  10263950 non-null  float64       
 5   num_hashtags    10083394 non-null  float64       
 6   num_urls        10263312 non-null  float64       
 7   num_mentions    10173425 non-null  float64       
 8   created_at      10761031 non-null  datetime64[ns]
 9   text            10460639 non-null  object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 2.2 GB


Unnamed: 0,tweet_id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions
count,10761030.0,10761030.0,10353250.0,10263420.0,10263950.0,10083390.0,10263310.0,10173420.0
mean,inf,830402000.0,576.015,1.948669e+103,3.587553,6.9421070000000005e+146,9.743443e+202,5.897719e+80
std,,852111300.0,14266.71,6.242866e+106,3130.08,2.204422e+150,inf,1.8811250000000001e+84
min,346.0,678033.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.614611e+17,237178000.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.243933e+17,468066200.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.773086e+17,1200888000.0,1.0,0.0,0.0,0.0,0.0,1.0
max,inf,3164942000.0,3350111.0,2e+110,7000000.0,7e+153,1e+210,6e+87


Removing tweets with null text field, float text field or only spaces, because these are not allowed by twitter

In [51]:
tweets_df.dropna(subset=["text"], inplace=True) # drop the tweets where the text field is null
tweets_df["text"] = tweets_df["text"].astype(str) # cast the text field to string
tweets_df = tweets_df[~tweets_df.text.str.isspace()]

tweets_df.info(verbose=True, show_counts=True, memory_usage= "deep") 
tweets_df.describe()
# Problem with memory usage, we have to find a more effient way to remove tweets with only spaces
"""
to_drop = []
for index, row in tweets_df.iterrows():
    if row["text"].isspace():
        tweets_df.drop(index)
"""

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10460638 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype         
---  ------          --------------     -----         
 0   tweet_id        10460638 non-null  float64       
 1   user_id         10460638 non-null  int64         
 2   retweet_count   10133485 non-null  float64       
 3   reply_count     10133552 non-null  float64       
 4   favorite_count  10044292 non-null  float64       
 5   num_hashtags    10044031 non-null  float64       
 6   num_urls        10133536 non-null  float64       
 7   num_mentions    10133692 non-null  float64       
 8   created_at      10460638 non-null  datetime64[ns]
 9   text            10460638 non-null  object        
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 2.2 GB


Unnamed: 0,tweet_id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions
count,10460640.0,10460640.0,10133480.0,10133550.0,10044290.0,10044030.0,10133540.0,10133690.0
mean,inf,829202500.0,576.8267,1.9736420000000002e+103,3.539899,6.969313000000001e+146,4.934112000000001e+167,5.920843e+80
std,,851355900.0,14166.49,6.282741e+106,3163.019,2.208737e+150,inf,1.884809e+84
min,346.0,678033.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.680207e+17,236449600.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.257476e+17,468064200.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.777142e+17,1199715000.0,1.0,0.0,0.0,0.0,0.0,1.0
max,inf,3164942000.0,3350111.0,2e+110,7000000.0,7e+153,5e+174,6e+87


'\nto_drop = []\nfor index, row in tweets_df.iterrows():\n    if row["text"].isspace():\n        tweets_df.drop(index)\n'

### Distribution of variables and statistics
Let's study them!

Histograms for numerical fields