## Setup

### Imports

In [1]:
import pandas as pd
from datetime import datetime

### Load Dataset

In [2]:
# Read the CSV in 100,000-row pieces and stitch them into a single frame.
# low_memory=False is passed with the intent of avoiding dtype inference problems.
# NOTE(review): every column still loads as `object` — confirm the option has the
# intended effect when combined with chunksize.
chunk = pd.read_csv(
    '../data/bitcoin_tweets_original.csv',
    chunksize=100000,
    lineterminator='\n',
    low_memory=False,
)
df = pd.concat(chunk)

## Dataset info

In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(4689354, 13)

In [4]:
# Column names, dtypes and memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689354 entries, 0 to 4689353
Data columns (total 13 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_name         object
 1   user_location     object
 2   user_description  object
 3   user_created      object
 4   user_followers    object
 5   user_friends      object
 6   user_favourites   object
 7   user_verified     object
 8   date              object
 9   text              object
 10  hashtags          object
 11  source            object
 12  is_retweet        object
dtypes: object(13)
memory usage: 465.1+ MB


In [5]:
# Per-column summary; all columns are object dtype, so this reports count / unique / top / freq
df.describe()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
count,4689246,2352595,4169415,4689354,4689354.0,4689354.0,4689354.0,4689354,4689354,4689354,4671838,4685271,4688536
unique,653023,100600,683982,620891,92485.0,34319.0,163739.0,70,3668409,4570199,1013167,3108,1
top,Live Price Crypto,United States,UP or DOWN...\n.\n.\n.\n.\nPrice matters NOT.,2022-03-10 14:06:46,0.0,0.0,0.0,False,2022-05-31 06:02:30,💸 Earn free #BTC and multiply crypto up to 15%...,['Bitcoin'],Twitter for Android,False
freq,41701,63086,29410,43465,60579.0,115000.0,181174.0,3066892,373,1351,608574,1281003,4688536


In [13]:
# Preview the first five rows
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Cleaning dataset

In [6]:
# Number of missing values in each column
df.isnull().sum() 

user_name               108
user_location       2336759
user_description     519939
user_created              0
user_followers            0
user_friends              0
user_favourites           0
user_verified             0
date                      0
text                      0
hashtags              17516
source                 4083
is_retweet              818
dtype: int64

#### There are no null values in the `date` and `text` columns

In [7]:
# Show every fully duplicated row; keep=False flags all members of a duplicate group, not just the repeats
df[df.duplicated(keep=False)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet


#### No duplicate records

### Validate date

In [8]:
def validate_date(date_text, date_format='%Y-%m-%d %H:%M:%S'):
    """Find the entries of an iterable that are not valid timestamps.

    Parameters
    ----------
    date_text : iterable
        Values to check, e.g. a pandas Series of date strings.
    date_format : str, optional
        ``datetime.strptime`` format each value must match.
        Defaults to ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    list of int
        Zero-based *positions* (iteration order, not index labels) of the
        values that could not be parsed with ``date_format``.
    """
    errors = []
    for i, v in enumerate(date_text):
        try:
            datetime.strptime(v, date_format)
        except (ValueError, TypeError):
            # ValueError: a string that does not match the format.
            # TypeError: a non-string value such as None or NaN; count it as
            # invalid instead of letting it abort the whole scan.
            errors.append(i)
    return errors

In [15]:
# Collect the positions of rows whose `date` value fails validation, then report how many there are
errors = validate_date(df['date'])
print(f'There are {len(errors)} invalid dates in the date column')

There are 66 invalid dates in the date column


In [10]:
# remove invalid dates
# `errors` holds row *positions* (produced by enumerate), so build the mask over
# positions 0..len(df)-1 instead of matching against index labels; this stays
# correct even if `df` does not carry a default RangeIndex.
valid_rows = ~pd.RangeIndex(len(df)).isin(errors)
# Chain reset_index instead of mutating in place so the cell can be re-run safely.
df_date_clean = df.loc[valid_rows].reset_index(drop=True)
df_date_clean.shape

(4689288, 13)

In [18]:
# The `date` column holds strings, and comparing strings is lexicographic.
# strptime-validated values are not guaranteed to be zero-padded, so parse them
# to real timestamps and compare chronologically.
parsed_dates = pd.to_datetime(df_date_clean['date'], format='%Y-%m-%d %H:%M:%S')

# Get the minimum date (original string of the earliest timestamp)
min_date = df_date_clean['date'].iloc[parsed_dates.argmin()]

# Get the maximum date (original string of the latest timestamp)
max_date = df_date_clean['date'].iloc[parsed_dates.argmax()]

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-02-05 10:52:04
Maximum Date: 2023-01-09 23:59:54
