## Checking the error Data for 2021 Jan and May

In [72]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime
import seaborn as sns

In [2]:
!pip install torch



In [3]:
!pip install protobuf==3.20.0



## Step 1 - Importing Bitcoin Tweet Data to begin analysing the model

In [40]:
data = pd.read_csv('../data/processed_data/processed_experts_tweets.csv')

## Step 2 - Cleaning the data

In [9]:
def preprocess(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [10]:
#use the preprocess to clean the data
data['process_text'] = data['text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_bert'] = data.text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['neutral_bert'] = data.text


## Step 3 - analyse the language and sentiments by pretrained model

In [12]:
!pyenv local crypto

In [13]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
tokenizer_bert.model_max_length = 512 #solve the error: RuntimeError: The expanded size of the tensor (562) must match the existing size (514) at non-singleton dimension
config_bert = AutoConfig.from_pretrained(MODEL_bert)


# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
model_bert.config.max_position_embeddings = 512

In [14]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert

In [41]:
data['text'] = data.text.apply(lambda x: x[1:-1].split())

In [42]:
data

Unnamed: 0.1,Unnamed: 0,datetime,username,text,process_text
0,0,2023-04-03 12:15:20+00:00,saylor,"[0.37983057, 0.24003501, 0.38013434]",#Bitcoin is Digital Energy.
1,1,2023-04-02 12:59:03+00:00,saylor,"[0.92297596, 0.03597098, 0.04105297]",The Robots will only take #Bitcoin.
2,2,2023-04-01 21:59:22+00:00,saylor,"[0.03105829, 0.37485844, 0.5940833]",Gen Z Analyst elegantly articulates why #Bitco...
3,3,2023-04-01 16:45:59+00:00,saylor,"[0.10731032, 0.44189024, 0.45079947]",#Bitcoin is the Superior Asset.
4,4,2023-04-01 12:40:19+00:00,saylor,"[0.9573283, 0.01798436, 0.02468747]",Happy #Bitcoin Fools Day.
...,...,...,...,...,...
27450,27450,2021-04-30 03:56:27+00:00,RektProof,"[0.46553522, 0.43029073, 0.10417402]",$BTC / $USD 2 Different ranges here LTF/HTF ...
27451,27451,2021-04-26 15:03:52+00:00,RektProof,"[0.04189492, 0.8594566, 0.09864834]",Pretty good start to the year overall. 30R in...
27452,27452,2021-04-24 06:23:55+00:00,RektProof,"[0.9676184, 0.02592494, 0.00645662]",Power of 3 Another Round? Old low broken and...
27453,27453,2021-04-19 15:48:27+00:00,RektProof,"[0.11949915, 0.7909914, 0.08950952]",$BTC / $USD Looking for a move into 60-62k th...


In [21]:
starting_num = 6650
ending_num = 6652

for i in range(25):
    try:
        data['process_text'][starting_num : ending_num].apply(scores_bert)
        print(f'{starting_num} to {ending_num} is going to work!')
        working = True
    except:
        print(f'{starting_num} to {ending_num} has not processed correctly')
    if working == True:
        result_csv = pd.DataFrame(data['process_text'][starting_num : ending_num])
        result_csv['text'] = result_csv['process_text'].apply(scores_bert)
        print(f'{starting_num} to {ending_num} has worked!')
        file_name = f"{starting_num}_{ending_num}_twitter_comments.csv"
        result_csv.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")
        print(f'{starting_num} to {ending_num} saved!')
    starting_num += 2
    ending_num += 2

6650 to 6652 is going to work!
6650 to 6652 has worked!
6650 to 6652 saved!
6652 to 6654 is going to work!
6652 to 6654 has worked!
6652 to 6654 saved!
6654 to 6656 is going to work!
6654 to 6656 has worked!
6654 to 6656 saved!
6656 to 6658 is going to work!
6656 to 6658 has worked!
6656 to 6658 saved!
6658 to 6660 is going to work!
6658 to 6660 has worked!
6658 to 6660 saved!
6660 to 6662 is going to work!
6660 to 6662 has worked!
6660 to 6662 saved!
6662 to 6664 is going to work!
6662 to 6664 has worked!
6662 to 6664 saved!
6664 to 6666 is going to work!
6664 to 6666 has worked!
6664 to 6666 saved!
6666 to 6668 is going to work!
6666 to 6668 has worked!
6666 to 6668 saved!
6668 to 6670 is going to work!
6668 to 6670 has worked!
6668 to 6670 saved!
6670 to 6672 is going to work!
6670 to 6672 has worked!
6670 to 6672 saved!
6672 to 6674 is going to work!
6672 to 6674 has worked!
6672 to 6674 saved!
6674 to 6676 is going to work!
6674 to 6676 has worked!
6674 to 6676 saved!
6676 to 6678

RuntimeError: The expanded size of the tensor (672) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 672].  Tensor sizes: [1, 514]

In [43]:
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text
for i in range(len(data)):    
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_bert'][i] = data['text'][i][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['neutral_bert'][i] = data['text'][i][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_bert'][i] = data['text'][i][2]


In [44]:
data = data[['datetime','username', 'process_text', 'negative_bert', 'neutral_bert','positive_bert']]

In [49]:
data['datetime'] = pd.to_datetime(data['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['datetime'] = pd.to_datetime(data['datetime'])


In [47]:
data.to_csv('../data/processed_data/processed_experts_tweets_with_sentiment.csv')

## Step 4: Count the total number of labels/scores (positive, negative vs neutral) by date

In [54]:
data['datetime'][0].date()

datetime.date(2023, 4, 3)

In [57]:
data['date'] = data['datetime'].apply(lambda x: x.date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = data['datetime'].apply(lambda x: x.date())


In [64]:
data['negative_bert'] = data['negative_bert'].astype(float)
data['neutral_bert']= data['neutral_bert'].astype(float)
data['positive_bert']= data['positive_bert'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['negative_bert'] = data['negative_bert'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['neutral_bert']= data['neutral_bert'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positive_bert']= data['positive_bert'].astype(float)


In [66]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")

In [70]:
grouped_data['quantity'] = grouped_data['neutral_bert'] + grouped_data['positive_bert'] + grouped_data['negative_bert']

In [74]:
grouped_data.to_csv('../data/processed_data/processed_experts_tweets_grouped.csv')
data.to_csv('../data/processed_data/processed_experts_tweets_with_sentiment.csv')

In [81]:
usernames = data['username'].value_counts()

In [87]:
usernames= usernames.reset_index()

In [107]:
usernames
mask = usernames['username']>250
club_250 = usernames[mask]

In [101]:
club_250 = club_250['index'].unique()

In [108]:
club_250

Unnamed: 0,index,username
0,udiWertheimer,3987
1,PeterMcCormack,2453
2,saylor,1988
3,scottmelker,1978
4,CryptoKaleo,1249
5,notsofast,1032
6,100trillionUSD,756
7,jebus911,708
8,mattomattik,654
9,CryptoWendyO,652


In [109]:
club_250.to_csv('../data/processed_data/users_over_250_tweets.csv')

## Step 5 - Download the data

In [209]:
start_date = grouped_data['date'][0]
# start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date

'2021-12-01'

In [210]:
end_date = grouped_data['date'][len(grouped_data)-1]
# end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date

'2021-12-31'

In [211]:
file_name = f"{start_date}_{end_date}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [212]:
file_name = f"{start_date}_{end_date}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")

In [84]:
test = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweets_202102_reduced.csv')
test

Unnamed: 0.1,Unnamed: 0,datetime,username,text,has_#bitcoin,date
0,14003,2021-02-01 22:09:58+00:00,mferreira680,Wohhhh is great!!!!! \nPls give some suvenir ...,True,2021-02-01
1,14004,2021-02-01 22:09:33+00:00,mbs_FiTTrader,Well @elonmusk made a joke about #dodgecoin an...,True,2021-02-01
2,14005,2021-02-01 22:09:32+00:00,oneredwanonly,It's about inequality and unfair markets. Taxe...,True,2021-02-01
3,14006,2021-02-01 22:09:26+00:00,bmurphypointman,#affiliatemarketing #affiliate #travel #touris...,True,2021-02-01
4,14007,2021-02-01 22:09:25+00:00,bmurphypointman,#tumblr #twitter #gifting #facebook #instagram...,True,2021-02-01
...,...,...,...,...,...,...
16214,30217,2021-02-28 06:00:00+00:00,tulipan81,Bitcoin mined blocks distribution per pool in ...,True,2021-02-28
16215,30218,2021-02-28 06:00:00+00:00,tulipan81,Bitcoin free space in mined blocks in last wee...,True,2021-02-28
16216,30219,2021-02-28 06:00:00+00:00,tulipan81,Bitcoin free space in mined blocks in last day...,True,2021-02-28
16217,30220,2021-02-28 06:00:00+00:00,tulipan81,Bitcoin mined blocks distribution per pool in ...,True,2021-02-28


In [30]:
test = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweets_202105_reduced.csv')

In [31]:
test[6690:6691]

Unnamed: 0.1,Unnamed: 0,datetime,username,text,has_#bitcoin,date
6690,68375,2021-05-14 14:07:07+00:00,Titeeeb,We can do this army shiba 🏳️‍🌈🏳️‍⚧️🇺🇳🇦🇫🇿🇦🇦🇴🇦🇩🇩...,True,2021-05-14


In [33]:
test.drop([6690])[6685:6695]

Unnamed: 0.1,Unnamed: 0,datetime,username,text,has_#bitcoin,date
6685,68370,2021-05-14 14:07:50+00:00,SoodiniR,Millions of people across the world living in ...,True,2021-05-14
6686,68371,2021-05-14 14:07:46+00:00,dotheextra,Locked in my profit from ADA ✅ #btc #bitcoin h...,True,2021-05-14
6687,68372,2021-05-14 14:07:36+00:00,RTANGEL11,@Yamiche The Gops attempting to use stefanik t...,True,2021-05-14
6688,68373,2021-05-14 14:07:31+00:00,Dark_Ages_2020,#cryptocurrency mining is swallowing up the su...,True,2021-05-14
6689,68374,2021-05-14 14:07:29+00:00,5Ksana,@NickyPapersNFT My cat Leo loves #bitcoin more...,True,2021-05-14
6691,68376,2021-05-14 14:07:02+00:00,realLooms,@defichain is a #bitcoin fork that doesn't us...,True,2021-05-14
6692,68377,2021-05-14 14:06:56+00:00,terr547,@girlgone_crypto And #bitcoin still barely abo...,True,2021-05-14
6693,68378,2021-05-14 14:06:54+00:00,achim1235,@AltcoinDailyio Check out the one and only „De...,True,2021-05-14
6694,68379,2021-05-14 14:06:51+00:00,Bouillaka_CF,Do you think @elonmusk bought the dip in secre...,True,2021-05-14
6695,68380,2021-05-14 14:06:42+00:00,Dr_CoreyWilson,@DocumentingBTC If #Bitcoin is really so bad f...,True,2021-05-14
