# Data preprocessing 

In [53]:
import os
import json
import gzip
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from urllib.request import urlopen
from numpy.linalg import norm
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

# default plot configurations 
%matplotlib inline 
plt.rcParams['figure.figsize'] = (16,8)
plt.rcParams['figure.dpi'] = 150
sns.set()

## Start loading data

The data we retrived is consisted with 6 days tweets data which is sampled every 30 minutes each day, from April 28th to May 3rd. Meta here stands for the overview snapshot for our whole data set. 

In [70]:
data = json.load(open('dataset/nft_tweets.json'))
data

{'data': [{'public_metrics': {'retweet_count': 0,
    'reply_count': 2,
    'like_count': 1,
    'quote_count': 0},
   'text': 'Yet not a single app works.\n\nWE NEED BADLY a Music NFT app supporting independent artists. \n\n#NFT #crypto #cryptocurrency #cryptocurrencies #blockchain https://t.co/LE2q3nMQks',
   'created_at': '2022-04-28T00:29:47.000Z',
   'id': '1519473920704983042',
   'author_id': '1181641903009341440'},
  {'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0},
   'text': '@binance CYLUM $ project will be one of the most powerful projects. Take a look at this active and powerful project and see what powerful owners and investors it has found during these two months. Be sure to buy CYLUM $ and guarantee your future.\n\n #CZBinance #ElonMusk #NFT #Crypto',
   'created_at': '2022-04-28T00:29:21.000Z',
   'id': '1519473810793242626',
   'author_id': '1441693405583593478'},
  {'public_metrics': {'retweet_count': 0,
    're

In [71]:
data_overview = pd.json_normalize(data)
data_overview.head()

Unnamed: 0,data,includes.users,meta.newest_id,meta.oldest_id,meta.result_count,meta.next_token
0,"[{'public_metrics': {'retweet_count': 0, 'repl...","[{'id': '1181641903009341440', 'public_metrics...",1519473920704983042,1519468776202194945,100,b26v89c19zqg8o3fpytot5bz2omnsan5sm6u283nf87zx


### Load author data

The way we arrange our data is: we have the author information for each tweet, which contains the author's id, the time when this tweet is created, the number of total tweets this author has published so far(public_metrics.retweet_count), the number of replies this author has sent so far(public_metrics.reply_count), the number of likes from other users for this tweet(public_metrics.like_count), and also number of quotes for this tweet(public_metrics.like_count). We have author is for each tweet, and its own id is shown as 'id'. Text stands for the content of this tweet. 

In [55]:
author_data = pd.json_normalize(data,record_path = ['data'])
author_data.drop(columns=['geo.place_id','geo.coordinates.type','geo.coordinates.coordinates'],inplace=True)
author_data.head()

Unnamed: 0,text,created_at,id,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440,0,2,1,0
1,@binance CYLUM $ project will be one of the mo...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478,0,0,0,0
2,@SenorAxie @CryptoGeekzNFT @PickwinkangNFT @gu...,2022-04-28T00:29:09.000Z,1519473760273039360,3008000582,0,0,1,0
3,Big yikes if these AI platforms are rememberin...,2022-04-28T00:28:46.000Z,1519473664294535170,1208948613189160961,0,3,5,0
4,Put your NFT's on Public Display..\nAnywhere.....,2022-04-28T00:28:33.000Z,1519473608237662210,2542802983,0,0,1,0


### Follower data

We have also collected detailed data about numebr of fans, number of following counts, number of tweets of each user. There data can be helpful when we analysis the influences of social media has for the NFT market price. 

In [56]:
follower_data_raw = data["includes"]["users"]
follower_data_raw

[{'id': '1181641903009341440',
  'public_metrics': {'followers_count': 509,
   'following_count': 512,
   'tweet_count': 39015,
   'listed_count': 6},
  'username': 'cryptoken_board',
  'name': 'cryptoken board ℠'},
 {'id': '1441693405583593478',
  'public_metrics': {'followers_count': 5,
   'following_count': 56,
   'tweet_count': 414,
   'listed_count': 3},
  'username': 'AliGhiasvand77',
  'name': 'Anonymous boy'},
 {'id': '3008000582',
  'public_metrics': {'followers_count': 556,
   'following_count': 1320,
   'tweet_count': 10098,
   'listed_count': 2},
  'username': 'Jeruk4444',
  'name': '█║Nash║▌🌟'},
 {'id': '1208948613189160961',
  'public_metrics': {'followers_count': 477,
   'following_count': 74,
   'tweet_count': 335,
   'listed_count': 3},
  'username': 'BeckerrJon',
  'name': 'Jonathan Becker | jbecker.eth'},
 {'id': '2542802983',
  'public_metrics': {'followers_count': 433,
   'following_count': 3641,
   'tweet_count': 6029,
   'listed_count': 11},
  'username': 'DaLeew

This function is used to solve nested sturcture in our dataset. 

In [57]:
def dropnested(alist):
    outputdict = {}
    for dic in alist:
        for key, value in dic.items():
            if isinstance(value, dict):
                for k2, v2, in value.items():
                    outputdict[k2] = outputdict.get(k2, []) + [v2]
            else:
                outputdict[key] = outputdict.get(key, []) + [value]
    return outputdict   

Unnamed: 0,id,followers_count,following_count,tweet_count,listed_count,username,name
0,1181641903009341440,509,512,39015,6,cryptoken_board,cryptoken board ℠
1,1441693405583593478,5,56,414,3,AliGhiasvand77,Anonymous boy
2,3008000582,556,1320,10098,2,Jeruk4444,█║Nash║▌🌟
3,1208948613189160961,477,74,335,3,BeckerrJon,Jonathan Becker | jbecker.eth
4,2542802983,433,3641,6029,11,DaLeewoke,DaLee Woke


In [72]:
follower_data = pd.DataFrame.from_dict(dropnested(follower_data_raw))
follower_data.head()

Unnamed: 0,id,followers_count,following_count,tweet_count,listed_count,username,name
0,1181641903009341440,509,512,39015,6,cryptoken_board,cryptoken board ℠
1,1441693405583593478,5,56,414,3,AliGhiasvand77,Anonymous boy
2,3008000582,556,1320,10098,2,Jeruk4444,█║Nash║▌🌟
3,1208948613189160961,477,74,335,3,BeckerrJon,Jonathan Becker | jbecker.eth
4,2542802983,433,3641,6029,11,DaLeewoke,DaLee Woke


### Join two tables together 

Now, we combine these two dataframe together based on tweets' id to have a completed view of the whole dataset. 

In [64]:
complete_data = pd.merge(author_data, follower_data, on='id', how='outer')
complete_data = complete_data.fillna(0)
complete_data['text'] = complete_data['text'].astype('str')
complete_data.head()

Unnamed: 0,text,created_at,id,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,followers_count,following_count,tweet_count,listed_count,username,name
0,Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
1,@binance CYLUM $ project will be one of the mo...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,@SenorAxie @CryptoGeekzNFT @PickwinkangNFT @gu...,2022-04-28T00:29:09.000Z,1519473760273039360,3008000582,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
3,Big yikes if these AI platforms are rememberin...,2022-04-28T00:28:46.000Z,1519473664294535170,1208948613189160961,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0
4,Put your NFT's on Public Display..\nAnywhere.....,2022-04-28T00:28:33.000Z,1519473608237662210,2542802983,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0


### Process twitter text 

The tweets data we directly fetch from social media may contain invalid format for our sentiment analysis model. Therefore, we should do some preprocessing on tweets' contents. 

In [66]:
import re # regular expressions
from tqdm import tnrange, tqdm_notebook, tqdm

for i,s in enumerate(tqdm(complete_data['text'])):
    text = complete_data.loc[i, 'text']
    text = text.replace("#", "")
    text = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text, flags=re.MULTILINE)
    text = re.sub('@\\w+ *', '', text, flags=re.MULTILINE)
    complete_data.loc[i, 'text'] = text

complete_data.head()


100%|████████████████████████████████████| 44037/44037 [01:25<00:00, 516.44it/s]


Unnamed: 0,text,created_at,id,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,followers_count,following_count,tweet_count,listed_count,username,name,Text
0,Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,Yet not a single app works.\n\nWE NEED BADLY a...
1,CYLUM $ project will be one of the most powerf...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,CYLUM $ project will be one of the most powerf...
2,\n\nElom musk Buy twitter its signal if Crypto...,2022-04-28T00:29:09.000Z,1519473760273039360,3008000582,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,\n\nElom musk Buy twitter its signal if Crypto...
3,Big yikes if these AI platforms are rememberin...,2022-04-28T00:28:46.000Z,1519473664294535170,1208948613189160961,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0,Big yikes if these AI platforms are rememberin...
4,Put your NFT's on Public Display..\nAnywhere.....,2022-04-28T00:28:33.000Z,1519473608237662210,2542802983,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,Put your NFT's on Public Display..\nAnywhere.....


### Reformat created date data

We also collect NFT market price data from https://www.binance.com/en/price/nft, however, the timestamp from this website has a different format from the created date data in our twitter dataset. To be more detailed, it doesn't  accurate to the second. Therefore, we will reformat out creat date here to prepare for our correlation analysis procedure.  

### Sentiment analysis with Vader

In [74]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tnrange, tqdm_notebook, tqdm

In [75]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(complete_data['text'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
complete_data["compound"] = compound
complete_data.head(2)

100%|██████████████████████████████████| 44037/44037 [00:04<00:00, 10110.06it/s]


Unnamed: 0,text,created_at,id,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,followers_count,following_count,tweet_count,listed_count,username,name,Text,compound
0,Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,Yet not a single app works.\n\nWE NEED BADLY a...,-0.2342
1,CYLUM $ project will be one of the most powerf...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,CYLUM $ project will be one of the most powerf...,0.9339


### Calculate a score for each tweet

The compound score for each tweet can only shows the result of sentiment analysis. However, we are talking about the influence from social media on NFT market price, which means that different user may have different influences. We should also consider number of followrs of a author, number of likes received for each tweet, as well as number of retweets. We also would like to know the differences between these parameters, and the total influences they made together. In this part, we computing different scores based on several parameters.

In [79]:
score1,score2,score3, score4 = [],[],[],[]
for i, s in tqdm(complete_data.iterrows(), total=complete_data.shape[0]):
    score1.append(s["compound"] * ((s["public_metrics.like_count"]+1))) # consider number of likes
    score2.append(s["compound"] * ((s["followers_count"]+1))) # consider number of followers
    score3.append(s["compound"] * ((s["public_metrics.retweet_count"]+1))) # consider number of retweets 
    score4.append(s["compound"] * ((s["followers_count"]+1)) * ((s["public_metrics.like_count"]+1)) * ((s["public_metrics.retweet_count"]+1))) # consider all of these 
    
complete_data["like_socre"] = score1
complete_data["follower_socre"] = score2 
complete_data["retweet_socre"] = score3
complete_data["overall_socre"] = score4
complete_data.head()

100%|██████████████████████████████████| 44037/44037 [00:04<00:00, 10905.42it/s]


Unnamed: 0,text,created_at,id,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,followers_count,following_count,tweet_count,listed_count,username,name,Text,compound,like_socre,follower_socre,retweet_socre,overall_socre
0,Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,Yet not a single app works.\n\nWE NEED BADLY a...,-0.2342,-0.4684,-0.2342,-0.2342,-0.4684
1,CYLUM $ project will be one of the most powerf...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,CYLUM $ project will be one of the most powerf...,0.9339,0.9339,0.9339,0.9339,0.9339
2,\n\nElom musk Buy twitter its signal if Crypto...,2022-04-28T00:29:09.000Z,1519473760273039360,3008000582,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,\n\nElom musk Buy twitter its signal if Crypto...,0.7003,1.4006,0.7003,0.7003,1.4006
3,Big yikes if these AI platforms are rememberin...,2022-04-28T00:28:46.000Z,1519473664294535170,1208948613189160961,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0,Big yikes if these AI platforms are rememberin...,0.0,0.0,0.0,0.0,0.0
4,Put your NFT's on Public Display..\nAnywhere.....,2022-04-28T00:28:33.000Z,1519473608237662210,2542802983,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,Put your NFT's on Public Display..\nAnywhere.....,0.0,0.0,0.0,0.0,0.0


### Crypto currency data preprocessing 

### Correlation analysis between the NFT market price and Twitter

### Prediction model 