# Data preprocessing 

In [57]:
import os
import json
import gzip
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from urllib.request import urlopen
from numpy.linalg import norm
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

# default plot configurations 
%matplotlib inline 
plt.rcParams['figure.figsize'] = (16,8)
plt.rcParams['figure.dpi'] = 150
sns.set()

## Start loading data

In [61]:
data = json.load(open('dataset/nft_tweets.json'))

In [71]:
df = pd.read_json('dataset/nft_tweets.json')

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

### Load author data

In [62]:
author_data = pd.DataFrame(data["data"])
author_data.groupby("author_id")
author_data = author_data.drop(columns="geo")
author_data

Unnamed: 0,public_metrics,text,created_at,id,author_id
0,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",Yet not a single app works.\n\nWE NEED BADLY a...,2022-04-28T00:29:47.000Z,1519473920704983042,1181641903009341440
1,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@binance CYLUM $ project will be one of the mo...,2022-04-28T00:29:21.000Z,1519473810793242626,1441693405583593478
2,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@SenorAxie @CryptoGeekzNFT @PickwinkangNFT @gu...,2022-04-28T00:29:09.000Z,1519473760273039360,3008000582
3,"{'retweet_count': 0, 'reply_count': 3, 'like_c...",Big yikes if these AI platforms are rememberin...,2022-04-28T00:28:46.000Z,1519473664294535170,1208948613189160961
4,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Put your NFT's on Public Display..\nAnywhere.....,2022-04-28T00:28:33.000Z,1519473608237662210,2542802983
...,...,...,...,...,...
28728,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",👉 I just joined the movement! #BlueZilla's mak...,2022-05-03T23:49:34.000Z,1521638124455145472,755561434931572737
28729,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",👉 I just joined the movement! #BlueZilla's mak...,2022-05-03T23:49:24.000Z,1521638083749453825,1464617797011623939
28730,"{'retweet_count': 7, 'reply_count': 6, 'like_c...",FairLaunch coming soon at Pinksale!\n\nStart: ...,2022-05-03T23:49:13.000Z,1521638038568411137,1519004956136816642
28731,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",The only NFT collection that gives you access ...,2022-05-03T23:49:06.000Z,1521638009032093696,1517595559011528705


### Follower data

In [69]:
follower_data_raw = data["includes"]
follower_data_raw

{'users': [{'id': '1181641903009341440',
   'public_metrics': {'followers_count': 509,
    'following_count': 512,
    'tweet_count': 39015,
    'listed_count': 6},
   'username': 'cryptoken_board',
   'name': 'cryptoken board ℠'},
  {'id': '1441693405583593478',
   'public_metrics': {'followers_count': 5,
    'following_count': 56,
    'tweet_count': 414,
    'listed_count': 3},
   'username': 'AliGhiasvand77',
   'name': 'Anonymous boy'},
  {'id': '3008000582',
   'public_metrics': {'followers_count': 556,
    'following_count': 1320,
    'tweet_count': 10098,
    'listed_count': 2},
   'username': 'Jeruk4444',
   'name': '█║Nash║▌🌟'},
  {'id': '1208948613189160961',
   'public_metrics': {'followers_count': 477,
    'following_count': 74,
    'tweet_count': 335,
    'listed_count': 3},
   'username': 'BeckerrJon',
   'name': 'Jonathan Becker | jbecker.eth'},
  {'id': '2542802983',
   'public_metrics': {'followers_count': 433,
    'following_count': 3641,
    'tweet_count': 6029,
    

In [70]:
follower_data = pd.DataFrame(follower_data_raw["users"])
# follower_data.groupby("author_id")
# follower_data = follower_data.drop(columns="geo")
follower_data

Unnamed: 0,id,public_metrics,username,name
0,1181641903009341440,"{'followers_count': 509, 'following_count': 51...",cryptoken_board,cryptoken board ℠
1,1441693405583593478,"{'followers_count': 5, 'following_count': 56, ...",AliGhiasvand77,Anonymous boy
2,3008000582,"{'followers_count': 556, 'following_count': 13...",Jeruk4444,█║Nash║▌🌟
3,1208948613189160961,"{'followers_count': 477, 'following_count': 74...",BeckerrJon,Jonathan Becker | jbecker.eth
4,2542802983,"{'followers_count': 433, 'following_count': 36...",DaLeewoke,DaLee Woke
...,...,...,...,...
15299,755561434931572737,"{'followers_count': 7, 'following_count': 65, ...",omar_fahimi99,mr zbida
15300,1464617797011623939,"{'followers_count': 0, 'following_count': 11, ...",WirghtElizabeth,Wirght Elizabeth
15301,1519004956136816642,"{'followers_count': 1620, 'following_count': 8...",MoveningE,Movening Official
15302,1517595559011528705,"{'followers_count': 6463, 'following_count': 2...",tradebots_sol,Trade Bots
