In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
pd.__version__ 

'1.3.5'

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Mount Google Drive

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [6]:
# 2.1 Check FDBMS Final Project
!ls /content/drive/MyDrive/FDBMS-Final-project

2018-2021-Comments_Instagram_AmberHeard_Data.csv  IDNodes.csv
Edges.csv					  UserNodes.csv


In [7]:
# Change current directory
path = "/content/drive/MyDrive/FDBMS-Final-project"
os.chdir(path)
os.listdir()

['2018-2021-Comments_Instagram_AmberHeard_Data.csv',
 'UserNodes.csv',
 'IDNodes.csv',
 'Edges.csv']

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read full data and store a sample

In [9]:
df = pd.read_csv("2018-2021-Comments_Instagram_AmberHeard_Data.csv",
                  usecols = ["message", "n_replies", "n_likes", "from.id", "from.username", "from.is_verified" ])

In [10]:
df.head()

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
0,#scarlettjohansson #alexandradaddario #emiliac...,0.0,0.0,3988264000.0,scarlettjohansonoffcial,False
1,🔥🔥❤️,0.0,0.0,3988264000.0,scarlettjohansonoffcial,False
2,Very beautiful 😘❤️❤️❤️i love you very much 😍💘💘...,0.0,0.0,12302010000.0,fr.ederic2368,False
3,❤️❤️❤️❤️❤️❤️,0.0,0.0,2490417000.0,muataz_87,False
4,Bombón bello 💖💖💖💖💖,0.0,0.0,1521544000.0,jorgero122129,False


In [11]:
df.tail()

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
1973564,Damn,0.0,0.0,185358200.0,jpjmg,False
1973565,🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥,0.0,0.0,34632890000.0,sard.ar4835,False
1973566,🥰😍😘❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️❤️💋❤️💋,0.0,0.0,7202432000.0,daigle46,False
1973567,Beautiful,0.0,0.0,32074630000.0,davidwells305,False
1973568,Wow,0.0,0.0,5878318000.0,blydel80,False


In [12]:
df.shape

(1973569, 6)

In [13]:
df.dtypes

message              object
n_replies           float64
n_likes             float64
from.id             float64
from.username        object
from.is_verified     object
dtype: object

In [14]:
# Is data balanced?

df['from.is_verified'].value_counts()
print("\n")
df['from.is_verified'].value_counts(normalize = True)

False    1449932
True        2167
Name: from.is_verified, dtype: int64





False    0.998508
True     0.001492
Name: from.is_verified, dtype: float64

In [15]:
df.isnull().sum()

message                  0
n_replies           521470
n_likes                 47
from.id                 47
from.username           47
from.is_verified    521470
dtype: int64

### Store a sample of data

In [16]:
# As RAM is limited, we will work with a sample of 300000 entries
df = df.sample(5000)

In [17]:
df.isnull().sum()

message                0
n_replies           1294
n_likes                0
from.id                0
from.username          0
from.is_verified    1294
dtype: int64

In [18]:
df['n_replies'].fillna(0, inplace=True)
df['from.is_verified'].fillna("NotKnown", inplace=True)

In [19]:
df.isnull().sum()

message             0
n_replies           0
n_likes             0
from.id             0
from.username       0
from.is_verified    0
dtype: int64

In [20]:
df.head()
df.shape

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
58009,Lindas preciosa,0.0,0.0,3125513000.0,bloiseralmanzar,False
954935,💖💖😍,0.0,0.0,4244622000.0,edissonrodriguezf,False
595709,My fave!😍😍😍🔱,0.0,0.0,11093370000.0,thefootballguy81,False
371903,😍😍😍hello Baby,0.0,0.0,44496120000.0,gerhardstrefner1,False
286199,She is so beautiful,0.0,0.0,2554059000.0,hoffmann8935,False


(5000, 6)

### **Change from.username**

In [21]:
# To distinguish userids on network graph,
# we will prefix userids with 'u' and also assign
# them a short name so that they fit within nodes when displayed on graph

u_user = df['from.username'].unique()

In [22]:
u_user.sort()
u_user
print("\n")  
len(u_user)

array(['02sasha16', '0730bond', '0987jugo', ..., 'zyiahtyler',
       'zynpileri', 'zzumik'], dtype=object)





4569

In [23]:
u_code = ["u" + str(i) for i in range(len(u_user)) ]
u_code[:5]

['u0', 'u1', 'u2', 'u3', 'u4']

In [24]:
map_dict = dict(zip(u_user, u_code))

In [25]:
list(map_dict.items())[:3]

[('02sasha16', 'u0'), ('0730bond', 'u1'), ('0987jugo', 'u2')]

In [26]:
df['from.username'] = df['from.username'].replace(map_dict, inplace= False)

In [27]:
df.head()

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
58009,Lindas preciosa,0.0,0.0,3125513000.0,u725,False
954935,💖💖😍,0.0,0.0,4244622000.0,u1272,False
595709,My fave!😍😍😍🔱,0.0,0.0,11093370000.0,u4137,False
371903,😍😍😍hello Baby,0.0,0.0,44496120000.0,u1611,False
286199,She is so beautiful,0.0,0.0,2554059000.0,u1807,False


### **Change from.id**

In [28]:
i_user = df['from.id'].unique()

In [29]:
i_user.sort()
i_user
print("\n")  
len(i_user)

array([2.48937600e+06, 2.60118100e+06, 3.38923100e+06, ...,
       4.84110303e+10, 4.87451531e+10, 4.87919559e+10])





4568

In [30]:
i_code = ["i" + str(i) for i in range(len(i_user)) ]
i_code[:5]

['i0', 'i1', 'i2', 'i3', 'i4']

In [31]:
map_dict = dict(zip(i_user, i_code))

In [32]:
list(map_dict.items())[:3]

[(2489376.0, 'i0'), (2601181.0, 'i1'), (3389231.0, 'i2')]

In [33]:
df['from.id'] = df['from.id'].replace(map_dict, inplace= False)

In [34]:
df.head()

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
58009,Lindas preciosa,0.0,0.0,i1498,u725,False
954935,💖💖😍,0.0,0.0,i1894,u1272,False
595709,My fave!😍😍😍🔱,0.0,0.0,i3309,u4137,False
371903,😍😍😍hello Baby,0.0,0.0,i4383,u1611,False
286199,She is so beautiful,0.0,0.0,i1373,u1807,False


## **Username Nodes**

In [35]:
# Group by from.username to create nodes by no. of likes

grpd_user = df.groupby(['from.username'])
user_nodes = grpd_user.agg({'n_likes' : [('u_min','min'),('u_max','max'),('u_mean','mean')] }).reset_index()
user_nodes = user_nodes.round(decimals = 2)
user_nodes.head()
user_nodes.shape

Unnamed: 0_level_0,from.username,n_likes,n_likes,n_likes
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean
0,u0,2.0,2.0,2.0
1,u1,0.0,0.0,0.0
2,u10,0.0,0.0,0.0
3,u100,1.0,1.0,1.0
4,u1000,5.0,5.0,5.0


(4569, 4)

In [36]:
# Which of the users have non-verified accounts

users_nverif = df.loc[df['from.is_verified'] == False, 'from.username' ].unique()
len(users_nverif)

3418

In [37]:
df['from.is_verified'].value_counts()

False       3696
NotKnown    1294
True          10
Name: from.is_verified, dtype: int64

In [38]:
# Adding column 'verification_status' with default value of 'form.is_verified'

user_nodes['verification_status'] = df['from.is_verified']
user_nodes['Cat'] = 'InstagramUser'

In [39]:
# Creating a 'Label' column

user_nodes['Label'] = user_nodes['from.username']
user_nodes = user_nodes.rename(columns = {'from.username': "ID"})

In [40]:
user_nodes.head()

Unnamed: 0_level_0,ID,n_likes,n_likes,n_likes,verification_status,Cat,Label
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,u0,2.0,2.0,2.0,,InstagramUser,u0
1,u1,0.0,0.0,0.0,,InstagramUser,u1
2,u10,0.0,0.0,0.0,,InstagramUser,u10
3,u100,1.0,1.0,1.0,,InstagramUser,u100
4,u1000,5.0,5.0,5.0,,InstagramUser,u1000


In [41]:
# We also create 'verification_status_n' that records;
# 1 for True
# 0 for False
# 2 for NotKnown

user_nodes['verification_status_n'] = user_nodes['verification_status']
user_nodes['verification_status_n'] = user_nodes['verification_status_n'].map({True : 1 , False : 0, "NotKnown" : 2})

In [42]:
user_nodes.head()

Unnamed: 0_level_0,ID,n_likes,n_likes,n_likes,verification_status,Cat,Label,verification_status_n
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,u0,2.0,2.0,2.0,,InstagramUser,u0,
1,u1,0.0,0.0,0.0,,InstagramUser,u1,
2,u10,0.0,0.0,0.0,,InstagramUser,u10,
3,u100,1.0,1.0,1.0,,InstagramUser,u100,
4,u1000,5.0,5.0,5.0,,InstagramUser,u1000,


In [43]:
user_nodes.to_csv("UserNodes.csv",
                  index = False,
                  sep = ";"
                  )

## **ID Nodes**

In [44]:
df.head()

Unnamed: 0,message,n_replies,n_likes,from.id,from.username,from.is_verified
58009,Lindas preciosa,0.0,0.0,i1498,u725,False
954935,💖💖😍,0.0,0.0,i1894,u1272,False
595709,My fave!😍😍😍🔱,0.0,0.0,i3309,u4137,False
371903,😍😍😍hello Baby,0.0,0.0,i4383,u1611,False
286199,She is so beautiful,0.0,0.0,i1373,u1807,False


In [45]:
# Group by from.id to create nodes by no. of Likes

grpd_id = df.groupby(['from.id'])
id_nodes = grpd_id.agg({'n_likes' : [('id_min','min'),('id_max','max'),('id_mean','mean')] }).reset_index()
id_nodes = id_nodes.round(decimals = 2)
id_nodes.head()
id_nodes.shape

Unnamed: 0_level_0,from.id,n_likes,n_likes,n_likes
Unnamed: 0_level_1,Unnamed: 1_level_1,id_min,id_max,id_mean
0,i0,2.0,2.0,2.0
1,i1,1.0,1.0,1.0
2,i10,15.0,15.0,15.0
3,i100,0.0,0.0,0.0
4,i1000,0.0,0.0,0.0


(4568, 4)

In [46]:
# Which of the IDs have non-verified accounts

df.loc[df['from.is_verified'] == False, 'from.id'].unique()
id_nverif = df.loc[df['from.is_verified'] == False, 'from.id'].unique()
len(id_nverif)

array(['i1498', 'i1894', 'i3309', ..., 'i1397', 'i2162', 'i4513'],
      dtype=object)

3418

In [47]:
# Adding column 'verified_status' with default value of 'form.is_verified'

id_nodes['verified_status'] = df['from.is_verified']
id_nodes['Cat'] = 'InstaID'

In [48]:
# Creating a 'Label' column

id_nodes['Label'] = id_nodes['from.id']
id_nodes = id_nodes.rename(columns = {'from.id': "ID"})

In [49]:
id_nodes.head()

Unnamed: 0_level_0,ID,n_likes,n_likes,n_likes,verified_status,Cat,Label
Unnamed: 0_level_1,Unnamed: 1_level_1,id_min,id_max,id_mean,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,i0,2.0,2.0,2.0,,InstaID,i0
1,i1,1.0,1.0,1.0,,InstaID,i1
2,i10,15.0,15.0,15.0,,InstaID,i10
3,i100,0.0,0.0,0.0,,InstaID,i100
4,i1000,0.0,0.0,0.0,,InstaID,i1000


In [50]:
# We also create 'verification_status_n' that records;
# 1 for True
# 0 for False
# 2 for NotKnown

id_nodes['verified_status_n'] = id_nodes['verified_status']
id_nodes['verified_status_n'] = id_nodes['verified_status_n'].map({True : 1 , False : 0, "NotKnown" : 2})

In [51]:
id_nodes.head()

Unnamed: 0_level_0,ID,n_likes,n_likes,n_likes,verified_status,Cat,Label,verified_status_n
Unnamed: 0_level_1,Unnamed: 1_level_1,id_min,id_max,id_mean,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,i0,2.0,2.0,2.0,,InstaID,i0,
1,i1,1.0,1.0,1.0,,InstaID,i1,
2,i10,15.0,15.0,15.0,,InstaID,i10,
3,i100,0.0,0.0,0.0,,InstaID,i100,
4,i1000,0.0,0.0,0.0,,InstaID,i1000,


In [52]:
id_nodes.to_csv("IDNodes.csv",
                  index = False,
                  sep = ";"
                  )

## **Edges**

In [53]:
# Group by User and ID:

edges = df.groupby([df['from.username'], df['from.id']]).size()

In [54]:
# The size column has a name '0'. We need to rename it:

edges = df[['from.username', 'from.id']].groupby(['from.username', 'from.id']).size().reset_index()
edges = edges.rename(columns = { 0 : "weight"})
edges.head()


Unnamed: 0,from.username,from.id,weight
0,u0,i2598,1
1,u1,i146,1
2,u10,i1477,1
3,u100,i1265,1
4,u1000,i621,1


In [55]:
edges = edges.rename(columns = {'from.username': 'Source', 'from.id': 'Target'})

In [56]:
# Our edges are Undirected:

edges['Type']= 'Undirected'
edges.head()

Unnamed: 0,Source,Target,weight,Type
0,u0,i2598,1,Undirected
1,u1,i146,1,Undirected
2,u10,i1477,1,Undirected
3,u100,i1265,1,Undirected
4,u1000,i621,1,Undirected


In [57]:
edges.to_csv("Edges.csv",
                  index = False,
                  sep = ";"
                  )