In [152]:
from random import choice
import json
import re
import os
import requests
import time
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
 
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
]
 
def get_nested(data, *args): #Recursive function to get nested json params
    if args and data:
        element  = args[0]
        if element:
            value = data.get(element)
            return value if len(args) == 1 else get_nested(value, *args[1:])   
     
 
class InstagramScraper:
 
    def __init__(self, user_agents=None, proxy=None):
        self.user_agents = user_agents
        self.proxy = proxy
 
    def __random_agent(self):
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)
 
    def __request_url(self, url):
        try:
            response = requests.get(url, headers={'User-Agent': self.__random_agent()}, proxies={'http': self.proxy,
                                                                                                 'https': self.proxy})
            response.raise_for_status()
        except requests.HTTPError:
            raise requests.HTTPError('Received non 200 status code from Instagram')
        except requests.RequestException:
            raise requests.RequestException
        else:
            return response.text
        
   

    
    @staticmethod
    def extract_json_data(html):
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        script_tag = body.find('script')
        raw_string = script_tag.text.strip().replace('window._sharedData =', '').replace(';', '')
        return json.loads(raw_string)
 
    def profile_page_metrics(self, profile_url):
        if not os.path.exists("InstagramImages/"+profile_url):
            os.makedirs("InstagramImages/"+profile_url) #create path for images
        ig_url = 'https://www.instagram.com/'+profile_url
        results = {}
        try:
            response = self.__request_url(ig_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        except Exception as e:
            raise e
        else:
            for key, value in metrics.items():
                    if value and isinstance(value, dict):
                        value = value['count']
                        results[key] = value
                    else: #value:
                        results[key] = value

        urllib.request.urlretrieve(results['profile_pic_url'],'InstagramImages/'+profile_url+'/profile.jpg') #save all images    
        return results

        
   
    def profile_page_recent_posts(self, profile_url):
        ig_url = 'https://www.instagram.com/'+profile_url

        big_results = []  
        i=1
        try:
            response = self.__request_url(ig_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
        except Exception as e:
            raise e
        else:
            for node in metrics:
                
                results = {}
                node = node.get('node')
                for key, value in node.items():
                    if value and isinstance(value, dict):
                        if key == 'edge_media_to_caption':
                            temp = get_nested(value, "edges")
                            if temp:
                                results[key] = get_nested(temp[0],"node", "text")
                            else:
                                results[key] = "None"
                        elif key == 'edge_media_to_comment':
                            results[key] = value['count']
                        elif key == 'edge_liked_by':
                            results[key] = value['count']
                        elif key == 'edge_media_preview_like':
                            results[key] = value['count'] 
                        elif key == 'location':
                            results[key] = value['name']
                        else:
                            results[key] = value
                    else:
                        results[key] = value
                urllib.request.urlretrieve(node['display_url'],'InstagramImages/'+profile_url+'/%i.jpg'%i)    
                i+=1
                big_results.append(results)
        
        return big_results
    
    def createProfileData(self, profile_url): #function to create json for user
        pageResults = self.profile_page_metrics(profile_url)
        postResults = self.profile_page_recent_posts(profile_url)
        if not postResults:
            return
        pageResults['posts']=postResults
        with open('UserData/'+profile_url+'.json', 'w', encoding="utf-8") as f:
          json.dump(pageResults, f, ensure_ascii=False)

In [2]:
with open('final_user_list.txt', 'r') as f:
    newList = f.read().splitlines()

In [89]:
len(newList)

1947

In [421]:
second_List = newList[1903:] #code to run scraper in parts

In [None]:
#to remove 
#cutegirlshairstyles
#1misssmeis
#the_salty_blonde
#ohwowfashion
#janeteaestranha
#xeniaoverdose
#emilyostbro
#wristtakers
#pilotpatric
#sherifartisan
#itsanuthida
#yarasantosoficial
#tomer_gelb
#abanddoned
#thevisualscollective
#forn_sant_francesc
#elenacarriere.official
#nadiadamaso_ebnl
#madwhips_italian
#leagueofupdates
#lyzabeth_lopez
#lenas_view

In [424]:
instagram_scraper = InstagramScraper()
for item in second_List:
    instagram_scraper.createProfileData(item) #scrape

In [432]:
UserDataPath = "UserData/"
userDataList = []
for file in os.listdir(UserDataPath): #create filelist with all user json files
    if file.endswith(".json"):
        userDataList.append(file)

In [457]:
len(userDataList)

1897

In [460]:
import pandas as pd
import numpy as np
firstPostDone=0
firstUserDone=0

profileColumns = ['biography', 'business_category_name', 'connected_fb_page', 'country_block', 'edge_felix_video_timeline', 'edge_follow', 'edge_followed_by', 'edge_media_collections','edge_owner_to_timeline_media','edge_saved_media','external_url','external_url_linkshimmed','full_name','has_channel','highlight_reel_count', 'id','is_business_account','is_joined_recently','is_private','is_verified', 'profile_pic_url', 'profile_pic_url_hd', 'username'] 
postColumns = ['accessibility_caption', 'comments_disabled', 'edge_liked_by', 'display_url', 'edge_media_preview_like', 'edge_media_to_caption', 'edge_media_to_comment', 'is_video', 'location', 'taken_at_timestamp', 'hashtags', 'mentions','time_between','number_of_likes/mean', 'number_of_likes-median']   
for k in userDataList:
    with open(Path + str(k), encoding="utf-8") as json_data:
        data = json.load(json_data) #open file

        profileData=dict((k, data[k]) for k in profileColumns if k in data) #create dict from json
        profileDF=pd.DataFrame.from_dict(profileData, orient='index').T  #create dataframe

        postData = data['posts']
        for i in range(1,len(postData)): #run for all posts
            for k in postColumns:
                if k in postData[i]:
                    if isinstance(postData[i][k], str):
                        postData[i][k]=postData[i][k].replace('\n','')
                else:
                    postData[i][k] = "None"
                if k == 'edge_media_to_caption':
                    postData[i]["hashtags"] = [i[1:] for i in postData[i][k].split() if i.startswith("#")] #create hashtag field
                    postData[i]["mentions"] = [i[1:] for i in postData[i][k].split() if i.startswith("@")] #create mentions field  
            post = dict((k, postData[i][k]) for k in postColumns if k in postData[i])
            
            if (firstPostDone == 0):
                totalPosts = np.hstack((profileDF, pd.DataFrame.from_dict(post, orient='index').T))
                firstPostDone=1
            else:
                nextPost = np.hstack((profileDF, pd.DataFrame.from_dict(post, orient='index').T))
                totalPosts = np.vstack((totalPosts,nextPost)) 
                
        intermediateDF=pd.DataFrame(totalPosts, columns=list((profileColumns))+list((postColumns)))
        intermediateDF['time_between'] = intermediateDF['taken_at_timestamp'] - intermediateDF['taken_at_timestamp'].shift(-1)
        intermediateDF['time_between'].fillna((intermediateDF['time_between'].mean()), inplace=True)
        intermediateDF['number_of_likes/mean'] = intermediateDF['edge_liked_by'].divide(intermediateDF['edge_liked_by'].mean())
        intermediateDF['number_of_likes-median'] = intermediateDF['edge_liked_by'].subtract(intermediateDF['edge_liked_by'].median())
                     
        if (firstUserDone == 0):
            aggregateData = intermediateDF
            firstUserDone=1
        else:
            aggregateData=np.vstack((aggregateData,pd.DataFrame(intermediateDF, columns=list((profileColumns))+list((postColumns)))))          
        firstPostDone=0  

In [461]:
InstagramDataset=pd.DataFrame(aggregateData, columns=profileColumns+postColumns)

In [462]:
InstagramDataset

Unnamed: 0,biography,business_category_name,connected_fb_page,country_block,edge_felix_video_timeline,edge_follow,edge_followed_by,edge_media_collections,edge_owner_to_timeline_media,edge_saved_media,...,edge_media_to_caption,edge_media_to_comment,is_video,location,taken_at_timestamp,hashtags,mentions,time_between,number_of_likes/mean,number_of_likes-median
0,,,,False,0,11,1927598,0,19,0,...,O'CLOCK이 발매되었습니다!!! 여러분들의 많은 관심과 사랑부탁드릴께요❤️,14032,False,,1553620976,[],[],1.07243e+06,0.697678,-274840
1,,,,False,0,11,1927598,0,19,0,...,즐거운 화이트데이~~메이트데이~~~🧛‍♂️❤️,35200,False,,1552548548,[],[],60664,1.13228,55050
2,,,,False,0,11,1927598,0,19,0,...,O'CLOCK 곧 12시죠~! 두구두구 티저 이미지 공개! 😆,10813,False,,1552487884,[],[],458221,0.641902,-317178
3,,,,False,0,11,1927598,0,19,0,...,,25322,False,,1552029663,[],[],1.89476e+06,1.07279,9896
4,,,,False,0,11,1927598,0,19,0,...,오늘 발렌타인데이인데 초콜렛말고 제 하트받으실래요?,30816,False,,1550134904,[],[],409972,1.05975,0
5,,,,False,0,11,1927598,0,19,0,...,빼꼼! 메이 사랑해요~~~,24037,False,,1549724932,[],[],438860,1.17763,89474
6,,,,False,0,11,1927598,0,19,0,...,복 많이받으세요!!!!❤️😆,29381,False,,1549286072,[],[],177578,1.10168,31824
7,,,,False,0,11,1927598,0,19,0,...,이젠 어렵지 않아요,35140,False,,1549108494,[],[],84033,1.15427,71747
8,,,,False,0,11,1927598,0,19,0,...,다음번에는 예고없이 바로 방송킬꺼에요🤨,24378,False,,1549024461,[],[],14841,1.01291,-35554
9,,,,False,0,11,1927598,0,19,0,...,곧 만나요🐰 먕먕~~!,26407,False,,1549009620,[],[],62623,1.05749,-1718


In [472]:
InstagramDataset.to_csv('InstagramDataset.csv', index=False)

In [473]:
InstagramDataset['day'] = InstagramDataset['taken_at_timestamp'].apply(lambda x: (datetime.datetime.fromtimestamp(x)).strftime('%A'))

In [474]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
data1_dummy = pd.get_dummies(InstagramDataset["day"])
InstagramDataset = InstagramDataset.join(data1_dummy)

In [475]:
InstagramDataset['hour_of_day'] = InstagramDataset['taken_at_timestamp'].apply(lambda x: int((datetime.datetime.fromtimestamp(x)).strftime('%H')))

In [476]:
InstagramDataset['hr_sin'] = np.sin(InstagramDataset.hour_of_day*(2.*np.pi/24))
InstagramDataset['hr_cos'] = np.cos(InstagramDataset.hour_of_day*(2.*np.pi/24))

In [477]:
bins = [0, 4, 8, 12, 16, 20, 24]
InstagramDataset['HourBin'] = pd.cut(InstagramDataset['hour_of_day'], bins)

In [491]:
InstagramDataset = pd.concat([InstagramDataset,pd.get_dummies(InstagramDataset['HourBin'])],axis=1)

In [492]:
InstagramDataset

Unnamed: 0,biography,business_category_name,connected_fb_page,country_block,edge_felix_video_timeline,edge_follow,edge_followed_by,edge_media_collections,edge_owner_to_timeline_media,edge_saved_media,...,hour_of_day,hr_sin,hr_cos,HourBin,"(0, 4]","(4, 8]","(8, 12]","(12, 16]","(16, 20]","(20, 24]"
0,,,,False,0,11,1927598,0,19,0,...,12,1.224647e-16,-1.000000e+00,"(8, 12]",0,0,1,0,0,0
1,,,,False,0,11,1927598,0,19,0,...,2,5.000000e-01,8.660254e-01,"(0, 4]",1,0,0,0,0,0
2,,,,False,0,11,1927598,0,19,0,...,9,7.071068e-01,-7.071068e-01,"(8, 12]",0,0,1,0,0,0
3,,,,False,0,11,1927598,0,19,0,...,1,2.588190e-01,9.659258e-01,"(0, 4]",1,0,0,0,0,0
4,,,,False,0,11,1927598,0,19,0,...,3,7.071068e-01,7.071068e-01,"(0, 4]",1,0,0,0,0,0
5,,,,False,0,11,1927598,0,19,0,...,9,7.071068e-01,-7.071068e-01,"(8, 12]",0,0,1,0,0,0
6,,,,False,0,11,1927598,0,19,0,...,7,9.659258e-01,-2.588190e-01,"(4, 8]",0,1,0,0,0,0
7,,,,False,0,11,1927598,0,19,0,...,5,9.659258e-01,2.588190e-01,"(4, 8]",0,1,0,0,0,0
8,,,,False,0,11,1927598,0,19,0,...,6,1.000000e+00,6.123234e-17,"(4, 8]",0,1,0,0,0,0
9,,,,False,0,11,1927598,0,19,0,...,2,5.000000e-01,8.660254e-01,"(0, 4]",1,0,0,0,0,0


In [498]:
InstagramDataset = pd.concat([InstagramDataset,pd.get_dummies(InstagramDataset['business_category_name'])],axis=1)

In [500]:
InstagramDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20778 entries, 0 to 20777
Data columns (total 74 columns):
biography                                      20778 non-null object
business_category_name                         11020 non-null object
connected_fb_page                              0 non-null object
country_block                                  20778 non-null object
edge_felix_video_timeline                      20778 non-null object
edge_follow                                    20778 non-null object
edge_followed_by                               20778 non-null object
edge_media_collections                         20778 non-null object
edge_owner_to_timeline_media                   20778 non-null object
edge_saved_media                               20778 non-null object
external_url                                   16869 non-null object
external_url_linkshimmed                       16869 non-null object
full_name                                      20778 non-null objec

In [493]:
InstagramDataset.to_csv('InstagramDatasetAddedFeatures.csv', index=False)

In [501]:
InstagramDataset.to_csv('InstagramDatasetAddedFeaturesVersion3.csv', encoding='utf-8-sig', index=False)

In [38]:
InstagramDF=pd.read_csv('InstagramDatasetFinal.csv', encoding='utf-8-sig')

In [77]:
InstagramDF['username'].value_counts()

ana_lombardini          11
taylor_hill             11
mija_mija               11
officialslystallone     11
elaine_yiu              11
henrycavill             11
belle.delphine          11
soyandreazuniga         11
peytonlist              11
stevemccurryofficial    11
gavinoneillphoto        11
marcusbutler            11
d_degeaofficial         11
drewtrush               11
mosalah                 11
faheeym                 11
val.mercado             11
alejandroflrs           11
realdiddykong           11
lukehemmings            11
gururandhawa            11
laurenjauregui          11
soniatlevfitness        11
jacenorman              11
belenrodriguezreal      11
youngmeerim             11
flettemamma             11
nancyajram              11
gusttavolima            11
chloegmoretz            11
                        ..
juicewrld999            11
haileesteinfeld         11
meeeeeeeel_             11
giizeleoliveira         11
ashtonirwin             11
fcbarcelona             11
m

In [76]:
InstagramDF.loc[InstagramDF['username'] == "vegas_nay"]["edge_liked_by"]

19524     42072
19525      6419
19526     24822
19527     12808
19528     11768
19529     31966
19530      9827
19531    124318
19532     46520
19533     46476
19534     39021
Name: edge_liked_by, dtype: int64

In [74]:
InstagramDF.drop(InstagramDF.index[19524:19535], inplace=True)

In [75]:
InstagramDF = InstagramDF.reset_index(drop=True)

In [78]:
InstagramDF

Unnamed: 0,biography,business_category_name,connected_fb_page,country_block,edge_felix_video_timeline,edge_follow,edge_followed_by,edge_media_collections,edge_owner_to_timeline_media,edge_saved_media,...,Home Services,Lifestyle Services,Local Events,Non-Profits & Religious Organizations,Personal Goods & General Merchandise Stores,Professional Services,Publishers,Transportation & Accomodation Services,mean_likes,mean_comments
0,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
1,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
2,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
3,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
4,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
5,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
6,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
7,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
8,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727
9,,,,False,0,11,1927598,0,19,0,...,0,0,0,0,0,0,0,0,759069.545455,25191.272727


In [79]:
InstagramDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20668 entries, 0 to 20667
Data columns (total 76 columns):
biography                                      18721 non-null object
business_category_name                         10932 non-null object
connected_fb_page                              0 non-null float64
country_block                                  20668 non-null bool
edge_felix_video_timeline                      20668 non-null int64
edge_follow                                    20668 non-null int64
edge_followed_by                               20668 non-null int64
edge_media_collections                         20668 non-null int64
edge_owner_to_timeline_media                   20668 non-null int64
edge_saved_media                               20668 non-null int64
external_url                                   16781 non-null object
external_url_linkshimmed                       16781 non-null object
full_name                                      20340 non-null object
has_c

In [81]:
InstagramDF.to_csv('InstagramDatasetFinalVersion.csv', encoding='utf-8-sig', index=False)