In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import random
import re
import os

%matplotlib inline
import matplotlib.pyplot as plt
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

From specifications and descriptions of the dataset we can infer the proper `dtype` for most of the columns:
- numerical : `int`,`float`
- text : `str`
- A low/finite number of values : `categorical`,`boolean`

In [21]:
class Dataset:
    def __init__(self, file:str):
        self.file_name : str = file
        self.types : dict = self.load_types()
        self.df : pd.DataFrame = None
        
    def name(self):
        return re.search(r'\/([\w\d]*)\.csv',self.file_name).group(1)
    
    def __types_file__(self):
        return 'kaggle/working/'+ self.name() +'.npy'
        
    def save_types(self):
        np.save(self.__types_file__(), self.types)
        
    def load_types(self):
        if os.path.isfile(self.__types_file__()):
            self.types = np.load(self.__types_file__(),allow_pickle='TRUE').item()
            return self.types
        return None
    
    def col(self,columns:list, index:bool=True,**pd_params)->pd.DataFrame:
        """Loads some columns of the dataframe out of the whole csv file
        
        :param columns: a list of the desired columns
        :type columns: list
        :param index: if True loads also the 'sid' column as the index
        :type index: bool
        :param **pd_params: any other params for pd.read_csv(...)
        :rtype: pd.DataFrame
        """
        col_list = columns
        if 'cts' in col_list:
            pd_params['parse_dates'] = ['cts']
        if index :
            col_list.append('sid')
        return pd.read_csv(self.file_name,
                           usecols=col_list,
                           dtype=self.types, 
                           index_col='sid' if index else None,
                           delimiter='\t', 
                           **pd_params)

datasets = [
    Dataset(file='../input/instagram-dataset/instagram_profiles.csv'), 
    Dataset(file='../input/instagram-dataset/instagram_locations.csv'), 
    Dataset(file='../input/instagram-dataset/instagram_posts.csv')
]
profiles, locations, posts = datasets

In [22]:
profiles.types = {
#     'sid' : 'Int64',
    'profile_id' : 'Int64',
    'profile_name' : pd.StringDtype(storage='pyarrow'),
    'firstname_lastname' : pd.StringDtype(storage='pyarrow'),
    'description' : pd.StringDtype(storage='pyarrow'),
    'following' : 'Int64',
    'followers' : 'Int64',
    'n_posts' : 'Int64',
    'url' : pd.StringDtype(storage='pyarrow'),
#     'cts' : pd.DatetimeTZDtype(tz='UTC'), #obtained through parse_dates=['cts']
    'is_business_account' : pd.BooleanDtype()
}
posts.types = {
#     'sid' : 'Int64', # gives an error, maybe because it's the index?
    'sid_profile' : 'Int64',
    'post_id' : pd.StringDtype(storage='pyarrow'),
    'profile_id' : 'Int64',
    'location_id' : 'Int64',
    'description' : pd.StringDtype(storage='pyarrow'),
    'post_type' : pd.CategoricalDtype(categories=[1,2,3]),
    'numbr_likes' : 'Int64',
    'number_comments' : 'Int64'
}
locations.types = {
#     'sid' : 'Int64', # gives an error, maybe because it's the index?
    'id' : 'Int64',
    'name' : pd.StringDtype(storage='pyarrow'),
    'street' : pd.StringDtype(storage='pyarrow'),
    'zip' : pd.StringDtype(storage='pyarrow'),
    'city' : pd.StringDtype(storage='pyarrow'),
    'region' : pd.StringDtype(storage='pyarrow'),
    'cd' : pd.CategoricalDtype(),
    'phone' : pd.StringDtype(storage='pyarrow'),
    'aj_exact_city_match' : pd.BooleanDtype(),
    'aj_exact_country_match' : pd.BooleanDtype(),
    'blurb' : pd.StringDtype(storage='pyarrow'),
    'dir_city_id' : pd.StringDtype(storage='pyarrow'),
    'dir_city_name' : pd.StringDtype(storage='pyarrow'),
    'dir_city_slug' : pd.StringDtype(storage='pyarrow'),
    'dir_country_id' : pd.CategoricalDtype(),
    'dir_country_name' : pd.CategoricalDtype(),
    'lat' : pd.Float32Dtype(),
    'lng' : pd.Float32Dtype(),
    'primary_alias_on_fb' : pd.StringDtype(storage='pyarrow'),
    'slug' : pd.StringDtype(storage='pyarrow'),
    'website' : pd.StringDtype(storage='pyarrow'),
}

In [23]:
c = locations.col(['lat', 'lng', 'cts'], index=False, nrows=10)
c

Unnamed: 0,lat,lng,cts
0,-0.139475,38.974392,2019-05-29 01:21:29.987
1,18.466667,47.866669,2019-05-29 01:21:38.037
2,85.330147,27.70196,2019-05-29 01:21:46.295
3,2.375995,48.82724,2019-05-29 01:21:54.355
4,-118.341866,34.153267,2019-04-02 15:22:55.703
5,-121.621552,37.925411,2019-04-02 15:22:55.367
6,103.94973,1.355203,2019-05-29 01:21:56.635
7,150.694366,-33.75103,2019-05-29 01:22:12.909
8,100.073586,15.329776,2019-05-29 01:22:27.749
9,23.766262,61.495689,2019-05-29 01:22:35.936


In [6]:
locations.types

{'id': 'Int64',
 'name': string[pyarrow],
 'street': string[pyarrow],
 'zip': string[pyarrow],
 'city': string[pyarrow],
 'region': string[pyarrow],
 'cd': CategoricalDtype(categories=None, ordered=False),
 'phone': string[pyarrow],
 'aj_exact_city_match': BooleanDtype,
 'aj_exact_country_match': BooleanDtype,
 'blurb': string[pyarrow],
 'dir_city_id': string[pyarrow],
 'dir_city_name': string[pyarrow],
 'dir_city_slug': string[pyarrow],
 'dir_country_id': CategoricalDtype(categories=None, ordered=False),
 'dir_country_name': CategoricalDtype(categories=None, ordered=False),
 'lat': Float32Dtype(),
 'lng': Float32Dtype(),
 'primary_alias_on_fb': string[pyarrow],
 'slug': string[pyarrow],
 'website': string[pyarrow]}

In [7]:
def get_types(signed=True, unsigned=True, custom=[]):
    '''Returns a pandas dataframe containing the boundaries of each integer dtype'''
    # based on https://stackoverflow.com/a/57894540/9419492
    pd_types = custom
    if signed:
        pd_types += [pd.Int8Dtype() ,pd.Int16Dtype() ,pd.Int32Dtype(), pd.Int64Dtype()]
    if unsigned:
        pd_types += [pd.UInt8Dtype() ,pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()]
    type_df = pd.DataFrame(data=pd_types, columns=['pd_type'])
    type_df['np_type'] = type_df['pd_type'].apply(lambda t: t.numpy_dtype)
    type_df['min_value'] = type_df['np_type'].apply(lambda row: np.iinfo(row).min)
    type_df['max_value'] = type_df['np_type'].apply(lambda row: np.iinfo(row).max)
    type_df['allow_negatives'] = type_df['min_value'] < 0
    type_df['size'] = type_df['np_type'].apply(lambda row: row.itemsize)
    type_df.sort_values(by=['size', 'allow_negatives'], inplace=True)
    return type_df.reset_index(drop=True)
    
get_types()

Unnamed: 0,pd_type,np_type,min_value,max_value,allow_negatives,size
0,UInt8,uint8,0,255,False,1
1,Int8,int8,-128,127,True,1
2,UInt16,uint16,0,65535,False,2
3,Int16,int16,-32768,32767,True,2
4,UInt32,uint32,0,4294967295,False,4
5,Int32,int32,-2147483648,2147483647,True,4
6,UInt64,uint64,0,18446744073709551615,False,8
7,Int64,int64,-9223372036854775808,9223372036854775807,True,8


In [8]:
def downcast_int(file_path, column:str, chunksize=10000, delimiter=',', signed=True, unsigned=True):
    '''Assigns the smallest possible dtype to an integer column of a csv'''
    types = get_types(signed, unsigned)
    negatives = False
    print(delimiter)
    for chunk in pd.read_csv(file_path, 
                             usecols=[column],
                             delimiter=delimiter,
                             chunksize=chunksize):
        M = chunk[column].max()
        m = chunk[column].min()
        if not signed and not negatives and m < 0 :
            types = types[types['allow_negatives']] # removes unsigned rows
            negatives = True
        if m < types['min_value'].iloc[0]:
            types = types[types['min_value'] < m]
        if M > types['max_value'].iloc[0]:
            types = types[types['max_value'] > M]
        if len(types) == 1:
            print('early stop')
            break
    return types['pd_type'].iloc[0]

def optimize_cgols(file, int_cols, delimiter=',', signed=True, unsigned=True):
    out = dict()
    for col in int_cols:
        out[col] = downcast_int(file, col, delimiter=delimiter, signed=signed, unsigned=unsigned)
    return out

In [24]:
for ds in datasets:
    if not ds.load_types():
        int_cols = [k for k,v in ds.types.items() if v == 'Int64']
        print(int_cols)
        ds.types.update(optimize_cols(ds.file_name, int_cols, delimiter='\t'))
        print(f'Optimized {len(int_cols)} types for {ds.name()}')
#     ds.types.update({k:pd.StringDtype(storage='pyarrow') for k,v in ds.types.items() if v==pd.StringDtype(storage='pyarrow')})
    ds.save_types()
#     print(ds.types)

['profile_id', 'following', 'followers', 'n_posts']
	
	
	
	
Optimized 4 types for instagram_profiles


FileNotFoundError: [Errno 2] No such file or directory: 'kaggle/working/instagram_profiles.npy'

In [12]:
SAMPLE_SIZE = 10000
for ds in datasets:
    ds.df = pd.read_csv(ds.file_name, dtype=ds.types, index_col='sid', delimiter='\t', parse_dates=['cts'], nrows=SAMPLE_SIZE)
    avg_mem_unoptimized = pd.read_csv(ds.file_name, index_col='sid', delimiter='\t', nrows=SAMPLE_SIZE).memory_usage(deep=True).sum()/SAMPLE_SIZE
    avg_mem_optimized = ds.df.memory_usage(deep=True).sum()/SAMPLE_SIZE
    print(f'{ds.name().ljust(19)} mean optimized memory usage per entry:  {round(avg_mem_optimized):3} B vs {round(avg_mem_unoptimized):4} B  : {round(avg_mem_optimized/avg_mem_unoptimized*100,2):5}%') 

instagram_profiles  mean optimized memory usage per entry:  162 B vs  542 B  : 29.93%
instagram_locations mean optimized memory usage per entry:  269 B vs 1117 B  : 24.11%
instagram_posts     mean optimized memory usage per entry:  285 B vs  752 B  : 37.96%


In [14]:
# Load test:  fails even with optimizing can't load everything in 16GB
# for ds in datasets:
#     ds.df = pd.read_csv(ds.file_name, dtype=ds.types, index_col='sid', delimiter='\t', parse_dates=['cts'])
#     print(f'Loaded {ds.name().ljust(19)}, size = {round(ds.df.memory_usage(deep=True).sum()/ 1024**3,2)}GB')
#     del ds.df
%timeit profiles.col(['n_posts']).head()

13.5 s ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Exploratory Data Analysis [RQ1]

### Profiles 
| Field | Description |
|---|---|
| SID | Sequence ID |
| profile_id | Instagrams ID |
| profile_name | profile name |
| firstname_lastname | firstname lastname |
| description | May contain '\n' |
| following | Number of following profile at the moment it was visited |
| followers | Number of followers at the moment it was visited |
| n_posts | Number of posts at the moment it was visited |
| url | Url in profile description |
| cts | Timestamp when the profile was visited |
| is_business_account | Boolean flag if that profile was makred by the owner as business account |

In [None]:
profiles.df.head()

In [None]:
profiles.df[['following', 'followers', 'n_posts']].describe()

In [None]:
profiles.df.info()

### Locations

| Field | Description |
|---|---|
| SID | Sequence ID |
| ID | Instagrams ID |
| Name | Locations Name |
| Street | Street Address, may contain '\n' |
| ZIP | Zip code |
| City | City Name |
| Region | Region |
| CD | Country Code |
| Phone | The phone in format as on the Instragram |
| aj_exact_city_match | The Instagrams Internal key |
| aj_exact_country_match | The Instagrams Internal key |
| blurb | Description of the place, may contain '\n' |
| dir_city_id | The Instagrams internal City ID |
| dir_city_name | city Name |
| dir_city_slug | City tag (sortof) |
| dir_country_id | Country ID |
| dir_country_name | country |
| lat | Latitude |
| lng | Longtitude |
| primary_alias_on_fb | Bool Flag |
| slug | ??? |
| website | The URL to web site, may contain more then 1 URL, may contain '\n' |
| cts | Timestamp when the location was visited |

In [None]:
locations.df.iloc[:,:10].head()

In [None]:
locations.df.iloc[:,10:].head()

In [None]:
locations.df.describe()
# it's useless

In [None]:
locations.df.info()

### Posts
| Field | Description |
|---|---|
| SID | Sequence ID |
| sid_profile | Sequence ID of the profile from *Profiles* table |
| post_id | Instagrams ID |
| profile_id | Instagrams ID may be null |
| location_id | Instagrams ID |
| cts | Timestamp when the Post was created |
| post_type | 1 - Photo, 2 - Video, 3 - multy |
| description | May contain '\n' |
| number_likes | Number of Likes at the moment it was visited |
| number_comments | Number of comments at the moment it was visited |

In [None]:
# posts.columns = posts.columns.str.replace('numbr', 'number')
posts.df.head()

In [None]:
posts.df[['number_likes','number_comments']].describe()

In [None]:
posts.df.info()

In [None]:
# # Chunking 
# for chunk in pandas.read_csv("FILE.csv", chunksize=1000):
#     pass

categorical## Before Starting
We note that this dataset has many odd values, with many columns having missing values that we'll have to deal somehow

Furthermore we need to adjust all those columns having datetime objects not actually being represented as objects

In [None]:
# done above

In [None]:
def count_problematic(df):
    at_least_one_null = len(df[pd.isnull(df).any(axis=1)])
    return at_least_one_null, len(df) - at_least_one_null

for i, df in enumerate([posts, locations, profiles]):
    plt.subplot(1,3,i+1)
#     plt.title(name)
    plt.pie(count_problematic(df), labels=['missing values', ''])

# [RQ2] 

#### Plot the number of posts for each profile in descending order.

In [None]:
profiles.df.sort_values(by='n_posts', ascending=False)[['profile_name', 'n_posts']]

#### What posts have the most number of likes?

In [None]:
posts.df.sort_values(by='numbr_likes', ascending=False).head(10)

#### What posts have the most and the least number of comments?

In [None]:
posts.df.sort_values(by='number_comments', ascending=False).head(10)

In [None]:
posts.df.sort_values(by='number_comments', ascending=True).head(10)

#### How many posts include tagged locations, and how many do not? Show it using an appropriate chart and comment your results.

In [None]:
null_locations = sum(posts['location_id'].isna())
plt.pie([null_locations, len(posts.df)-null_locations], labels=['null', 'tagged'])

#### How many posts include only photos? How many also have videos?

In [None]:
def categorize(mapping, values):
    res = pd.DataFrame.from_dict(mapping, orient='index', columns=['count'])
    for k,v in mapping.items():
        res.loc[k] = sum(values == v)
    res.loc['NA'] = sum(values.isna())
    if res.loc['NA'].sum() == 0:
        return res.loc[mapping.keys()]
    else:
        return res

In [None]:
post_types = posts['post_type'].value_counts(dropna=False)
sns.barplot(y=post_types, x=post_types.index).set(title='Post types')

#### What's the percentage of business accounts vs non-business? What can you interpret regarding that percentage?

In [None]:
business_accounts = profiles.df['is_business_account'].value_counts(dropna=False)
sns.barplot(x=[True, False, '<NA>'], y=business_accounts).set(title='Business Accounts')

### What is the most common time in which users publish their posts?

In [47]:
time = posts.col(['cts'], index=False, nrows=10000)

0      2017-08-06 21:00:00
1      2017-06-19 12:00:00
2      2017-03-11 21:00:00
3      2016-09-13 18:00:00
4      2015-10-18 12:00:00
               ...        
9995   2019-03-10 15:00:00
9996   2019-03-09 15:00:00
9997   2019-03-03 21:00:00
9998   2019-02-26 21:00:00
9999   2018-11-06 15:00:00
Name: cts, Length: 10000, dtype: datetime64[ns]

In [32]:
hours_type = pd.CategoricalDtype(categories.range(0,25))
time['hour'] = time['cts'].map(lambda x: x.hour)

In [41]:
time['hour'].value_counts()

17    598
19    589
20    586
18    573
15    566
21    548
14    528
16    511
13    456
22    432
12    424
23    418
0     369
11    368
10    367
9     349
1     339
3     327
8     301
4     297
2     283
5     269
7     253
6     249
Name: hour, dtype: int64

In [71]:
def intervals(h):
    if h >= 0 and h < 3:
        return 6
    elif h < 6:
        return 5
    elif h < 11:
        return 0
    elif h < 14:
        return 1
    elif h < 17:
        return 2
    elif h < 20:
        return 3
    elif h < 24:
        return 4
    return -1

intervals(23)

4