# Data refurbish

## Read JSON

In [111]:
from json import load


with open('./data/result.json', 'r') as f:
    result = load(f)

In [97]:
import pandas as pd

seriesDict = {}

for key, value in result.items():
    if not value:
        continue
    newValue = {}
    newValue['nameWithOwner'] = value['nameWithOwner']
    newValue['createdAt'] = value['createdAt']
    newValue['updatedAt'] = value['updatedAt']
    newValue['stars'] = value['stargazers']['totalCount']
    newValue['releases'] = value['releases']['totalCount']
    newValue['isFork'] = value['isFork']
    newValue['forkCount'] = value['forkCount']
    newValue['commitCount'] = value['defaultBranchRef']['target']['history']['totalCount']
    newValue['description'] = value['description']
    newValue['diskUsage'] = value['diskUsage']
    newValue['watchers'] = value['watchers']['totalCount']
    series = pd.Series(newValue)
    seriesDict[key] = series

In [98]:
dataFrame = pd.DataFrame(seriesDict)
dataFrame = dataFrame.T

## Change datatype 

In [99]:
dataFrame.dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars            object
releases         object
isFork           object
forkCount        object
commitCount      object
description      object
diskUsage        object
watchers         object
dtype: object

## Map timestamps

In [100]:
dataFrame.index

Index(['sa1_i3', 'GSA_seo', 'HHK1_AI', 'HR_term', 'burz_v8', 'f213_ca',
       'ii_iipi', 'nhu_ndb', 'res_R', 'rsms_tc',
       ...
       'apuentemedallia_toolsandtechniquesforvulnerabilityvalidation',
       'songsuoyuan_TheElementofStatisticalLearningPythonImplements',
       'stressandwellbeingclinic_stressandwellbeingclinicgithubio',
       'yath_android_prebuilts_gcc_linuxx8632_arm_armlinuxandroideabi46',
       'ordinarydeveloper_book_modern_c_plus_plus_programming_with_tdd_j_langr',
       'chandupatlas_android_prebuilts_gcc_linuxx8632_arm_armlinuxandroideabi47',
       'TamuGeoInnovation_TamuGeoInnovationCommonCoreGeoGeographicFeaturesParcels',
       'TRiggin_httpsgithubcomTRiggingooglejstemplateblobmasterjstemplate_jsunithtml',
       'F6HQZ_audiocontrolerdrivingI2Crotaryencodersdigitalpotentiometers24xLEDsbargraphsLCD',
       'ucdavisbioinformaticstraining_BioinformaticsGenomeAssemblyandAnalysisWorkshopPacBioand10xGenomics'],
      dtype='object', length=12716)

In [101]:
dataFrame.columns

Index(['nameWithOwner', 'createdAt', 'updatedAt', 'stars', 'releases',
       'isFork', 'forkCount', 'commitCount', 'description', 'diskUsage',
       'watchers'],
      dtype='object')

In [102]:
import numpy as np

data_types = {'nameWithOwner': str, 'createdAt': str, 'updatedAt': str, 'stars': np.int64,
              'releases': np.int64, 'isFork': np.bool_, 'forkCount': np.int64, 'commitCount': np.int64,
              'description': str, 'diskUsage': np.int64}

In [103]:
d_types = {'commitCount': np.int64}
dataFrame.astype(d_types).dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars            object
releases         object
isFork           object
forkCount        object
commitCount       int64
description      object
diskUsage        object
watchers         object
dtype: object

In [104]:
typed_df = dataFrame.astype(data_types)
typed_df.dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars             int64
releases          int64
isFork             bool
forkCount         int64
commitCount       int64
description      object
diskUsage         int64
watchers         object
dtype: object

## Map timestamps

In [105]:
from datetime import datetime


def convert_timestamp(timestamp):
    date = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')
    return date
    
    
convert_timestamp('2013-06-27T08:06:45Z')

datetime.datetime(2013, 6, 27, 8, 6, 45)

In [106]:
def to_timeseries(series):
    return series.map(convert_timestamp)
    

typed_df['updatedAt'] = to_timeseries(typed_df['updatedAt'])

In [107]:
typed_df.dtypes

nameWithOwner            object
createdAt                object
updatedAt        datetime64[ns]
stars                     int64
releases                  int64
isFork                     bool
forkCount                 int64
commitCount               int64
description              object
diskUsage                 int64
watchers                 object
dtype: object

In [110]:
typed_df.head()

Unnamed: 0,nameWithOwner,createdAt,updatedAt,stars,releases,isFork,forkCount,commitCount,description,diskUsage,watchers
sa1_i3,sa1/i3,2013-06-27T08:06:45Z,2013-12-24 17:27:25,0,0,False,0,4232,Mirror of i3 wm. http://i3wm.org/,6592,1
GSA_seo,GSA/seo,2015-02-02T19:01:42Z,2017-10-13 06:40:32,2,0,False,3,4,Resources and Materials for Search Engine Opti...,111,7
HHK1_AI,HHK1/AI,2015-08-27T17:33:06Z,2015-09-01 19:27:16,2,0,False,2,47,Memoire HEC,3125,2
HR_term,HR/term,2016-01-30T11:17:00Z,2017-08-17 15:58:52,3,1,False,0,38,A Terminal helper utility (for macOS),17,0
burz_v8,burz/v8,2013-12-18T19:58:52Z,2017-10-06 20:25:09,3,0,False,0,33,v8 - A game engine,196,1
