# Data refurbish

## Read JSON

In [65]:
from json import load


# Load the repository records. JSON text is UTF-8, so the encoding is pinned
# rather than left to the platform default (which differs between systems).
with open('./data/result.json', 'r', encoding='utf-8') as f:
    result = load(f)

In [66]:
import pandas as pd

# Flatten every non-empty entry of `result` into a flat record and keep it as a
# pandas Series under the same key.
seriesDict = {}

for key, value in result.items():
    if value:
        # Nested counters are pulled up to top-level fields.
        newValue = {
            'nameWithOwner': value['nameWithOwner'],
            'createdAt': value['createdAt'],
            'updatedAt': value['updatedAt'],
            'stars': value['stargazers']['totalCount'],
            'releases': value['releases']['totalCount'],
            'isFork': value['isFork'],
            'forkCount': value['forkCount'],
            'commitCount': value['defaultBranchRef']['target']['history']['totalCount'],
            'description': value['description'],
            'diskUsage': value['diskUsage'],
        }
        series = pd.Series(newValue)
        seriesDict[key] = series

In [67]:
# A dict of Series yields one column per key; transpose so that each key
# becomes a row and each record field a column.
dataFrame = pd.DataFrame(seriesDict).T

## Change data types

In [68]:
# Column dtypes right after construction, before any casting.
dataFrame.dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars            object
releases         object
isFork           object
forkCount        object
commitCount      object
description      object
diskUsage        object
dtype: object

## Inspect labels and cast column types

In [69]:
# Row labels of the frame.
dataFrame.index

Index(['sa1_i3', 'GSA_seo', 'HHK1_AI', 'HR_term', 'burz_v8', 'f213_ca',
       'ii_iipi', 'nhu_ndb', 'res_R', 'rsms_tc',
       ...
       'apuentemedallia_toolsandtechniquesforvulnerabilityvalidation',
       'songsuoyuan_TheElementofStatisticalLearningPythonImplements',
       'stressandwellbeingclinic_stressandwellbeingclinicgithubio',
       'yath_android_prebuilts_gcc_linuxx8632_arm_armlinuxandroideabi46',
       'ordinarydeveloper_book_modern_c_plus_plus_programming_with_tdd_j_langr',
       'chandupatlas_android_prebuilts_gcc_linuxx8632_arm_armlinuxandroideabi47',
       'TamuGeoInnovation_TamuGeoInnovationCommonCoreGeoGeographicFeaturesParcels',
       'TRiggin_httpsgithubcomTRiggingooglejstemplateblobmasterjstemplate_jsunithtml',
       'F6HQZ_audiocontrolerdrivingI2Crotaryencodersdigitalpotentiometers24xLEDsbargraphsLCD',
       'ucdavisbioinformaticstraining_BioinformaticsGenomeAssemblyandAnalysisWorkshopPacBioand10xGenomics'],
      dtype='object', length=12717)

In [33]:
# Column labels of the frame.
dataFrame.columns

Index(['nameWithOwner', 'createdAt', 'updatedAt', 'stars', 'releases',
       'isFork', 'forkCount', 'commitCount', 'description', 'diskUsage'],
      dtype='object')

In [70]:
import numpy as np

# Target dtype for every column, passed to DataFrame.astype.
data_types = {
    'nameWithOwner': str,
    'createdAt': str,
    'updatedAt': str,
    'stars': np.int64,
    'releases': np.int64,
    'isFork': np.bool_,
    'forkCount': np.int64,
    'commitCount': np.int64,
    'description': str,
    'diskUsage': np.int64,
}

In [71]:
# Trial cast of a single column to check the per-column mapping form of astype.
d_types = dict(commitCount=np.int64)
dataFrame.astype(d_types).dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars            object
releases         object
isFork           object
forkCount        object
commitCount       int64
description      object
diskUsage        object
dtype: object

In [72]:
# Cast every column to its target dtype in a new frame (dataFrame is left untouched).
typed_df = dataFrame.astype(data_types)
# astype(str) renders a missing value as the literal text 'None'/'nan'. Blank
# those cells out again so a record without a description stays missing
# instead of carrying a fake string.
typed_df['description'] = typed_df['description'].where(dataFrame['description'].notna())
typed_df.dtypes

nameWithOwner    object
createdAt        object
updatedAt        object
stars             int64
releases          int64
isFork             bool
forkCount         int64
commitCount       int64
description      object
diskUsage         int64
dtype: object

## Map timestamps

In [73]:
from datetime import datetime


def convert_timestamp(timestamp):
    """Parse a ``YYYY-MM-DDTHH:MM:SSZ`` string into a ``datetime``.

    The trailing ``Z`` is matched as a literal character by the format
    string, so the returned object carries no timezone information.
    """
    return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')


# Quick check on a sample value.
convert_timestamp('2013-06-27T08:06:45Z')

datetime.datetime(2013, 6, 27, 8, 6, 45)

In [78]:
def to_timeseries(series):
    """Convert a Series of ``YYYY-MM-DDTHH:MM:SSZ`` strings to datetimes.

    Parsing is delegated to pandas' vectorised ``to_datetime`` with an
    explicit format instead of calling a Python function once per element.
    The trailing ``Z`` is a literal in the format, so the result is
    timezone-naive. Missing entries come back as ``NaT``.

    Parameters
    ----------
    series : pandas.Series
        Timestamp strings in the format above.

    Returns
    -------
    pandas.Series
        Datetime values carrying the index of ``series``.

    Raises
    ------
    ValueError
        If a non-missing entry does not match the format.
    """
    return pd.to_datetime(series, format='%Y-%m-%dT%H:%M:%SZ')
    

# Convert both timestamp columns. The dtypes displayed afterwards list
# createdAt as datetime64 as well, so converting only updatedAt here would not
# reproduce that output on a fresh kernel.
for column in ('createdAt', 'updatedAt'):
    typed_df[column] = to_timeseries(typed_df[column])

In [79]:
# Column dtypes after the timestamp conversion.
typed_df.dtypes

nameWithOwner            object
createdAt        datetime64[ns]
updatedAt        datetime64[ns]
stars                     int64
releases                  int64
isFork                     bool
forkCount                 int64
commitCount               int64
description              object
diskUsage                 int64
dtype: object