# Coletando dados do GitHub

In [1]:
import requests

In [2]:
url = 'https://api.github.com/search/repositories'
url += '?q=language:python+sort:stars+stars:>10000'
headers = {'Accept': 'application/vnd.github.v3+json'}

In [3]:
r = requests.get(url, headers = headers)
r.status_code

200

In [4]:
response = r.json()
response.keys()

dict_keys(['total_count', 'incomplete_results', 'items'])

## Criando Pandas DataFrame com os dados coletados

In [5]:
# Cria dicionario
d = {'id':[], 'data':[]}

# Popula dicionario com os dados da resposta da API
for item in response['items']:
    d[  'id'    ].append( item['id'] )
    d[  'data'  ].append( item )
    #d[  'data'  ].append( "{"+f"{item}"+"}" )


# Visualizar primeiros valores do dicionario
for i in range(2):
    print( d['data'][i] )

{'id': 54346799, 'node_id': 'MDEwOlJlcG9zaXRvcnk1NDM0Njc5OQ==', 'name': 'public-apis', 'full_name': 'public-apis/public-apis', 'private': False, 'owner': {'login': 'public-apis', 'id': 51121562, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjUxMTIxNTYy', 'avatar_url': 'https://avatars.githubusercontent.com/u/51121562?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/public-apis', 'html_url': 'https://github.com/public-apis', 'followers_url': 'https://api.github.com/users/public-apis/followers', 'following_url': 'https://api.github.com/users/public-apis/following{/other_user}', 'gists_url': 'https://api.github.com/users/public-apis/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/public-apis/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/public-apis/subscriptions', 'organizations_url': 'https://api.github.com/users/public-apis/orgs', 'repos_url': 'https://api.github.com/users/public-apis/repos', 'events_url': 'https://api.github.com/users/public

In [6]:
import pandas as pd

pdDF = pd.DataFrame.from_dict(d, orient='columns')
pdDF.head()

Unnamed: 0,id,data
0,54346799,"{'id': 54346799, 'node_id': 'MDEwOlJlcG9zaXRvc..."
1,83222441,"{'id': 83222441, 'node_id': 'MDEwOlJlcG9zaXRvc..."
2,21289110,"{'id': 21289110, 'node_id': 'MDEwOlJlcG9zaXRvc..."
3,63476337,"{'id': 63476337, 'node_id': 'MDEwOlJlcG9zaXRvc..."
4,123458551,"{'id': 123458551, 'node_id': 'MDEwOlJlcG9zaXRv..."


# Preparando ambiente para PySpark

#### Dependências para Python 3.10

In [7]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
!tar xf spark-3.4.0-bin-hadoop3.tgz
!pip install -q findspark

#### Configuração das Variáveis de Ambiente

In [8]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

#### Tornar PySpark Importável

In [9]:
# tornar o pyspark "importável"
import findspark
findspark.init('spark-3.4.0-bin-hadoop3')

## Iniciando Ambiente

In [10]:
# iniciar uma sessão local e importar dados do Airbnb
from pyspark.sql import SparkSession
sc = SparkSession.builder.master('local[*]').getOrCreate()

## Carregando Pandas DF para PySpark DataFrame

In [11]:
df = sc.createDataFrame(pdDF)

In [12]:
df.show(3)

+--------+--------------------+
|      id|                data|
+--------+--------------------+
|54346799|{allow_forking ->...|
|83222441|{allow_forking ->...|
|21289110|{allow_forking ->...|
+--------+--------------------+
only showing top 3 rows



### Transorma dados Map em Texto

In [24]:
from pyspark.sql.functions import col
from pyspark.sql.functions import cast
df = df.withColumn('data', col('data').cast('string'))

# Criando Classe para transformação

In [38]:
import json
from typing import TypeAlias
from dataclasses import dataclass
from pyspark.sql.functions import map_keys, map_values

In [45]:
@dataclass
class JsonCleaner:

    std_dict: dict
    target_column: str
    search_keys: TypeAlias = list[str]

    def has_keys(self, obj):
        print(map_keys(obj))
        return any( key in self.search_keys for key in map_keys(obj) )

    def append_values_to_std_dict(self, obj, keys):
        for key in keys:
            self.std_dict[key].append(obj[key])

    def search(self, obj):
        document = obj[self.target_column]
        if self.has_keys( document ):
            l = [ key if key in self.search_keys else None for key in map_keys(document) ]
        self.append_values_to_std_dict(document, l)
        return std_dict

    def __call__(self, data_frame):
        return data_frame.transform(self.search)

In [46]:
std_dict = {'name': [], 'owner': [], 'html_url': [], 'description':[]}
search_keys = ['name', 'owner', 'html_url', 'description']

json_cleaner = JsonCleaner( target_column = 'id', search_keys = search_keys, std_dict = std_dict)

In [47]:
df_cleaned = df.transform(json_cleaner)
df_cleaned.show(5)

Column<'map_keys(id)'>


TypeError: ignored