# Coletando dados do GitHub

In [1]:
import requests

In [2]:
url = 'https://api.github.com/search/repositories'
url += '?q=language:python+sort:stars+stars:>10000'
headers = {'Accept': 'application/vnd.github.v3+json'}

In [3]:
r = requests.get(url, headers = headers)
r.status_code

200

In [4]:
response = r.json()
response.keys()

dict_keys(['total_count', 'incomplete_results', 'items'])

## Criando Pandas DataFrame com os dados coletados

In [17]:
# Cria dicionario
d = {'id':[], 'type':[], 'name': []}

# Popula dicionario com os dados da resposta da API
for item in response['items']:
    d[  'id'    ].append( item['id'] )
    d[  'type'  ].append( item['private'] )
    d[  'name'  ].append( item['name'])




In [18]:
import pandas as pd

pdDF = pd.DataFrame.from_dict(d, orient='columns')
pdDF.head()

Unnamed: 0,id,type,name
0,54346799,False,public-apis
1,83222441,False,system-design-primer
2,21289110,False,awesome-python
3,63476337,False,Python
4,123458551,False,Python-100-Days


# Preparando ambiente para PySpark

#### Dependências para Python 3.10

In [7]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
!tar xf spark-3.4.0-bin-hadoop3.tgz
!pip install -q findspark

#### Configuração das Variáveis de Ambiente

In [8]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

#### Tornar PySpark Importável

In [9]:
# tornar o pyspark "importável"
import findspark
findspark.init('spark-3.4.0-bin-hadoop3')

## Iniciando Ambiente

In [10]:
# iniciar uma sessão local e importar dados do Airbnb
from pyspark.sql import SparkSession
sc = SparkSession.builder.master('local[*]').getOrCreate()

## Carregando Pandas DF para PySpark DataFrame

In [19]:
df = sc.createDataFrame(pdDF)

In [20]:
df.show(3)

+--------+-----+--------------------+
|      id| type|                name|
+--------+-----+--------------------+
|54346799|false|         public-apis|
|83222441|false|system-design-primer|
|21289110|false|      awesome-python|
+--------+-----+--------------------+
only showing top 3 rows



In [45]:
df_transposed = df.groupBy('name').pivot('type').agg(F.first(F.array('id'))[0])

df_transposed.show(5)

+--------------------+--------+
|                name|   false|
+--------------------+--------+
|        scikit-learn|  843222|
|         public-apis|54346799|
|system-design-primer|83222441|
|                core|12888993|
|               manim|32689863|
+--------------------+--------+
only showing top 5 rows



In [43]:
import pyspark.sql.functions as F

# Transpose the DataFrame
df_group = df.groupBy('type').agg(F.count('*').alias('qtd')).orderBy('qtd')

# Show the transposed DataFrame
df_group.show(3)


+-----+---+
| type|qtd|
+-----+---+
|false| 30|
+-----+---+



In [44]:
df_transposed = df_group.groupBy('qtd').pivot('type').agg(F.first(F.array('qtd')))
df_transposed.show(5)

+---+-----+
|qtd|false|
+---+-----+
| 30| [30]|
+---+-----+



In [40]:
df_transposed.select(F.arrays_zip(*df_transposed.columns).alias('az')).selectExpr('inline(az)').show(5)

AnalysisException: ignored