In [1]:
import pandas as pd
import requests

url = "https://drive.google.com/uc?id=1nfktbI7ucHOUwO6EHD2pPHvOf1hN9nmZ"

destination_file = "videogame.csv"
response = requests.get(url)

if response.status_code == 200:
    # Save the content of the file to a local file
    with open(destination_file, "wb") as file:
        file.write(response.content)
    print(f"The file {destination_file} has been successfully downloaded.")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

# Load the CSV file into a DataFrame
df = pd.read_csv(destination_file)

print(df.head())

The file videogame.csv has been successfully downloaded.
                           Nombre  3DS   PC  PS3  PSP  PSV X360  PS4  XOne  \
0                          ZombiU  NaN  NaN  NaN  NaN  NaN  NaN  PS4   NaN   
1             Zombie Army Trilogy  NaN  NaN  NaN  NaN  NaN  NaN  PS4   NaN   
2             Zombie Army Trilogy  NaN  NaN  NaN  NaN  NaN  NaN  NaN  XOne   
3  Zero Escape: Zero Time Dilemma  3DS  NaN  NaN  NaN  NaN  NaN  NaN   NaN   
4  Zero Escape: Zero Time Dilemma  NaN  NaN  NaN  NaN  PSV  NaN  NaN   NaN   

    Año                Genero-Editorial  Unnamed2 Ventas NA Ventas EU  \
0  2016                  Action-Ubisoft       NaN         0      0,04   
1  2015  Shooter-Rebellion Developments       NaN      0,04      0,12   
2  2015  Shooter-Rebellion Developments       NaN      0,04      0,05   
3  2016           Adventure-Aksys Games       NaN      0,04         0   
4  2016           Adventure-Aksys Games       NaN      0,03         0   

  Ventas JP Ventas Otros Ventas Glo

In [2]:
df.describe()

Unnamed: 0,Año,Unnamed2
count,955.0,0.0
mean,2015.383246,
std,0.492855,
min,2015.0,
25%,2015.0,
50%,2015.0,
75%,2016.0,
max,2017.0,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955 entries, 0 to 954
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Nombre            955 non-null    object 
 1   3DS               125 non-null    object 
 2   PC                91 non-null     object 
 3   PS3               109 non-null    object 
 4   PSP               3 non-null      object 
 5   PSV               182 non-null    object 
 6   X360              53 non-null     object 
 7   PS4               257 non-null    object 
 8   XOne              135 non-null    object 
 9   Año               955 non-null    int64  
 10  Genero-Editorial  955 non-null    object 
 11  Unnamed2          0 non-null      float64
 12  Ventas NA         955 non-null    object 
 13  Ventas EU         955 non-null    object 
 14  Ventas JP         955 non-null    object 
 15  Ventas Otros      955 non-null    object 
 16  Ventas Global     955 non-null    object 
dt

In [4]:
df.head()

Unnamed: 0,Nombre,3DS,PC,PS3,PSP,PSV,X360,PS4,XOne,Año,Genero-Editorial,Unnamed2,Ventas NA,Ventas EU,Ventas JP,Ventas Otros,Ventas Global
0,ZombiU,,,,,,,PS4,,2016,Action-Ubisoft,,0,4,1,1,5
1,Zombie Army Trilogy,,,,,,,PS4,,2015,Shooter-Rebellion Developments,,4,12,0,3,2
2,Zombie Army Trilogy,,,,,,,,XOne,2015,Shooter-Rebellion Developments,,4,5,0,1,1
3,Zero Escape: Zero Time Dilemma,3DS,,,,,,,,2016,Adventure-Aksys Games,,4,0,1,1,6
4,Zero Escape: Zero Time Dilemma,,,,,PSV,,,,2016,Adventure-Aksys Games,,3,0,2,1,6


In [5]:
duplicate_rows = df.duplicated(keep='first')  # keep='first' marks all but the first occurrence as duplicates

# Display duplicate rows
if duplicate_rows.any():
    duplicate_rows_indices = df[duplicate_rows].index
    print(f"Duplicate rows found at indices: {duplicate_rows_indices}")
else:
    print("No duplicate rows found.")

Duplicate rows found at indices: Index([ 36,  73, 101, 116, 141, 165, 172, 185, 195, 271, 289, 340, 356, 388,
       436, 441, 444, 454, 513, 545, 546, 555, 562, 568, 639, 651, 661, 663,
       684, 722, 754, 826, 844, 852, 908, 937],
      dtype='int64')


In [6]:
""" 
Observed features:
1 A null column exists.
2 There are 36 duplicated rows
3 There are columns that are ordered in long format.
4 There is a concatenated column "Genero-Editorial"
4 Columns that have prices and numerical values are loaded as objects, not floats.
 """

' \nObserved features:\n1 A null column exists.\n2 There are 36 duplicated rows\n3 There are columns that are ordered in long format.\n4 There is a concatenated column "Genero-Editorial"\n4 Columns that have prices and numerical values are loaded as objects, not floats.\n '

In [7]:
# 1 A null column exists/ we remove null columns
df = df.dropna(axis=1, how='all')

In [8]:
#1 There are 36 duplicated rows/ we remove duplicates
df = df.drop_duplicates()

In [9]:
df.columns

Index(['Nombre', '3DS', 'PC', 'PS3', 'PSP', 'PSV', 'X360', 'PS4', 'XOne',
       'Año', 'Genero-Editorial', 'Ventas NA', 'Ventas EU', 'Ventas JP',
       'Ventas Otros', 'Ventas Global'],
      dtype='object')

In [10]:
#3 There are columns that are ordered in long format /generate code to change the long format column to short, 1º one by one 
df_3DS = df.dropna(subset=['3DS'])
df_3DS = df_3DS.dropna(axis=1, how='all')
df_3DS=df_3DS.rename(columns={'3DS': 'Plataforma'})

df_3DS.head()

Unnamed: 0,Nombre,Plataforma,Año,Genero-Editorial,Ventas NA,Ventas EU,Ventas JP,Ventas Otros,Ventas Global
3,Zero Escape: Zero Time Dilemma,3DS,2016,Adventure-Aksys Games,4,0,1,1,6
8,Yowamushi Pedal,3DS,2015,Action-Namco Bandai Games,0,0,4,0,4
13,Yokai Watch Busters,3DS,2015,Action-Level 5,0,0,228,0,228
14,Yokai Watch 3,3DS,2016,Action-Level 5,0,0,127,0,127
15,Yokai Sangokushi,3DS,2016,Action-Level 5,0,0,55,0,55


In [11]:
#3 There are columns that are ordered in long format / there are 8 columns in this format, for this we made a function
def Long_to_Short(df): #Defino mi función
    dataframes = {}  # Genero un diccionario
    dataframe_names = []  # Lista para almacenar los nombres de los DataFrames
        
    i = 1

    for columna in df.columns:
        if df[columna].isnull().any():
            df_temp = df.dropna(subset=[columna])
            df_temp = df_temp.dropna(axis=1, how='all')
            df_temp = df_temp.rename(columns={columna: 'Plataforma'})

            df_name = f'DF_{i}'  # Nombre único para el DataFrame
            dataframes[df_name] = df_temp
            dataframe_names.append(df_name)  # Agregar el nombre a la lista
            i += 1  # Incrementar el contador

    return dataframes, dataframe_names

In [12]:
dataframes, dataframe_names = Long_to_Short(df)

dataframes_list = []

for df_name in dataframe_names:
    df_temp = dataframes[df_name]
    dataframes_list.append(df_temp)

# Concatena los DataFrames de manera vertical
df2 = pd.concat(dataframes_list, axis=0)

In [13]:
df2

Unnamed: 0,Nombre,Plataforma,Año,Genero-Editorial,Ventas NA,Ventas EU,Ventas JP,Ventas Otros,Ventas Global
3,Zero Escape: Zero Time Dilemma,3DS,2016,Adventure-Aksys Games,004,0,001,001,006
8,Yowamushi Pedal,3DS,2015,Action-Namco Bandai Games,0,0,004,0,004
13,Yokai Watch Busters,3DS,2015,Action-Level 5,0,0,228,0,228
14,Yokai Watch 3,3DS,2016,Action-Level 5,0,0,127,0,127
15,Yokai Sangokushi,3DS,2016,Action-Level 5,0,0,055,0,055
...,...,...,...,...,...,...,...,...,...
912,Assassin's Creed Syndicate,XOne,2015,Action-Ubisoft,061,056,0,011,128
917,Assassin's Creed Chronicles,XOne,2016,Action-Ubisoft,0,001,0,0,001
918,Arslan: The Warriors of Legend,XOne,2016,Action-Tecmo Koei,001,0,0,0,002
942,Adventure Time: Finn & Jake Investigations,XOne,2015,Action-Little Orbit,002,002,0,0,005


In [14]:
#4 There is a concatenated column/ We separate it using right split, because there is a type of genre that uses a - on the left side.
df2[['Genero', 'Editorial']] = df2['Genero-Editorial'].str.rsplit('-', n=1, expand=True)
df2 = df2.drop('Genero-Editorial', axis=1)
df2.head()


Unnamed: 0,Nombre,Plataforma,Año,Ventas NA,Ventas EU,Ventas JP,Ventas Otros,Ventas Global,Genero,Editorial
3,Zero Escape: Zero Time Dilemma,3DS,2016,4,0,1,1,6,Adventure,Aksys Games
8,Yowamushi Pedal,3DS,2015,0,0,4,0,4,Action,Namco Bandai Games
13,Yokai Watch Busters,3DS,2015,0,0,228,0,228,Action,Level 5
14,Yokai Watch 3,3DS,2016,0,0,127,0,127,Action,Level 5
15,Yokai Sangokushi,3DS,2016,0,0,55,0,55,Action,Level 5


In [15]:
# check the columns split
valores_unicos_genero = df2['Genero'].unique()
valores_unicos_editorial = df2['Editorial'].unique()

# Imprime los valores únicos en 'Genero' y 'Editorial'
print("Valores únicos en 'Genero':")
print(valores_unicos_genero)

print("Valores únicos en 'Editorial':")
print(valores_unicos_editorial)

Valores únicos en 'Genero':
['Adventure' 'Action' 'Role-Playing' 'Misc' 'Simulation' 'Platform'
 'Strategy' 'Puzzle' 'Fighting' 'Sports' 'Strategy-Take' 'Racing'
 'Shooter' 'Sports-Take' 'Action-Take' 'Shooter-Take']
Valores únicos en 'Editorial':
['Aksys Games' 'Namco Bandai Games' 'Level 5' 'Nintendo'
 'Avanquest Software' 'Square Enix' 'Activision' 'FuRyu' '505 Games'
 'Nippon Columbia' 'Marvelous Interactive' 'Atlus' 'Happinet' 'Sega'
 'Yacht Club Games' 'Deep Silver' 'Intergrow' 'Tecmo Koei'
 'Nippon Ichi Software' 'Marvelous Entertainment' 'GungHo' 'Takara Tomy'
 'Koch Media' 'mixi, Inc' 'Capcom' 'Little Orbit' 'Rocket Company'
 'Warner Bros. Interactive Entertainment' 'Extreme Entertainment Group'
 'Screenlife' 'Unknown' 'Ubisoft' 'WayForward Technologies' 'Alchemist'
 'Arc System Works' 'Shogakukan' '5pb' 'Inti Creates' 'Two Interactive'
 'Bethesda Softworks' 'inXile Entertainment' 'Focus Home Interactive'
 'Electronic Arts' 'Rondomedia' 'Paradox Interactive' 'Focus Multimedia'

In [16]:
#4 Columns that have prices and numerical values are loaded as objects, not floats.
df2['Ventas NA'] = df2['Ventas NA'].str.replace(',', '.', regex=True).astype(float)
df2['Ventas EU'] = df2['Ventas EU'].str.replace(',', '.', regex=True).astype(float)
df2['Ventas JP'] = df2[ 'Ventas JP'].str.replace(',', '.', regex=True).astype(float)
df2['Ventas Otros'] = df2['Ventas Otros'].str.replace(',', '.', regex=True).astype(float)
df2['Ventas Global'] = df2['Ventas Global'].str.replace(',', '.', regex=True).astype(float)

In [17]:
# Groups are generated through SQL in python using the sqlite3 library
import sqlite3

result = df2.query("Genero == 'Action'")
result.head()

Unnamed: 0,Nombre,Plataforma,Año,Ventas NA,Ventas EU,Ventas JP,Ventas Otros,Ventas Global,Genero,Editorial
8,Yowamushi Pedal,3DS,2015,0.0,0.0,0.04,0.0,0.04,Action,Namco Bandai Games
13,Yokai Watch Busters,3DS,2015,0.0,0.0,2.28,0.0,2.28,Action,Level 5
14,Yokai Watch 3,3DS,2016,0.0,0.0,1.27,0.0,1.27,Action,Level 5
15,Yokai Sangokushi,3DS,2016,0.0,0.0,0.55,0.0,0.55,Action,Level 5
121,Thomas and Friends: Steaming around Sodor,3DS,2015,0.0,0.02,0.0,0.0,0.02,Action,Avanquest Software


In [18]:
!pip install google-cloud-bigquery



In [19]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Reemplaza 'tu-archivo-de-credenciales.json' con la ruta al archivo JSON de credenciales de tu cuenta de servicio.
credentials = service_account.Credentials.from_service_account_file('C:\\Users\\REMO\\Desktop\\Analisis de Datos\\Poyecto Procesamiento\\TDGlobant\\globanttd-a23103587ee1.json')

# Crea un cliente de BigQuery.
client = bigquery.Client(credentials=credentials)

In [20]:
from pandas_gbq import to_gbq
# Especifica el ID del proyecto y el nombre del conjunto de datos y la tabla en BigQuery donde deseas cargar los datos.
project_id = 'globanttd'
dataset_id = 'Videogames'
table_id = 'videogames1'

job_config = bigquery.LoadJobConfig()
job_config.write_disposition = 'WRITE_TRUNCATE'  # Opcional: Sobrescribe la tabla existente.

# Carga el DataFrame en BigQuery.
dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)

job = client.load_table_from_dataframe(df2, table_ref, location='US', job_config=job_config)
job.result()  # Espera a que se complete el trabajo de carga.

print(f'Datos cargados en {project_id}.{dataset_id}.{table_id}')


Datos cargados en globanttd.Videogames.videogames1


In [22]:
import pandas_gbq
print(pandas_gbq.__version__)

0.19.2


In [25]:
pip install apache-airflow

Collecting apache-airflow
  Obtaining dependency information for apache-airflow from https://files.pythonhosted.org/packages/4e/7b/498a6d2f8a95b35b979763fe27a1e6a162f4d4de5e79e30655f21b8e0458/apache_airflow-2.7.2-py3-none-any.whl.metadata
  Downloading apache_airflow-2.7.2-py3-none-any.whl.metadata (102 kB)
     ---------------------------------------- 0.0/102.4 kB ? eta -:--:--
     -------------------------------------- 102.4/102.4 kB 6.1 MB/s eta 0:00:00
Collecting alembic<2.0,>=1.6.3 (from apache-airflow)
  Obtaining dependency information for alembic<2.0,>=1.6.3 from https://files.pythonhosted.org/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl.metadata
  Downloading alembic-1.12.0-py3-none-any.whl.metadata (7.2 kB)
Collecting argcomplete>=1.10 (from apache-airflow)
  Obtaining dependency information for argcomplete>=1.10 from https://files.pythonhosted.org/packages/1e/05/223116a4a5905d6b26bff334ffc7b74474fafe23fcb10532ca