# 01 - Import Dataset   

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

import os
import sys
import yaml

# Notebook-wide constants.
DATASET = "Tic-Tac-Toe"                # dataset label; used below to name the Drive folder on Colab
COLAB = "google.colab" in sys.modules  # True when the google.colab module is already loaded
DEBUG = False
SEED = 666

# Display preferences: seaborn dark-grid style for plots, and no limit on the
# number of DataFrame columns shown.
sns.set(style="darkgrid")
pd.set_option("display.max_columns", None)

In [2]:
# Recomputed here so that this cell can be re-run on its own.
COLAB = 'google.colab' in sys.modules

if COLAB:
    from google.colab import drive  # only importable inside Colab

    # Mount Google Drive only when the mount point is not there yet.
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

    # Working directory of this dataset on Drive. The trailing slash is kept
    # because paths are built from ROOT by plain string concatenation.
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
    # os.makedirs also creates the intermediate "datasets" folder, so no separate
    # step is needed for it (the previous one read ROOT before it was assigned).
    os.makedirs(ROOT, exist_ok=True)
else:
    ROOT = "./"


def makedirs(d):
    """Create the directory ``d`` below ``ROOT`` if it does not exist yet.

    Args:
        d: Directory path relative to ``ROOT`` (appended by string
            concatenation, so it must not start with a slash).

    The call is a no-op when the directory already exists, on Colab and
    locally alike.
    """
    os.makedirs(ROOT + d, exist_ok=True)


# Folders used by the notebook: raw downloads, processed data, outputs.
for d in ['orig', 'data', 'output']:
    makedirs(d)

## Load Dataset

In [3]:
import urllib.request  # needed by urlretrieve below; not imported anywhere else
import zipfile

# Source archive and where it is cached locally.
BASE_URL = "https://archive.ics.uci.edu/static/public/101/tic+tac+toe+endgame.zip"
target = f"{ROOT}/orig/tic+tac+toe+endgame.zip"
# File read by the loading cell; its presence tells us extraction already happened.
extracted = f"{ROOT}/orig/tic-tac-toe.data"

# Download only when the archive is not cached yet.
if not os.path.isfile(target):
    print("Downloading remote zip file")
    urllib.request.urlretrieve(BASE_URL, target)

# Extract whenever the data file is missing, so an archive left over from an
# interrupted run is still unpacked instead of being reported as a usable copy.
if not os.path.isfile(extracted):
    with zipfile.ZipFile(target, "r") as archive:  # closes the handle when done
        archive.extractall(f"{ROOT}/orig")
    print("Zip file contents extracted successfully.")
else:
    print("Using local copy of files")



Using local copy of files


In [4]:
# Column names for the raw file: the nine board squares, row by row
# ("<row>-<column>-square"), followed by the "score" column. Only the first
# character of each name is upper-cased.
columns = [
    label[:1].upper() + label[1:]
    for label in (
        *(
            f"{row}-{col}-square"
            for row in ("top", "middle", "bottom")
            for col in ("left", "middle", "right")
        ),
        "score",
    )
]

In [5]:
# The raw file is read without a header row; names come from `columns`.
data_path = f"{ROOT}/orig/tic-tac-toe.data"
df = pd.read_csv(data_path, names=columns, header=None)
print(df.shape)
df.head()

(958, 10)


Unnamed: 0,Top-left-square,Top-middle-square,Top-right-square,Middle-left-square,Middle-middle-square,Middle-right-square,Bottom-left-square,Bottom-middle-square,Bottom-right-square,Score
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [6]:
# Summary of the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Top-left-square       958 non-null    object
 1   Top-middle-square     958 non-null    object
 2   Top-right-square      958 non-null    object
 3   Middle-left-square    958 non-null    object
 4   Middle-middle-square  958 non-null    object
 5   Middle-right-square   958 non-null    object
 6   Bottom-left-square    958 non-null    object
 7   Bottom-middle-square  958 non-null    object
 8   Bottom-right-square   958 non-null    object
 9   Score                 958 non-null    object
dtypes: object(10)
memory usage: 75.0+ KB


In [7]:
# Numeric encoding shared by all text columns: board marks (x / o / blank) and
# the class label (positive / negative).
ENCODING = {'x': 1, 'o': -1, 'b': 0, 'positive': 1, 'negative': -1}

for c in df.columns:
    # Covers both the classic `object` dtype and dedicated string dtypes;
    # columns already converted to category are skipped, so re-running is safe.
    if pd.api.types.is_string_dtype(df[c].dtype):
        # Series.map turns anything missing from the dict into NaN without
        # warning, so fail loudly instead of corrupting the column.
        unknown = set(df[c].unique()) - set(ENCODING)
        if unknown:
            raise ValueError(
                f"Column {c!r} contains values with no encoding: "
                f"{sorted(unknown, key=str)}"
            )
        df[c] = df[c].map(ENCODING).astype("category")
df.head(2)

Unnamed: 0,Top-left-square,Top-middle-square,Top-right-square,Middle-left-square,Middle-middle-square,Middle-right-square,Bottom-left-square,Bottom-middle-square,Bottom-right-square,Score
0,1,1,1,1,-1,-1,1,-1,-1,1
1,1,1,1,1,-1,-1,-1,1,-1,1


In [8]:
# Persist the encoded frame under ROOT's data/ folder as a pickle file.
# NOTE: pickle files should only be loaded back from trusted sources.
df.to_pickle(f"{ROOT}/data/data.pkl")

O dataframe é salvo em formato Pickle, uma forma serializada de armazenar um dataframe do Pandas: basicamente, grava-se no disco a representação exata do dataframe (incluindo os tipos das colunas).
- mais rápido para ler e gravar
- menos pesado
- não permite "leitura humana"; por isso, como mais adiante fizemos uma análise mais aprofundada dos dados, salvamos também como .csv para leitura, embora isso não fosse necessário.