In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras

# Record the interpreter and library versions this notebook was executed with.
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.4
pandas 0.24.2
sklearn 0.21.2
tensorflow 2.1.0
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf


In [9]:
# Slice a 1-D array into a dataset with one scalar element per array entry (0 through 9).
dataset = tf.data.Dataset.from_tensor_slices(
    np.arange(10)
)
print(dataset)

<TensorSliceDataset shapes: (), types: tf.int32>


In [10]:
# A tf.data dataset is directly iterable; every element comes back as a tensor.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [11]:
# Rebuild the pipeline from the source array instead of re-binding `dataset`
# from itself: that keeps this cell idempotent, so running it twice does not
# stack a second repeat/batch on top of the first.
# 1. repeat(2): go through the ten source elements for two epochs (20 elements).
# 2. batch(7): group consecutive elements into batches of 7; the last batch
#    keeps the 6 leftover elements (the remainder is not dropped by default).
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10)).repeat(2).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9], shape=(6,), dtype=int32)


In [12]:
# interleave: map each element of `dataset` to its own dataset and merge the results.
# Typical use case: a dataset of file names -> a dataset of the records in those files.
dataset2 = dataset.interleave(
    lambda batch: tf.data.Dataset.from_tensor_slices(batch),
    cycle_length=5,
    block_length=5,
)
for element in dataset2:
    print(element)
# Result: up to block_length consecutive elements are taken from each
# sub-dataset in turn, and the pieces are concatenated into the new dataset.

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [15]:
# Slicing a (features, labels) tuple pairs each feature row with its label.
x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])

dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
print(dataset3)

# Every element unpacks into a (feature tensor, label tensor) pair.
for features, label in dataset3:
    print(features.numpy(), label.numpy())

<TensorSliceDataset shapes: ((2,), ()), types: (tf.int32, tf.string)>
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'


In [19]:
# Slicing a dict yields elements that are dicts with the same keys,
# which is similar to walking over the rows of a DataFrame.
columns = {'feature': x, 'label': y}
dataset4 = tf.data.Dataset.from_tensor_slices(columns)
print(dataset4)

for row in dataset4:
    print(row['feature'].numpy(), row['label'].numpy())

<TensorSliceDataset shapes: {feature: (2,), label: ()}, types: {feature: tf.int32, label: tf.string}>
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'


In [22]:
# Load the California housing dataset and show its description and array shapes.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)
print('-'*50)
# Split the data: hold out a test set first, then carve a validation set
# out of what remains.
from sklearn.model_selection import train_test_split
# test_size=0.25 gives a train:test ratio of 3:1 (noted in the original as the default ratio).
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, test_size=0.25, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, test_size=0.25, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
print('-'*50)
# Standardize the features. The scaler is fitted on the training split only
# and then re-used, unchanged, to transform the validation and test splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_valid, x_test)
)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [32]:
# Generate CSV files.
# Directory that receives the generated CSV shards. os.makedirs with
# exist_ok=True creates it (and any missing parents) when absent and does
# nothing when it already exists, avoiding the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`.
output_dir = 'generate_csv'
os.makedirs(output_dir, exist_ok=True)
def save_to_csv(output_dir,data,name_prefix,header=None,n_parts=10):
    """Write the rows of ``data`` to ``n_parts`` CSV files and return their paths.

    The row indices ``0 .. len(data) - 1`` are split into ``n_parts`` contiguous
    chunks with ``np.array_split``; chunk ``i`` is written to
    ``<output_dir>/<name_prefix>_<i:02d>.csv``.

    Args:
        output_dir: Directory for the generated files; created if missing.
        data: Indexable collection of rows (e.g. a 2-D array); every row is
            an iterable of cell values.
        name_prefix: Prefix used in every generated file name.
        header: Optional header line (without the trailing newline) written
            at the top of every file. Skipped when ``None``.
        n_parts: Number of files to produce.

    Returns:
        The list of file paths that were written, in chunk order.
    """
    def _format_cell(value):
        # NumPy >= 2 renders scalars under repr() as e.g. ``np.float64(1.5)``,
        # which would end up verbatim in the CSV. Converting to the native
        # Python value first gives a plain literal on every NumPy version.
        if isinstance(value, np.generic):
            value = value.item()
        return repr(value)

    # Make the function usable on its own, without relying on the caller to
    # have created the target directory beforehand.
    os.makedirs(output_dir, exist_ok=True)
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []
    for file_idx,row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding = 'utf-8') as f:
            if header is not None:
                f.write(header+"\n")
            for row_idx in row_indices:
                f.write(",".join(_format_cell(col) for col in data[row_idx]))
                f.write('\n')
    return filenames

# Append the target as the last column so each CSV row holds the scaled
# features followed by the label.
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]
# Header: the dataset's feature names plus a name for the target column
# (spelling fixed from "MidianHouseValue"). list(...) copies the names so the
# concatenation works whatever sequence type feature_names is.
header_cols = list(housing.feature_names) + ["MedianHouseValue"]
header_str = ",".join(header_cols)

# Each split is written as 20 CSV shards; every variable holds the list of
# file paths returned by save_to_csv.
train_filename = save_to_csv(output_dir,train_data,'train',header_str,n_parts=20)
valid_filename = save_to_csv(output_dir,valid_data,'valid',header_str,n_parts=20)
test_filename = save_to_csv(output_dir,test_data,'test',header_str,n_parts=20)

In [23]:
# List the names of the dataset's feature columns.
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
