# Data Loading, Storage


In [None]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [None]:
# You have to upload the data by using the Files function on your dashboard

## Reading and Writing Data in Text Format

two methods:
- *read_csv*: Read a comma-separated values (csv) file into DataFrame. Also supports optionally iterating or breaking of the file into chunks.
- *to_csv*: Write object to a comma-separated values (csv) file.

### Reading Text Files

In [None]:
df= pd.read_csv("ex1.csv", sep=',')
df

In [None]:
type(df)

In [None]:
 !cat "ex1.csv"
 #!cat "examples/ex1.csv"

define separator value and header

In [None]:
# define the table separator value

df = pd.read_csv('ex1.csv', sep='\t')
df

In [None]:
# define if the table header exists 

df = pd.read_csv('ex1.csv', header=None)
df

In [None]:
 !cat "ex2.csv"

In [None]:
pd.read_csv('ex2.csv', sep=',', header=None)

define column name and/or index name

In [None]:
# define column name

pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

In [None]:
# define index name

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message')

You can skip a list of rows

In [None]:
!cat "ex4.csv"

In [None]:
pd.read_csv('ex4.csv', sep=',')

In [None]:
# you can skip a list of rows
pd.read_csv('ex4.csv', skiprows=[0, 2, 3])

In [None]:
!cat ex5.csv

In [None]:
result = pd.read_csv('ex5.csv')
result

In [None]:
None; np.nan

In [None]:
pd.isnull(result)

define *na_values*

In [None]:
result = pd.read_csv('ex5.csv', na_values=['NULL', 'one'])
result

In [None]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels)
#pd.read_csv('examples/ex5.csv', na_values=sentinels)

### Reading Text Files in Pieces

In [None]:
result = pd.read_csv('ex6.csv')
result

Read only a fixed number of rows

In [None]:
pd.read_csv('ex6.csv', nrows=5)

Read Text Files in chunks

In [None]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker

In [None]:
chunk = next(chunker)
chunk.head(5)

In [None]:
for chunk in chunker:
  print("chunk size: {}".format(chunk.shape))
  break

In [None]:
# tot = pd.Series([], dtype=float)
# for piece in chunker:
#     tot = tot.add(piece['key'].value_counts(), fill_value=0)

# tot = tot.sort_values(ascending=False)

### Writing Data to Text Format

In [None]:
data = pd.read_csv('ex5.csv')
data

In [None]:
data.iloc[0,0] = 999

In [None]:
data

In [None]:
data.to_csv('out.csv',index=False)

In [None]:
pd.read_csv('out.csv', sep=',')

In [None]:
!cat out.csv

In [None]:
data.to_csv('out.csv', index=False, sep='#')

In [None]:
# Read the file back with the same '#' delimiter it was written with;
# any other separator would collapse every row into a single column.
pd.read_csv('out.csv', sep='#')

In [None]:
!cat out.csv

### Reading and Writing Microsoft Excel Files

Read option 1: Using *ExcelFile* class

In [None]:
#xlsx = pd.ExcelFile('examples/ex1.xlsx')
xlsx = pd.ExcelFile('ex1.xlsx')

In [None]:
xlsx

In [None]:
pd.read_excel(xlsx, 'Sheet1')

Read option 2: Using *read_excel* method

In [None]:
frame = pd.read_excel('ex1.xlsx', 'Sheet1')
frame

Write option 1: using *ExcelWriter* object

In [None]:
# Use the writer as a context manager so the workbook is flushed and closed
# on exit (ExcelWriter.save() was removed in pandas 2.0), and pass the sheet
# name by keyword since the positional form is deprecated.
with pd.ExcelWriter('ex2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')

Write option 2: using *to_excel* method

In [None]:
frame

In [None]:
frame.to_excel('ex2_out.xlsx')

In [None]:
!rm examples/ex2.xlsx

### Optional: Working with Delimited Formats

In [None]:
!cat ex7.csv

In [None]:
import csv
# The csv module expects its file to be opened with newline='' so that
# line breaks embedded inside quoted fields are parsed correctly.
f = open('ex7.csv', newline='')

reader = csv.reader(f)

In [None]:
# Print every parsed row, then release the file handle backing `reader`
# even if iteration fails part-way through.
try:
    for line in reader:
        print(line)
finally:
    f.close()

In [None]:
# Load all parsed rows into a list; newline='' lets the csv module handle
# line endings (including those inside quoted fields) itself.
with open('ex7.csv', newline='') as f:
    lines = list(csv.reader(f))

In [None]:
header, values = lines[0], lines[1:]

In [None]:
values

In [None]:
header

In [None]:
# Transpose the rows into per-column tuples and pair each with its header.
column_tuples = zip(*values)
data_dict = dict(zip(header, column_tuples))
data_dict

## Optional

In [None]:
with open('ex3.txt', 'r') as f:
  lines = f.readlines()
lines

In [None]:
# str.split() with no argument splits on any run of whitespace and drops
# leading/trailing whitespace (including the newline), so no empty-string
# tokens survive when the header line has trailing spaces.
columns = lines[0].split()
columns

In [None]:
# Tokenise every data row on whitespace; blank lines (e.g. a trailing empty
# line at the end of the file) are skipped instead of becoming bogus rows.
data = [row.split() for row in lines[1:] if row.strip()]

In [None]:
data

In [None]:
columns

In [None]:
columns.insert(0, 'index')
columns

In [None]:
ds = pd.DataFrame(data, columns=columns)
ds

In [None]:
ds.set_index('index')