# Reading CSV and TXT files

In [1]:
import pandas as pd

## Reading data with Python

In [3]:
with open('files/btc-market-price.csv', 'r') as fp:
    print(fp)

<_io.TextIOWrapper name='files/btc-market-price.csv' mode='r' encoding='UTF-8'>


In [4]:
with open('files/btc-market-price.csv', 'r') as fp:
    for index, line in enumerate(fp.readlines()):
        if index < 10:
            print(index, line)

0 2/4/17 0:00,1099.169125

1 3/4/17 0:00,1141.813

2 4/4/17 0:00,?

3 5/4/17 0:00,1133.079314

4 6/4/17 0:00,-

5 7/4/17 0:00,-

6 8/4/17 0:00,1181.149838

7 9/4/17 0:00,1208.8005

8 10/4/17 0:00,1207.744875

9 11/4/17 0:00,1226.617038



In [5]:
with open('files/btc-market-price.csv', 'r') as fp:
    for index, line in enumerate(fp.readlines()):
        if index < 10:
            timestamp, price = line.split(',')
            print(f"{timestamp}: ${price}")

2/4/17 0:00: $1099.169125

3/4/17 0:00: $1141.813

4/4/17 0:00: $?

5/4/17 0:00: $1133.079314

6/4/17 0:00: $-

7/4/17 0:00: $-

8/4/17 0:00: $1181.149838

9/4/17 0:00: $1208.8005

10/4/17 0:00: $1207.744875

11/4/17 0:00: $1226.617038



In [7]:
!head files/exam_review.csv

first_name>last_name>age>math_score>french_score
Ray>Morley>18>"68,000">"75,000"
Melvin>Scott>24>77>83
Amirah>Haley>22>92>67

Gerard>Mills>19>"78,000">72
Amy>Grimes>23>91>81


### The 'csv' module

In [8]:
import csv

In [11]:
with open('files/btc-market-price.csv', 'r') as fp:
    reader = csv.reader(fp)
    for index, (timestamp, price) in enumerate(reader):
        if index < 10:
            print(timestamp, '$' + price)

2/4/17 0:00 $1099.169125
3/4/17 0:00 $1141.813
4/4/17 0:00 $?
5/4/17 0:00 $1133.079314
6/4/17 0:00 $-
7/4/17 0:00 $-
8/4/17 0:00 $1181.149838
9/4/17 0:00 $1208.8005
10/4/17 0:00 $1207.744875
11/4/17 0:00 $1226.617038


In [15]:
with open('files/exam_review.csv', 'r') as fp:
    reader = csv.reader(fp, delimiter='>')
    for index, values in enumerate(reader):
        if not values:
            continue
        fname, lname, age, math, french = values
        print(values)

['first_name', 'last_name', 'age', 'math_score', 'french_score']
['Ray', 'Morley', '18', '68,000', '75,000']
['Melvin', 'Scott', '24', '77', '83']
['Amirah', 'Haley', '22', '92', '67']
['Gerard', 'Mills', '19', '78,000', '72']
['Amy', 'Grimes', '23', '91', '81']


In [16]:
csv_url = 'https://raw.githubusercontent.com/datasets/gdp/master/data/gdp.csv'

pd.read_csv(csv_url).head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1968,25760680000.0
1,Arab World,ARB,1969,28434200000.0
2,Arab World,ARB,1970,31385500000.0
3,Arab World,ARB,1971,36426910000.0
4,Arab World,ARB,1972,43316060000.0


### Header of csv

In [17]:
pd.read_csv('files/btc-market-price.csv').head()

Unnamed: 0,2/4/17 0:00,1099.169125
0,3/4/17 0:00,1141.813
1,4/4/17 0:00,?
2,5/4/17 0:00,1133.079314
3,6/4/17 0:00,-
4,7/4/17 0:00,-


In [19]:
df = pd.read_csv('files/btc-market-price.csv', header=None)

df.head()

Unnamed: 0,0,1
0,2/4/17 0:00,1099.169125
1,3/4/17 0:00,1141.813
2,4/4/17 0:00,?
3,5/4/17 0:00,1133.079314
4,6/4/17 0:00,-


### Missing values with 'na_values' parameter

In [21]:
df = pd.read_csv('files/btc-market-price.csv',
                 header=None,
                 na_values=['', '?', '-'])

df.head()

Unnamed: 0,0,1
0,2/4/17 0:00,1099.169125
1,3/4/17 0:00,1141.813
2,4/4/17 0:00,
3,5/4/17 0:00,1133.079314
4,6/4/17 0:00,


### Column names using 'names', parameter

In [22]:
df = pd.read_csv('files/btc-market-price.csv',
                 header=None,
                 na_values=['', '?', '-'],
                 names=['Timestamp', 'Price'])

df.head()

Unnamed: 0,Timestamp,Price
0,2/4/17 0:00,1099.169125
1,3/4/17 0:00,1141.813
2,4/4/17 0:00,
3,5/4/17 0:00,1133.079314
4,6/4/17 0:00,


### Column types using 'dtype' parameter

In [24]:
df = pd.read_csv('files/btc-market-price.csv',
                 header=None,
                 na_values=['', '?', '-'],
                 names=['Timestamp', 'Price'],
                 dtype={'Price': 'float'})

df.head()

Unnamed: 0,Timestamp,Price
0,2/4/17 0:00,1099.169125
1,3/4/17 0:00,1141.813
2,4/4/17 0:00,
3,5/4/17 0:00,1133.079314
4,6/4/17 0:00,


In [26]:
df.dtypes

Timestamp     object
Price        float64
dtype: object

### Date parser using 'parse_dates' parameter

In [27]:
df = pd.read_csv('files/btc-market-price.csv',
                 header=None,
                 na_values=['', '?', '-'],
                 names=['Timestamp', 'Price'],
                 dtype={'Price': 'float'},
                 parse_dates=[0])

df.head()

Unnamed: 0,Timestamp,Price
0,2017-02-04,1099.169125
1,2017-03-04,1141.813
2,2017-04-04,
3,2017-05-04,1133.079314
4,2017-06-04,


In [28]:
df.dtypes

Timestamp    datetime64[ns]
Price               float64
dtype: object

### Adding index to our data using 'index_col' parameter

In [29]:
df = pd.read_csv('files/btc-market-price.csv',
                 header=None,
                 na_values=['', '?', '-'],
                 names=['Timestamp', 'Price'],
                 dtype={'Price': 'float'},
                 parse_dates=[0],
                 index_col=[0])

df.head()

Unnamed: 0_level_0,Price
Timestamp,Unnamed: 1_level_1
2017-02-04,1099.169125
2017-03-04,1141.813
2017-04-04,
2017-05-04,1133.079314
2017-06-04,


In [30]:
df.dtypes

Price    float64
dtype: object

## A more challending parsing

In [32]:
exam_df = pd.read_csv('files/exam_review.csv')
exam_df

Unnamed: 0,Unnamed: 1,first_name>last_name>age>math_score>french_score
"Ray>Morley>18>""68","000"">""75","000"""
Melvin>Scott>24>77>83,,
Amirah>Haley>22>92>67,,
"Gerard>Mills>19>""78","000"">72",
Amy>Grimes>23>91>81,,


### Custom data delimiters using 'sep' parameter

In [33]:
exam_df = pd.read_csv('files/exam_review.csv',
                     sep='>')

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18,68000,75000
1,Melvin,Scott,24,77,83
2,Amirah,Haley,22,92,67
3,Gerard,Mills,19,78000,72
4,Amy,Grimes,23,91,81


### Custom numeric 'decimal' and 'throusands' character

In [34]:
exam_df = pd.read_csv('files/exam_review.csv',
                     sep='>')

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18,68000,75000
1,Melvin,Scott,24,77,83
2,Amirah,Haley,22,92,67
3,Gerard,Mills,19,78000,72
4,Amy,Grimes,23,91,81


In [35]:
exam_df[['math_score', 'french_score']].dtypes

math_score      object
french_score    object
dtype: object

In [36]:
exam_df = pd.read_csv('files/exam_review.csv',
                     sep='>',
                     decimal=',')

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18,68.0,75.0
1,Melvin,Scott,24,77.0,83.0
2,Amirah,Haley,22,92.0,67.0
3,Gerard,Mills,19,78.0,72.0
4,Amy,Grimes,23,91.0,81.0


In [37]:
exam_df[['math_score', 'french_score']].dtypes

math_score      float64
french_score    float64
dtype: object

In [38]:
exam_df = pd.read_csv('files/exam_review.csv',
                     sep='>',
                     thousands=',')

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18,68000,75000
1,Melvin,Scott,24,77,83
2,Amirah,Haley,22,92,67
3,Gerard,Mills,19,78000,72
4,Amy,Grimes,23,91,81


### Excluding specific rows

In [44]:
exam_df = pd.read_csv('files/exam_review.csv',
                      sep='>',
                      decimal=',')

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18,68.0,75.0
1,Melvin,Scott,24,77.0,83.0
2,Amirah,Haley,22,92.0,67.0
3,Gerard,Mills,19,78.0,72.0
4,Amy,Grimes,23,91.0,81.0


In [41]:
exam_df = pd.read_csv('files/exam_review.csv',
                      sep='>',
                      decimal=',',
                      skiprows=[1, 3])

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Melvin,Scott,24,77.0,83
1,Gerard,Mills,19,78.0,72
2,Amy,Grimes,23,91.0,81


In [42]:
exam_df = pd.read_csv('files/exam_review.csv',
                      sep='>',
                      decimal=',',
                      skiprows=2)

exam_df

Unnamed: 0,Melvin,Scott,24,77,83
0,Amirah,Haley,22,92.0,67
1,Gerard,Mills,19,78.0,72
2,Amy,Grimes,23,91.0,81


### Get rid of blank lines

In [45]:
exam_df = pd.read_csv('files/exam_review.csv',
                      sep='>',
                      decimal=',',
                      skip_blank_lines=False)

exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18.0,68.0,75.0
1,Melvin,Scott,24.0,77.0,83.0
2,Amirah,Haley,22.0,92.0,67.0
3,,,,,
4,Gerard,Mills,19.0,78.0,72.0
5,Amy,Grimes,23.0,91.0,81.0


### Loading specific columns

In [46]:
pd.read_csv('files/exam_review.csv',
            usecols=['first_name', 'last_name', 'age'],
            sep='>')

Unnamed: 0,first_name,last_name,age
0,Ray,Morley,18
1,Melvin,Scott,24
2,Amirah,Haley,22
3,Gerard,Mills,19
4,Amy,Grimes,23


In [47]:
pd.read_csv('files/exam_review.csv',
            usecols=[0, 1, 2],
            sep='>')

Unnamed: 0,first_name,last_name,age
0,Ray,Morley,18
1,Melvin,Scott,24
2,Amirah,Haley,22
3,Gerard,Mills,19
4,Amy,Grimes,23


### Using a 'Series' instead of 'DataFrame'

In [50]:
exam_test_1 = pd.read_csv('files/exam_review.csv',
                         sep='>',
                         usecols=['last_name'])

In [51]:
exam_test_1

Unnamed: 0,last_name
0,Morley
1,Scott
2,Haley
3,Mills
4,Grimes


In [52]:
type(exam_test_1)

pandas.core.frame.DataFrame

In [53]:
exam_test_2 = pd.read_csv('files/exam_review.csv',
                         sep='>',
                         usecols=['last_name'],
                         squeeze=True)

exam_test_2

type(exam_test_2)

pandas.core.series.Series

In [54]:
type(exam_test_2)

pandas.core.series.Series

In [55]:
exam_test_2

0    Morley
1     Scott
2     Haley
3     Mills
4    Grimes
Name: last_name, dtype: object

## Save to csv-file

In [56]:
exam_df

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18.0,68.0,75.0
1,Melvin,Scott,24.0,77.0,83.0
2,Amirah,Haley,22.0,92.0,67.0
3,,,,,
4,Gerard,Mills,19.0,78.0,72.0
5,Amy,Grimes,23.0,91.0,81.0


In [58]:
exam_df.to_csv()

',first_name,last_name,age,math_score,french_score\n0,Ray,Morley,18.0,68.0,75.0\n1,Melvin,Scott,24.0,77.0,83.0\n2,Amirah,Haley,22.0,92.0,67.0\n3,,,,,\n4,Gerard,Mills,19.0,78.0,72.0\n5,Amy,Grimes,23.0,91.0,81.0\n'

In [61]:
exam_df.to_csv('files/out1.csv')

In [62]:
exam_df.to_csv('files/out2.csv',
              index=None)

In [63]:
pd.read_csv('files/out.csv')

Unnamed: 0,first_name,last_name,age,math_score,french_score
0,Ray,Morley,18.0,68.0,75.0
1,Melvin,Scott,24.0,77.0,83.0
2,Amirah,Haley,22.0,92.0,67.0
3,,,,,
4,Gerard,Mills,19.0,78.0,72.0
5,Amy,Grimes,23.0,91.0,81.0
