Opening a CSV file from a URL

In [2]:
import requests
import pandas as pd
from io import StringIO

# URL of the CSV file to download
url = "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"

# Fetch the CSV data over HTTP. The timeout keeps the cell from hanging
# indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (404, 500, ...) instead of handing an error
# page to the CSV parser.
response.raise_for_status()

# Wrap the response text in StringIO so pandas can read it like a file.
# NOTE(review): the rendered column labels keep their quote characters,
# which suggests the file has a space after each comma; passing
# skipinitialspace=True would likely clean them up — confirm before changing.
data = pd.read_csv(StringIO(response.text))

# Display the first 10 rows of the data
data.head(10)




Unnamed: 0,Month,"""1958""","""1959""","""1960"""
0,JAN,340,360,417
1,FEB,318,342,391
2,MAR,362,406,419
3,APR,348,396,461
4,MAY,363,420,472
5,JUN,435,472,535
6,JUL,491,548,622
7,AUG,505,559,606
8,SEP,404,463,508
9,OCT,359,407,461


Using the `sep` parameter


In [None]:
# Read the file using a tab as the field separator
# (read_csv splits on commas unless told otherwise).
tsvdata = pd.read_csv(
    "mtcars.tsv",
    sep="\t",
)
tsvdata.head(n=10)

In [6]:
# Column labels are supplied through `names`; with `names` given and no
# `header` argument, the first line of the file is parsed as data.
# NOTE(review): in the rendered output every record lands in "Name" while
# "Age" and "From" stay empty, which suggests the file is not actually
# tab-delimited — confirm the delimiter used in the file.
tsvdata = pd.read_csv(
    "example_with_out_col_names.tsv",
    sep='\t',
    names=["Name", "Age", "From"],
)
tsvdata.head(n=5)

Unnamed: 0,Name,Age,From
0,John 23 New York,,
1,Jane 29 Los Angeles,,
2,Doe 31 Chicago,,


The `index_col` parameter


In [9]:
# Use the "id" column as the row index instead of the default integer range.
data = pd.read_csv(
    "index_col_parameter.csv",
    index_col="id",
)
data

Unnamed: 0_level_0,name,age,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,John Doe,28,New York
2,Jane Smith,32,Los Angeles
3,Emily Davis,22,Chicago
4,Michael Brown,45,Houston
5,Jessica Johnson,37,Phoenix


The `header` parameter


In [10]:
# header=1 takes the column labels from the second line of the file
# (line numbers are 0-based) and discards the line before it. As the output
# shows, the first data record ends up as the header here.
data = pd.read_csv(
    "index_col_parameter.csv",
    header=1,
)
data

Unnamed: 0,1,John Doe,28,New York
0,2,Jane Smith,32,Los Angeles
1,3,Emily Davis,22,Chicago
2,4,Michael Brown,45,Houston
3,5,Jessica Johnson,37,Phoenix


The `usecols` parameter

In [12]:
# Load only the listed columns; every other column in the file is ignored.
data = pd.read_csv(
    "index_col_parameter.csv",
    usecols=["id", "name", "age"],
)
data

Unnamed: 0,id,name,age
0,1,John Doe,28
1,2,Jane Smith,32
2,3,Emily Davis,22
3,4,Michael Brown,45
4,5,Jessica Johnson,37


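Squeezing to a Series (the `squeeze` parameter of `read_csv` was removed in pandas 2.0; call `DataFrame.squeeze()` on the result instead)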


In [15]:
# Read a single column, then collapse the one-column DataFrame into a Series.
data = pd.read_csv(
    "index_col_parameter.csv",
    usecols=["name"],
).squeeze()
data

0           John Doe
1         Jane Smith
2        Emily Davis
3      Michael Brown
4    Jessica Johnson
Name: name, dtype: object

The `skiprows` and `nrows` parameters

In [22]:
# A list passed to skiprows holds 0-based line numbers of the file. Line 0 is
# the header line, so with it skipped the next remaining line supplies the
# column labels (visible in the output).
data = pd.read_csv(
    "index_col_parameter.csv",
    skiprows=[0, 3, 4],
)
data

Unnamed: 0,1,John Doe,28,New York
0,2,Jane Smith,32,Los Angeles
1,5,Jessica Johnson,37,Phoenix


In [23]:
# Parse only the first data row; the header line does not count toward nrows.
data = pd.read_csv(
    "index_col_parameter.csv",
    nrows=1,
)
data

Unnamed: 0,id,name,age,city
0,1,John Doe,28,New York


Encoding parameter

In [27]:
# Decode the file as Latin-1 rather than the default UTF-8.
# NOTE(review): the last row renders as literal "?" characters, which looks
# like the original text was already lost inside the file itself — confirm
# the file's real encoding and contents.
data = pd.read_csv(
    "encoding_error_data (2).csv",
    encoding="latin-1",
)
data

Unnamed: 0,name,age,city
0,John,23,New York
1,Jane,29,Los Angeles
2,Doe,31,Chicago
3,?????,25,??????


Skipping bad lines (`on_bad_lines`)

In [32]:
# Silently drop malformed rows (e.g. rows with too many fields) instead of
# raising a parser error.
data = pd.read_csv(
    "error_bad_lines.csv",
    on_bad_lines='skip',
)
data

Unnamed: 0,name,age,city
0,John,23,New York
1,Jane,29,Los Angeles
2,Doe,31,Chicago
3,Bob,34,Miami


The `dtype` parameter

In [33]:
# Same read as before, but force the "age" column to float while parsing.
data = pd.read_csv(
    "error_bad_lines.csv",
    on_bad_lines='skip',
    dtype={'age': float},
)
data

Unnamed: 0,name,age,city
0,John,23.0,New York
1,Jane,29.0,Los Angeles
2,Doe,31.0,Chicago
3,Bob,34.0,Miami


Handling Dates

In [37]:
# Ask the parser to convert the "date" column to datetime values instead of
# leaving it as plain strings.
data = pd.read_csv(
    "parese_date.csv",
    parse_dates=['date'],
)
data

Unnamed: 0,date,temperature
0,2025-01-01,30
1,2025-01-02,31
2,2025-01-03,29
3,2025-01-04,32
4,2025-01-05,28


Converters (the `converters` parameter)

In [38]:
def rename(name):
    """Map the exact value "MS Dhoni" to "THala"; return any other value unchanged.

    Args:
        name: The raw cell value to inspect.

    Returns:
        "THala" when ``name == "MS Dhoni"``, otherwise ``name`` itself.
    """
    return "THala" if name == "MS Dhoni" else name
# Pass every raw "Player_Name" value through rename() while the file is parsed.
ipldata = pd.read_csv(
    "ipl_data.csv",
    converters={"Player_Name": rename},
)
ipldata.head(n=10)


Unnamed: 0,Year,Player_Name,Matches_Batted,Not_Outs,Runs_Scored,Highest_Score,Batting_Average,Balls_Faced,Batting_Strike_Rate,Centuries,...,Matches_Bowled,Balls_Bowled,Runs_Conceded,Wickets_Taken,Best_Bowling_Match,Bowling_Average,Economy_Rate,Bowling_Strike_Rate,Four_Wicket_Hauls,Five_Wicket_Hauls
0,2024,Ruturaj Gaikwad,2,0,61,46,30.5,51,119.61,0,...,2,0,0,0,0,0,0,0,0,0
1,2023,Ruturaj Gaikwad,16,1,590,92,42.14,400,147.5,0,...,16,0,0,0,0,0,0,0,0,0
2,2022,Ruturaj Gaikwad,14,0,368,99,26.29,291,126.46,0,...,14,0,0,0,0,0,0,0,0,0
3,2021,Ruturaj Gaikwad,16,2,635,101*,45.35,466,136.26,1,...,16,0,0,0,0,0,0,0,0,0
4,2020,Ruturaj Gaikwad,6,2,204,72,51.0,169,120.71,0,...,6,0,0,0,0,0,0,0,0,0
5,2023,Devon Conway,16,2,672,92*,51.69,481,139.71,0,...,16,0,0,0,0,0,0,0,0,0
6,2022,Devon Conway,7,1,252,87,42.0,173,145.66,0,...,7,0,0,0,0,0,0,0,0,0
7,2024,THala,2,0,0,0,0.0,0,0.0,0,...,2,0,0,0,0,0,0,0,0,0
8,2023,THala,16,8,104,32*,26.0,57,182.46,0,...,16,0,0,0,0,0,0,0,0,0
9,2022,THala,14,6,232,50*,33.14,188,123.4,0,...,14,0,0,0,0,0,0,0,0,0


na_values parameter

In [40]:
# Treat the strings "2024" and "2001" as missing values. A plain list applies
# to every column, not only "Year". The output shows "Year" rendered as float
# once it contains NaN.
ipldata = pd.read_csv(
    "ipl_data.csv",
    na_values=['2024', "2001"],
)
ipldata.head(n=10)

Unnamed: 0,Year,Player_Name,Matches_Batted,Not_Outs,Runs_Scored,Highest_Score,Batting_Average,Balls_Faced,Batting_Strike_Rate,Centuries,...,Matches_Bowled,Balls_Bowled,Runs_Conceded,Wickets_Taken,Best_Bowling_Match,Bowling_Average,Economy_Rate,Bowling_Strike_Rate,Four_Wicket_Hauls,Five_Wicket_Hauls
0,,Ruturaj Gaikwad,2,0,61,46,30.5,51,119.61,0,...,2,0,0,0,0,0,0,0,0,0
1,2023.0,Ruturaj Gaikwad,16,1,590,92,42.14,400,147.5,0,...,16,0,0,0,0,0,0,0,0,0
2,2022.0,Ruturaj Gaikwad,14,0,368,99,26.29,291,126.46,0,...,14,0,0,0,0,0,0,0,0,0
3,2021.0,Ruturaj Gaikwad,16,2,635,101*,45.35,466,136.26,1,...,16,0,0,0,0,0,0,0,0,0
4,2020.0,Ruturaj Gaikwad,6,2,204,72,51.0,169,120.71,0,...,6,0,0,0,0,0,0,0,0,0
5,2023.0,Devon Conway,16,2,672,92*,51.69,481,139.71,0,...,16,0,0,0,0,0,0,0,0,0
6,2022.0,Devon Conway,7,1,252,87,42.0,173,145.66,0,...,7,0,0,0,0,0,0,0,0,0
7,,MS Dhoni,2,0,0,0,0.0,0,0.0,0,...,2,0,0,0,0,0,0,0,0,0
8,2023.0,MS Dhoni,16,8,104,32*,26.0,57,182.46,0,...,16,0,0,0,0,0,0,0,0,0
9,2022.0,MS Dhoni,14,6,232,50*,33.14,188,123.4,0,...,14,0,0,0,0,0,0,0,0,0


Loading a huge dataset in chunks

In [48]:
# With chunksize set, read_csv does not return a DataFrame: it returns a
# TextFileReader (see the output) that yields DataFrames of up to 500 rows.
ipldata = pd.read_csv(
    "ipl_data.csv",
    chunksize=500,
)
ipldata

<pandas.io.parsers.readers.TextFileReader at 0x19710dcfe10>

In [49]:
# Walk the reader one chunk at a time and print each chunk's (rows, columns).
# The reader is consumed by this loop; re-create it before iterating again.
for chunk in ipldata:
    n_rows, n_cols = chunk.shape
    print((n_rows, n_cols))

(500, 25)
(500, 25)
(172, 25)
