# Loading a CSV file into a dataframe
* CSV (Comma-Separated Values) files store tabular data as plain text, one record per line. To access the data in a CSV file, we use the Pandas function `read_csv()`, which loads the file into a DataFrame, a powerful structure for data manipulation and analysis.



In [2]:
import pandas as pd

# Read the whole airline CSV into a DataFrame; the bare `df` on the last line
# makes the notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="Airline Dataset Updated - v2.csv")
df

Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,6/28/2022,CXF,Fransisco Hazeldine,On Time
1,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,12/26/2022,YCO,Marla Parsonage,On Time
2,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,1/18/2022,GNB,Rhonda Amber,On Time
3,BRS38V,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,9/16/2022,YND,Kacie Commucci,Delayed
4,9kvTLo,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,2/25/2022,SEE,Ebonee Tree,On Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98614,hnGQ62,Gareth,Mugford,Male,85,China,Hasvik Airport,NO,Norway,EU,Europe,12-11-2022,HAA,Pammie Kingscote,Cancelled
98615,2omEzh,Kasey,Benedict,Female,19,Russia,Ampampamena Airport,MG,Madagascar,AF,Africa,10/30/2022,IVA,Dorice Lochran,Cancelled
98616,VUPiVG,Darrin,Lucken,Male,65,Indonesia,Albacete-Los Llanos Airport,ES,Spain,EU,Europe,09-10-2022,ABC,Gearalt Main,On Time
98617,E47NtS,Gayle,Lievesley,Female,34,China,Gagnoa Airport,CI,Côte d'Ivoire,AF,Africa,10/26/2022,GGN,Judon Chasle,Cancelled


In [3]:
# Show the column labels of the DataFrame held in `df` (an Index of names).
df.columns

Index(['Passenger ID', 'First Name', 'Last Name', 'Gender', 'Age',
       'Nationality', 'Airport Name', 'Airport Country Code', 'Country Name',
       'Airport Continent', 'Continents', 'Departure Date', 'Arrival Airport',
       'Pilot Name', 'Flight Status'],
      dtype='object')

In [4]:
# Read specific columns using read_csv
# `usecols` restricts the load to the listed column names; every other column
# in the file is ignored.

df = pd.read_csv(
    "Airline Dataset Updated - v2.csv",
    usecols=[
        "Passenger ID",
        "Airport Name",
        "Arrival Airport",
    ],
)
df
                 

Unnamed: 0,Passenger ID,Airport Name,Arrival Airport
0,ABVWIg,Coldfoot Airport,CXF
1,jkXXAX,Kugluktuk Airport,YCO
2,CdUz2g,Grenoble-Isère Airport,GNB
3,BRS38V,Ottawa / Gatineau Airport,YND
4,9kvTLo,Gillespie Field,SEE
...,...,...,...
98614,hnGQ62,Hasvik Airport,HAA
98615,2omEzh,Ampampamena Airport,IVA
98616,VUPiVG,Albacete-Los Llanos Airport,ABC
98617,E47NtS,Gagnoa Airport,GGN


In [5]:
## Setting an index column
# `index_col` promotes the named column to the row index, so "Departure Date"
# labels the rows instead of the default integer index.
df = pd.read_csv(
    "Airline Dataset Updated - v2.csv",
    index_col="Departure Date",
)
df

Unnamed: 0_level_0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Arrival Airport,Pilot Name,Flight Status
Departure Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6/28/2022,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,CXF,Fransisco Hazeldine,On Time
12/26/2022,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,YCO,Marla Parsonage,On Time
1/18/2022,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,GNB,Rhonda Amber,On Time
9/16/2022,BRS38V,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,YND,Kacie Commucci,Delayed
2/25/2022,9kvTLo,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,SEE,Ebonee Tree,On Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12-11-2022,hnGQ62,Gareth,Mugford,Male,85,China,Hasvik Airport,NO,Norway,EU,Europe,HAA,Pammie Kingscote,Cancelled
10/30/2022,2omEzh,Kasey,Benedict,Female,19,Russia,Ampampamena Airport,MG,Madagascar,AF,Africa,IVA,Dorice Lochran,Cancelled
09-10-2022,VUPiVG,Darrin,Lucken,Male,65,Indonesia,Albacete-Los Llanos Airport,ES,Spain,EU,Europe,ABC,Gearalt Main,On Time
10/26/2022,E47NtS,Gayle,Lievesley,Female,34,China,Gagnoa Airport,CI,Côte d'Ivoire,AF,Africa,GGN,Judon Chasle,Cancelled


In [6]:
# Handling missing values
# `na_values` lists extra strings that read_csv should treat as missing (NaN)
# while parsing; here the markers are 'N/A' and 'Unknown'.
df = pd.read_csv(
    "Airline Dataset Updated - v2.csv",
    na_values=['N/A', 'Unknown'],
)
df

Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,6/28/2022,CXF,Fransisco Hazeldine,On Time
1,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,12/26/2022,YCO,Marla Parsonage,On Time
2,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,1/18/2022,GNB,Rhonda Amber,On Time
3,BRS38V,Dominica,Pyle,Female,71,China,Ottawa / Gatineau Airport,CA,Canada,NAM,North America,9/16/2022,YND,Kacie Commucci,Delayed
4,9kvTLo,Bay,Pencost,Male,21,China,Gillespie Field,US,United States,NAM,North America,2/25/2022,SEE,Ebonee Tree,On Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98614,hnGQ62,Gareth,Mugford,Male,85,China,Hasvik Airport,NO,Norway,EU,Europe,12-11-2022,HAA,Pammie Kingscote,Cancelled
98615,2omEzh,Kasey,Benedict,Female,19,Russia,Ampampamena Airport,MG,Madagascar,AF,Africa,10/30/2022,IVA,Dorice Lochran,Cancelled
98616,VUPiVG,Darrin,Lucken,Male,65,Indonesia,Albacete-Los Llanos Airport,ES,Spain,EU,Europe,09-10-2022,ABC,Gearalt Main,On Time
98617,E47NtS,Gayle,Lievesley,Female,34,China,Gagnoa Airport,CI,Côte d'Ivoire,AF,Africa,10/26/2022,GGN,Judon Chasle,Cancelled


In [7]:
## Reading csv files with delimiters

file_path = "Airline Dataset Updated - v2.csv"


def preview_with_delimiter(path, delimiter, label, leading_blank_line=True):
    """Read `path` with the given delimiter and print the first rows.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    delimiter : str
        Separator handed to ``pd.read_csv`` (may be a regular expression).
    label : str
        Human-readable name of the delimiter, used in the printed messages.
    leading_blank_line : bool, default True
        Whether to print an empty line before the heading, so consecutive
        previews are visually separated.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or None when the file was empty or could not be
        parsed with this delimiter (a message is printed in that case).
    """
    try:
        # Malformed lines are skipped rather than aborting the whole read.
        frame = pd.read_csv(path, delimiter=delimiter, on_bad_lines='skip')
    except pd.errors.EmptyDataError:
        print(f"Error reading with {label}.")
        return None
    except pd.errors.ParserError as e:
        print(f"ParserError when using {label}: {e}")
        return None

    heading = f"Data read using {label}:"
    print(f"\n{heading}" if leading_blank_line else heading)
    print(frame.head())
    return frame


# Try reading with different delimiters
df_comma = preview_with_delimiter(file_path, ',', "comma delimiter", leading_blank_line=False)
df_tab = preview_with_delimiter(file_path, '\t', "tab delimiter")
df_semicolon = preview_with_delimiter(file_path, ';', "semicolon delimiter")
df_pipe = preview_with_delimiter(file_path, '|', "pipe delimiter")
# Raw string: '\s' is not a valid Python escape sequence, so the non-raw
# literal '\s+' triggers an "invalid escape sequence" warning.
df_space = preview_with_delimiter(file_path, r'\s+', "space delimiter")
df_custom = preview_with_delimiter(file_path, '#', "custom delimiter (#)")


  df_space = pd.read_csv(file_path, delimiter='\s+', on_bad_lines='skip')


Data read using comma delimiter:
  Passenger ID First Name Last Name  Gender  Age Nationality  \
0       ABVWIg     Edithe    Leggis  Female   62       Japan   
1       jkXXAX     Elwood      Catt    Male   62   Nicaragua   
2       CdUz2g      Darby   Felgate    Male   67      Russia   
3       BRS38V   Dominica      Pyle  Female   71       China   
4       9kvTLo        Bay   Pencost    Male   21       China   

                Airport Name Airport Country Code   Country Name  \
0           Coldfoot Airport                   US  United States   
1          Kugluktuk Airport                   CA         Canada   
2     Grenoble-Isère Airport                   FR         France   
3  Ottawa / Gatineau Airport                   CA         Canada   
4            Gillespie Field                   US  United States   

  Airport Continent     Continents Departure Date Arrival Airport  \
0               NAM  North America      6/28/2022             CXF   
1               NAM  North America 

In [8]:
# Using nrows in read_csv()
# The nrows parameter limits the number of rows read from a file, enabling
# quick previews or partial data loading for large datasets.
# Here only the first 3 data rows are loaded.

df = pd.read_csv(
    "Airline Dataset Updated - v2.csv",
    nrows=3,
)
df


Unnamed: 0,Passenger ID,First Name,Last Name,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Pilot Name,Flight Status
0,ABVWIg,Edithe,Leggis,Female,62,Japan,Coldfoot Airport,US,United States,NAM,North America,6/28/2022,CXF,Fransisco Hazeldine,On Time
1,jkXXAX,Elwood,Catt,Male,62,Nicaragua,Kugluktuk Airport,CA,Canada,NAM,North America,12/26/2022,YCO,Marla Parsonage,On Time
2,CdUz2g,Darby,Felgate,Male,67,Russia,Grenoble-Isère Airport,FR,France,EU,Europe,1/18/2022,GNB,Rhonda Amber,On Time


In [9]:
# using skiprows in read_csv()
# The skiprows parameter drops lines of the file while reading. Given an int it
# skips that many lines at the start; given a list (as here) it skips exactly
# those 0-indexed line numbers, which is useful for ignoring metadata or extra
# headers that are not part of the dataset.

# Baseline: the complete file, printed for comparison.
df = pd.read_csv("Airline Dataset Updated - v2.csv")
print("Previous Dataset: ", df, sep="\n")

# using skiprows: file lines 4 and 5 are left out of the result.
df = pd.read_csv(
    "Airline Dataset Updated - v2.csv",
    skiprows=[4, 5],
)
print("Dataset After skipping rows: ", df, sep="\n")


Previous Dataset: 
      Passenger ID  First Name  Last Name  Gender  Age Nationality  \
0           ABVWIg      Edithe     Leggis  Female   62       Japan   
1           jkXXAX      Elwood       Catt    Male   62   Nicaragua   
2           CdUz2g       Darby    Felgate    Male   67      Russia   
3           BRS38V    Dominica       Pyle  Female   71       China   
4           9kvTLo         Bay    Pencost    Male   21       China   
...            ...         ...        ...     ...  ...         ...   
98614       hnGQ62      Gareth    Mugford    Male   85       China   
98615       2omEzh       Kasey   Benedict  Female   19      Russia   
98616       VUPiVG      Darrin     Lucken    Male   65   Indonesia   
98617       E47NtS       Gayle  Lievesley  Female   34       China   
98618       8JYEcz  Wilhelmine     Touret  Female   10      Poland   

                      Airport Name Airport Country Code        Country Name  \
0                 Coldfoot Airport                   US      

In [10]:
# parse dates - converts date columns into datetime objects, simplifying operations like filtering, sorting, or time-based analysis.
# NOTE(review): the outputs earlier in this notebook show "Departure Date" in two
# layouts (e.g. 6/28/2022 and 12-11-2022); confirm the resulting column dtype is
# datetime64 rather than object — TODO verify.

df = pd.read_csv("Airline Dataset Updated - v2.csv", parse_dates=["Departure Date"])