# Pass a URL to the pd.read_csv Method

In [3]:
import pandas as pd
# CSV export served by data.cityofnewyork.us; pandas downloads it directly.
url = "https://data.cityofnewyork.us/api/views/25th-nujf/rows.csv"
# read_csv accepts a URL in the same position where it accepts a local file path.
names = pd.read_csv(url)
# Preview the first rows of the frame.
names.head()

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [4]:
# Selecting one column with [] yields a pandas Series. This bare expression is not the
# last line of the cell, so the notebook does not display it.
names["Child's First Name"]
# .to_frame() turns that Series into a one-column pandas DataFrame (the output shown).
names["Child's First Name"].to_frame()

Unnamed: 0,Child's First Name
0,GERALDINE
1,GIA
2,GIANNA
3,GISELLE
4,GRACE
...,...
49504,Nala
49505,Royalty
49506,Sariyah
49507,Simone


In [5]:
# Converting a pandas Series to plain Python objects (list, dict, ...).
# .tolist() returns the values as a Python list. Its result is not displayed here because
# only the last expression of a cell is echoed.
names["Child's First Name"].tolist()

# .to_dict() maps each index label to its value. Make sure the index labels are unique:
# a dict cannot hold duplicate keys, so for a repeated label only the last value is kept.
names["Child's First Name"].to_dict()


{0: 'GERALDINE',
 1: 'GIA',
 2: 'GIANNA',
 3: 'GISELLE',
 4: 'GRACE',
 5: 'GUADALUPE',
 6: 'HAILEY',
 7: 'HALEY',
 8: 'HANNAH',
 9: 'HAYLEE',
 10: 'HAYLEY',
 11: 'HAZEL',
 12: 'HEAVEN',
 13: 'HEIDI',
 14: 'HEIDY',
 15: 'HELEN',
 16: 'IMANI',
 17: 'INGRID',
 18: 'IRENE',
 19: 'IRIS',
 20: 'ISABEL',
 21: 'ISABELA',
 22: 'ISABELLA',
 23: 'ISABELLE',
 24: 'ISIS',
 25: 'ITZEL',
 26: 'IZABELLA',
 27: 'JACQUELINE',
 28: 'JADA',
 29: 'JADE',
 30: 'JAELYNN',
 31: 'JAMIE',
 32: 'JANELLE',
 33: 'JASLENE',
 34: 'JASMIN',
 35: 'JASMINE',
 36: 'JAYDA',
 37: 'JAYLA',
 38: 'JAYLAH',
 39: 'JAYLEEN',
 40: 'JAYLENE',
 41: 'JAYLIN',
 42: 'JAYLYN',
 43: 'JAZLYN',
 44: 'JAZMIN',
 45: 'JAZMINE',
 46: 'JENNIFER',
 47: 'JESSICA',
 48: 'JIMENA',
 49: 'JOCELYN',
 50: 'JOHANNA',
 51: 'JOSELYN',
 52: 'JULIA',
 53: 'JULIANA',
 54: 'JULIANNA',
 55: 'JULIET',
 56: 'JULIETTE',
 57: 'JULISSA',
 58: 'KAELYN',
 59: 'KAILEY',
 60: 'KAILYN',
 61: 'KAITLYN',
 62: 'KAMILA',
 63: 'KAREN',
 64: 'KARLA',
 65: 'KATE',
 66: 'KATE

In [6]:
# Build one comma-separated string of every distinct first name:
#   1. title-case each name (first letter capitalised),
#   2. drop duplicates,
#   3. sort alphabetically.
# Title-casing happens first so differently-cased spellings count as one name.
unique_names = (
    names["Child's First Name"]
    .str.title()
    .drop_duplicates()
    .sort_values()
)

", ".join(unique_names)

"Aahil, Aaliyah, Aarav, Aaron, Aarya, Aaryan, Aayan, Aayat, Abby, Abdiel, Abdoul, Abdoulaye, Abdul, Abdullah, Abdulloh, Abe, Abel, Abigail, Aboubacar, Abraham, Abrar, Abrielle, Abril, Abubakr, Ace, Ada, Adalynn, Adam, Adan, Addison, Adelaide, Adele, Adelina, Adeline, Adelyn, Adelynn, Aden, Adiel, Adina, Aditya, Adonis, Adrian, Adriana, Adrianna, Adriel, Adyan, Ahad, Aharon, Ahmad, Ahmed, Ahnaf, Ahron, Ahuva, Aicha, Aidan, Aiden, Ailani, Aileen, Aimee, Aisha, Aissata, Aissatou, Aitana, Aiza, Aizah, Akiva, Alahia, Alaia, Alaina, Alan, Alana, Alani, Alanis, Alanna, Alayna, Alba, Albert, Alberto, Aldo, Aleah, Alec, Aleena, Alejandra, Alejandro, Aleksander, Aleksandra, Alessandra, Alessandro, Alessia, Alex, Alexa, Alexander, Alexandra, Alexandria, Alexia, Alexis, Alfred, Alfredo, Ali, Alia, Aliah, Alice, Alicia, Alijah, Alina, Alisa, Alisha, Alison, Alissa, Alisson, Aliya, Aliyah, Aliza, Allan, Allen, Allison, Allyson, Alma, Alondra, Alonso, Alpha, Alston, Alter, Alvin, Alyson, Alyssa, Amad

# Exporting Data to CSV file

In [7]:
# Export only the selected columns to CSV.
# index=False keeps the row index out of the file; the encoding is stated explicitly
# so the output file's text encoding is unambiguous.
names.to_csv(
    "~/Pandas/Data Set/Baby_Names.csv",
    columns=["Gender", "Ethnicity", "Child's First Name"],
    index=False,
    encoding="utf-8",
)

# I/O from Excel file

In [8]:
import pandas as pd
# Read the single-worksheet workbook into a DataFrame.
# NOTE(review): this is an absolute home-directory path, while the export cells use
# "~/Pandas/Data Set/..." — consider a shared, configurable data directory.
df = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Single Worksheet.xlsx")
df.head()

Unnamed: 0,First Name,Last Name,City,Gender
0,Brandon,James,Miami,M
1,Sean,Hawkins,Denver,M
2,Judy,Day,Los Angeles,F
3,Ashley,Ruiz,San Francisco,F
4,Stephanie,Gomez,Portland,F


In [10]:
# This workbook has several worksheets. sheet_name defaults to 0, so without the
# argument only the first sheet is loaded.
df_1 = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx")
df_1.head()


Unnamed: 0,First Name,Last Name,City,Gender
0,Brandon,James,Miami,M
1,Sean,Hawkins,Denver,M
2,Judy,Day,Los Angeles,F
3,Ashley,Ruiz,San Francisco,F
4,Stephanie,Gomez,Portland,F


In [11]:
# A specific sheet can be selected by its zero-based position ...
df_1 = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx", sheet_name=1)

# ... or by its name. "Data 2" is the second sheet, so this loads the same sheet again
# and simply rebinds df_1.
df_1 = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx", sheet_name="Data 2")
df_1.head()

Unnamed: 0,First Name,Last Name,City,Gender
0,Parker,Power,Raleigh,F
1,Preston,Prescott,Philadelphia,F
2,Ronaldo,Donaldo,Bangor,M
3,Megan,Stiller,San Francisco,M
4,Bustin,Jieber,Austin,F


In [14]:
# Passing a list to sheet_name loads several sheets in one call. The result is a dict
# of DataFrames whose keys are exactly what was passed in: positions here ...
data_by_position = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx", sheet_name=[0, 1])

# ... and sheet names here. `data` keeps the name-keyed result so that a sheet can be
# looked up as data["Data 1"].
data = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx", sheet_name=["Data 1", "Data 2"])

# Show the loaded dict (previously this cell displayed the unrelated `df` frame).
# Because the result is a dict, DataFrame methods such as .head() are not available
# on it directly; select one sheet first, e.g. data["Data 1"].head().
data_by_position

{0:   First Name Last Name           City Gender
 0    Brandon     James          Miami      M
 1       Sean   Hawkins         Denver      M
 2       Judy       Day    Los Angeles      F
 3     Ashley      Ruiz  San Francisco      F
 4  Stephanie     Gomez       Portland      F,
 1:   First Name Last Name           City Gender
 0     Parker     Power        Raleigh      F
 1    Preston  Prescott   Philadelphia      F
 2    Ronaldo   Donaldo         Bangor      M
 3      Megan   Stiller  San Francisco      M
 4     Bustin    Jieber         Austin      F}

In [15]:
# `data` is a dict keyed by sheet name, so one sheet's DataFrame is fetched by its name.
data["Data 1"]

Unnamed: 0,First Name,Last Name,City,Gender
0,Brandon,James,Miami,M
1,Sean,Hawkins,Denver,M
2,Judy,Day,Los Angeles,F
3,Ashley,Ruiz,San Francisco,F
4,Stephanie,Gomez,Portland,F


In [17]:
# To load every sheet without typing each name, pass sheet_name=None.
# The result is a dict of DataFrames keyed by sheet name.

data = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Multiple Worksheets.xlsx", sheet_name=None)
data


{'Data 1':   First Name Last Name           City Gender
 0    Brandon     James          Miami      M
 1       Sean   Hawkins         Denver      M
 2       Judy       Day    Los Angeles      F
 3     Ashley      Ruiz  San Francisco      F
 4  Stephanie     Gomez       Portland      F,
 'Data 2':   First Name Last Name           City Gender
 0     Parker     Power        Raleigh      F
 1    Preston  Prescott   Philadelphia      F
 2    Ronaldo   Donaldo         Bangor      M
 3      Megan   Stiller  San Francisco      M
 4     Bustin    Jieber         Austin      F}

In [18]:
# Re-read the single-worksheet workbook into df (same call as the earlier
# single-worksheet cell) and preview it.
df = pd.read_excel("/home/picassa240/Pandas/Data Set/Data - Single Worksheet.xlsx")
df.head()

Unnamed: 0,First Name,Last Name,City,Gender
0,Brandon,James,Miami,M
1,Sean,Hawkins,Denver,M
2,Judy,Day,Los Angeles,F
3,Ashley,Ruiz,San Francisco,F
4,Stephanie,Gomez,Portland,F


In [19]:
# Split the baby-name records into two frames using a boolean mask on the Gender column.
girls = names.loc[names["Gender"].eq("FEMALE")]
boys = names.loc[names["Gender"].eq("MALE")]


In [21]:
# Only the last expression of a cell is rendered, so the output is `boys`;
# the bare `girls` line is evaluated but not displayed.
girls
boys

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
363,2013,MALE,HISPANIC,Jared,25,80
416,2013,MALE,HISPANIC,Jariel,25,80
547,2011,MALE,ASIAN AND PACIFIC ISLANDER,AARAV,15,51
548,2011,MALE,ASIAN AND PACIFIC ISLANDER,AARON,51,19
549,2011,MALE,ASIAN AND PACIFIC ISLANDER,ABDUL,20,46
...,...,...,...,...,...,...
49336,2018,MALE,BLACK NON HISPANIC,Myles,10,51
49337,2018,MALE,BLACK NON HISPANIC,Quincy,10,51
49338,2018,MALE,BLACK NON HISPANIC,Sage,10,51
49339,2018,MALE,BLACK NON HISPANIC,Sean,10,51


In [23]:
# Write both frames into one workbook, one worksheet each.
# ExcelWriter is used as a context manager: the file is saved and closed when the block
# exits, even if a to_excel call raises. The explicit excel_file.save() call it replaces
# is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter("~/Pandas/Data Set/Baby_Names.xlsx") as excel_file:
    # index=False keeps the row index out of the sheets.
    girls.to_excel(excel_file, sheet_name="Girls", index=False)
    # `columns` restricts the Boys sheet to the listed columns only.
    boys.to_excel(excel_file, sheet_name="Boys", index=False, columns=["Year of Birth","Gender","Ethnicity"])

  excel_file.save()
