In [18]:
# This notebook demonstrates the different ways of creating DataFrames with the pandas library,
# followed by a few basic exploratory data analysis (EDA) calls.
import pandas as pd
import numpy as np

In [19]:
# 1. From a dictionary of lists (a dictionary of Series works the same way).
# Each key becomes a column label and each list supplies that column's values,
# so every list must have the same length.
# NOTE: the variable is deliberately not named `dict` -- that would shadow the
# built-in `dict` type for every later cell run in this kernel.
student_records = {
    "name": ["Alice", "Bob", "Harry"],
    "marks": [90, 34, 56],
    "age": [34, 35, 13],
}

pd.DataFrame(student_records)

Unnamed: 0,name,marks,age
0,Alice,90,34
1,Bob,34,35
2,Harry,56,13


In [20]:
# 2. From a list of lists: every inner list is one row of the frame.
data = [
    ["Harry", 34],
    ["Sagar", 45],
    ["Dhwani", 100],
]
pd.DataFrame(data=data, columns=["Name", "Marks"])
# Passing columns=["Name", "Marks", "Age"] here would be incorrect: each row
# holds only two values, so either add a third value to every row or drop the
# extra column label to resolve the error.

Unnamed: 0,Name,Marks
0,Harry,34
1,Sagar,45
2,Dhwani,100


In [21]:
# 3. From a 2-D NumPy array: each row of the array becomes a row of the frame,
# and the column labels are supplied separately.
arr = np.array(
    [
        [1, 2],
        [5, 6],
    ]
)
df = pd.DataFrame(data=arr, columns=["A", "B"])
df

Unnamed: 0,A,B
0,1,2
1,5,6


In [28]:
# 4. From an Excel workbook.
# Why pandas instead of working in Excel directly: Excel cannot handle huge
# data sets easily, and it is not as good as the pandas library for integrating
# the data with APIs.
df = pd.read_excel(io="data.xlsx")
df

Unnamed: 0,Name,School,Marks
0,Shubham,SPS,34
1,Joshua,UPS,45
2,Jack,WPS,45
3,Alex,UIS,67
4,Jasmine,RTS,45
5,Rohan,RRY,88
6,Rishi,Luxury,77
7,Akash,BB,90


In [29]:
# 5. From a CSV file.
# A CSV holds just the data, unlike an Excel file, which can also carry other
# things such as filters.
df = pd.read_csv(filepath_or_buffer='data_csv.csv')
df

Unnamed: 0,Name,School,Marks
0,Shubham,SPS,34
1,Joshua,UPS,45
2,Jack,WPS,45
3,Alex,UIS,67
4,Jasmine,RTS,45
5,Rohan,RRY,88
6,Rishi,Luxury,77
7,Akash,BB,90


In [30]:
# 6. From a JSON file.
df = pd.read_json(path_or_buf='data.json')
df

Unnamed: 0,name,lang
0,Sagar,Python
1,Dhwani,Java


In [32]:
# 7. From a URL: read_csv is given a remote address here instead of a local
# file path, and the CSV at that address is loaded into the frame.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv'
)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [40]:
# EDA - Exploratory Data Analysis - is an essential step in a data analysis project.
# All the methods run below are for EDA. Whenever you get some data, first do some EDA on it.
# head(10) displays the first 10 rows of the frame.
df.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [41]:
# Display the last rows of the frame (five rows when no count is passed).
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [42]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [43]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the frame.
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [44]:
# Column labels of the frame.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [45]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(244, 7)