# Introduction to pandas

Pandas is a Python library used for working with data sets. It has functions for analyzing, cleaning, exploring, and manipulating data.

In [59]:
import pandas as pd
import numpy as np

In [60]:
print(pd.__version__)
print(np.__version__)

1.1.3
1.19.2


In [61]:
data = {
    'roll_no' : [1,2,3,4,5],
    'ppr_id' : [1,3,5,7,8],
    'marks' : [34,65,23,25,87]
}

In [62]:
data

{'roll_no': [1, 2, 3, 4, 5],
 'ppr_id': [1, 3, 5, 7, 8],
 'marks': [34, 65, 23, 25, 87]}

In [63]:
df =pd.DataFrame(data)
df

Unnamed: 0,roll_no,ppr_id,marks
0,1,1,34
1,2,3,65
2,3,5,23
3,4,7,25
4,5,8,87


# Data Structure
The two main data structures in Pandas are Series for 1-D data, DataFrame for 2-D data and pannel for 3-D Data.

# Series
It is a one-dimensional array holding data of any type.

In [64]:
a = [1, 7, 3]
myvar = pd.Series(a)
print(myvar)

0    1
1    7
2    3
dtype: int64


In [65]:
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)

x    1
y    7
z    3
dtype: int64


In [66]:
print(myvar["y"])

7


In [67]:
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)

day1    420
day2    380
day3    390
dtype: int64


# DataFrame
A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [68]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}
df = pd.DataFrame(data)
print(df) 

   calories  duration
0       420        50
1       380        40
2       390        45


# Pannel
A panel is a 3D container of data. 
1. From ndarrays
2. From dict of DataFrames

In [69]:
p = pd.Panel()
print(p)

<pandas.__getattr__.<locals>.Panel object at 0x104B3820>


  p = pd.Panel()


# Creating Pandas dataFrame

Two different methods to create Pandas DataFrame:
1. By typing the values in Python itself to create the DataFrame
2. By importing the values from a file (such as an Excel file), and then creating the DataFrame in Python based on the values imported

In [70]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}
df = pd.DataFrame(data)

print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


In [74]:
data = pd.read_csv("IMDB-Movie-Data.csv")
data

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0


In [75]:
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [78]:
data.tail()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,,45.0
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.54,46.0
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.01,50.0
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,,22.0
999,1000,Nine Lives,"Comedy,Family,Fantasy",A stuffy businessman finds himself trapped ins...,Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,12435,19.64,11.0


In [79]:
data.shape

(1000, 12)

In [80]:
data.size

12000

In [81]:
data.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


In [85]:
data.dtypes

Rank                    int64
Title                  object
Genre                  object
Description            object
Director               object
Actors                 object
Year                    int64
Runtime (Minutes)       int64
Rating                float64
Votes                   int64
Revenue (Millions)    float64
Metascore             float64
dtype: object

In [86]:
data.ndim

2

In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 74.3+ KB


# From a API

In [118]:
import requests
import json

In [122]:
url = 'https://api.covid19api.com/summary'
r = requests.get(url)
r

<Response [200]>

In [133]:
json=r.json()
json

{'Message': 'Caching in progress',
 'Global': {'NewConfirmed': 0,
  'TotalConfirmed': 0,
  'NewDeaths': 0,
  'TotalDeaths': 0,
  'NewRecovered': 0,
  'TotalRecovered': 0,
  'Date': '0001-01-01T00:00:00Z'},
 'Date': '0001-01-01T00:00:00Z'}

In [135]:
json.keys()

dict_keys(['Message', 'Global', 'Date'])

In [139]:
json['Global']

{'NewConfirmed': 0,
 'TotalConfirmed': 0,
 'NewDeaths': 0,
 'TotalDeaths': 0,
 'NewRecovered': 0,
 'TotalRecovered': 0,
 'Date': '0001-01-01T00:00:00Z'}

In [142]:
json['Date']

'0001-01-01T00:00:00Z'

In [145]:
type(json['Global'])

dict

In [146]:
type(json['Date'])

str

In [147]:
type(json['Message'])

str

# Dataframe Object

In [156]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [164]:
population = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
states = pd.DataFrame({'population': population,'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [165]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [166]:
states.columns

Index(['population', 'area'], dtype='object')

In [167]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [175]:
states.describe()

Unnamed: 0,population,area
count,5.0,5.0
mean,23373370.0,316246.6
std,9640386.0,242437.411951
min,12882140.0,141297.0
25%,19552860.0,149995.0
50%,19651130.0,170312.0
75%,26448190.0,423967.0
max,38332520.0,695662.0


In [181]:
states.append(data)

Unnamed: 0,0,1,2,3,area,population
California,,,,,423967.0,38332521.0
Texas,,,,,695662.0,26448193.0
New York,,,,,141297.0,19651127.0
Florida,,,,,170312.0,19552860.0
Illinois,,,,,149995.0,12882135.0
0,85.0,60.0,90.0,95.0,,
1,73.0,80.0,64.0,87.0,,
2,98.0,58.0,74.0,92.0,,


In [182]:
states.isnull()

Unnamed: 0,population,area
California,False,False
Texas,False,False
New York,False,False
Florida,False,False
Illinois,False,False
