In [4]:
import numpy as np
import pandas as pd


In [5]:
# Build a Series from a list and label every value in the same call.
x = pd.Series([10, 20, 30, 40, 50], index=list("ABCDE"))
x

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [6]:
# A Series with an explicit label for every value.
data = [450, 650, 870]
salespeople = ['Don', 'Mike', 'Edwin']
Sales = pd.Series(data, index=salespeople)
# Accessing the index
print(Sales.index)

Index(['Don', 'Mike', 'Edwin'], dtype='object')


In [7]:
# Print the whole Series: index labels on the left, values on the right.
print(Sales)

Don      450
Mike     650
Edwin    870
dtype: int64


In [8]:
# Accessing a value by position. Sales is labelled with strings, so use .iloc
# to make the positional lookup explicit; plain Sales[1] only works through a
# deprecated integer-key fallback.
Sales.iloc[1]

650

In [9]:
# Element-wise comparison: one boolean per salesperson.
Sales.gt(800)

Don      False
Mike     False
Edwin     True
dtype: bool

In [10]:
# Accessing a value using its index label
print(Sales)
"Mikes sales {}".format(Sales.loc['Mike'])

Don      450
Mike     650
Edwin    870
dtype: int64


'Mikes sales 650'

In [11]:
# Converting a Series to a dictionary: index labels become the keys.
sales_dict = Sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [13]:
# Converting a dictionary to a pandas Series: keys become the index labels.
sales_ser = pd.Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

In [15]:
# Conform Sales to a longer list of labels. Labels without a value
# (Sally, Lucy) become NaN, which is why the result is float64, not int64.
new_sales = Sales.reindex(['Don', 'Mike', 'Sally', 'Edwin', 'Lucy'])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [16]:
# Checking whether a single entry is NaN - NumPy works on the scalar value.
np.isnan(new_sales['Sally'])

True

In [17]:
# Checking for null values with pandas: returns one boolean per entry.
pd.isna(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [18]:
# Keep only the entries that actually have a value.
new_sales.dropna()

Don      450.0
Mike     650.0
Edwin    870.0
dtype: float64

In [19]:
#Pandas DataFrames
#DataFrames are two-dimensional, size-mutable tables with labeled rows and columns

In [32]:
# Column-oriented input: each key becomes a column, each list its values.
new_dict = dict(
    Name=['Tom', 'Jane', 'Steve', 'Lucy'],
    Sales=[250, 500, 350, 400],
    Date=[2022, 2020, 2021, 2022],
)
df = pd.DataFrame(data=new_dict)
df

Unnamed: 0,Name,Sales,Date
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [33]:
# Same data, but with custom row labels passed through index= instead of
# the default 0..n-1 labels.
df_index = pd.DataFrame(new_dict, index=['rank1','rank2','rank3','rank4'])
df_index

Unnamed: 0,Name,Sales,Date
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [34]:
# Give the row index a name; it is shown as a header above the row labels.
df = df.rename_axis('Rank')
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [35]:
# reset_index moves the 'Rank' index back into an ordinary column and
# returns a new DataFrame; df itself is left unchanged.
df_noindex= df.reset_index()
df_noindex
# To keep the result, assign it to a variable (as done with df_noindex above).

Unnamed: 0,Rank,Name,Sales,Date
0,0,Tom,250,2022
1,1,Jane,500,2020
2,2,Steve,350,2021
3,3,Lucy,400,2022


In [36]:
# df still has 'Rank' as its index: reset_index above did not modify it.
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [37]:
# Column labels of the frame.
df.columns

Index(['Name', 'Sales', 'Date'], dtype='object')

In [38]:
# Row index of the frame, including the index name.
df.index

RangeIndex(start=0, stop=4, step=1, name='Rank')

In [39]:
# The underlying data can be extracted as a NumPy array.
type(df.to_numpy())

numpy.ndarray

In [40]:
# Mixed column types (text and integers) give an array of dtype object.
df.to_numpy()

array([['Tom', 250, 2022],
       ['Jane', 500, 2020],
       ['Steve', 350, 2021],
       ['Lucy', 400, 2022]], dtype=object)

In [50]:
# Same sales data as before, plus a 'Rank' column to use as an index later.
new_dict_v2 = dict(
    Name=['Tom', 'Jane', 'Steve', 'Lucy'],
    Sales=[250, 500, 350, 400],
    Date=[2022, 2020, 2021, 2022],
    Rank=['rank1', 'rank2', 'rank3', 'rank4'],
)
df2 = pd.DataFrame(data=new_dict_v2)
df2

Unnamed: 0,Name,Sales,Date,Rank
0,Tom,250,2022,rank1
1,Jane,500,2020,rank2
2,Steve,350,2021,rank3
3,Lucy,400,2022,rank4


In [51]:
# set_index returns a new DataFrame indexed by 'Rank'. The result is not
# assigned here, so df2 itself is not modified.
df2.set_index("Rank")

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [52]:
# Multi-level (hierarchical) index: rows are labelled by Rank and then Name,
# and can be sorted by either level.
# The frame is rebuilt from new_dict_v2 and assigned, instead of using
# inplace=True, so the cell can be re-run: with inplace=True a second run
# raises KeyError because 'Rank' and 'Name' are no longer columns of df2.
df2 = pd.DataFrame(new_dict_v2).set_index(['Rank', 'Name'])

In [53]:
# df2 now has a two-level index: Rank, then Name.
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [54]:
# Move only the 'Name' level back into a regular column; 'Rank' stays as the
# index. The result is not assigned, so df2 keeps both index levels.
df2.reset_index(level=['Name'])

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [60]:
# Sort by the 'Name' index level in descending order; the 'Rank' level
# (ascending) is the secondary sort key.
df2.sort_index(level=["Name", "Rank"], ascending=[False,True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank3,Steve,350,2021
rank4,Lucy,400,2022
rank2,Jane,500,2020


In [61]:
#Subsetting a DataFrame: it is important to be able to access columns, rows and single elements of your DataFrame easily
#basic method ===== square brackets
#advanced methods ===== loc and iloc

In [62]:
# Single brackets with one column label select that column as a Series.
df["Name"]

Rank
0      Tom
1     Jane
2    Steve
3     Lucy
Name: Name, dtype: object

In [63]:
# Confirm the type returned by single-bracket selection.
type(df['Name'])

pandas.core.series.Series

In [66]:
# Double brackets (a list of labels) select a DataFrame, even for one column.
df[['Name']]

Unnamed: 0_level_0,Name
Rank,Unnamed: 1_level_1
0,Tom
1,Jane
2,Steve
3,Lucy


In [67]:
# Confirm the type returned by list-of-labels selection.
type(df[['Name']])

pandas.core.frame.DataFrame

In [75]:
# Accessing multiple columns: pass a list of column labels.
df[["Name", 'Sales']]

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Tom,250
1,Jane,500
2,Steve,350
3,Lucy,400


In [76]:
# Rows at positions 1 and 2: the stop position (3) is excluded.
df.iloc[1:3]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Jane,500,2020
2,Steve,350,2021


In [78]:
# Rows where both conditions hold: Sales above 300 and Date after 2020.
df.loc[df['Sales'].gt(300) & df['Date'].gt(2020)]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Steve,350,2021
3,Lucy,400,2022


In [80]:
# Select the Date column as a Series.
df['Date']

Rank
0    2022
1    2020
2    2021
3    2022
Name: Date, dtype: int64

In [81]:
# Boolean mask: True where Date is one of the listed years.
df['Date'].isin([2020, 2022])

Rank
0     True
1     True
2    False
3     True
Name: Date, dtype: bool

In [83]:
# Use the boolean mask to keep only the rows whose Date is 2020 or 2022.
df[df['Date'].isin([2020,2022])]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tom,250,2022
1,Jane,500,2020
3,Lucy,400,2022


In [86]:
# Show the full frame again for reference before the loc/iloc examples.
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tom,250,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


In [93]:
#loc[row_label, column_label]
#iloc[row_position, column_position]

In [94]:
# Label-based selection: rows from label 1 onwards, columns Name and Sales.
df.loc[1:, ['Name', 'Sales']]

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Jane,500
2,Steve,350
3,Lucy,400


In [95]:
# Without [...] this evaluates to the indexer object itself, not to any rows.
df.loc

<pandas.core.indexing._LocIndexer at 0x1f7ff49bc90>

In [102]:
#summary statistics and groupby

In [103]:
# Load the gapminder data. This rebinds df, which held the small sales frame
# in the cells above.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like an index saved into the CSV - consider index_col=0; confirm against the file.
df = pd.read_csv('gapminder.csv')

In [104]:
# Display the loaded frame (pandas truncates the middle rows).
df

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
0,11,Afghanistan,2007,31889923,Asia,43.828,974.580338
1,23,Albania,2007,3600523,Europe,76.423,5937.029526
2,35,Algeria,2007,33333216,Africa,72.301,6223.367465
3,47,Angola,2007,12420476,Africa,42.731,4797.231267
4,59,Argentina,2007,40301927,Americas,75.320,12779.379640
...,...,...,...,...,...,...,...
137,1655,Vietnam,2007,85262356,Asia,74.249,2441.576404
138,1667,West Bank and Gaza,2007,4018332,Asia,73.422,3025.349798
139,1679,"Yemen, Rep.",2007,22211743,Asia,62.698,2280.769906
140,1691,Zambia,2007,11746035,Africa,42.384,1271.211593


In [101]:
# Largest value in the population column.
df['population'].max()

1318683096

In [113]:
# Group the rows by continent ('cont') and sum gdp_cap and population within each group.
# NOTE(review): a sum of per-capita GDP values across countries may not be a
# meaningful quantity - confirm that sum (rather than e.g. mean) is intended for gdp_cap.
df.groupby('cont')[['gdp_cap', 'population']].sum()

Unnamed: 0_level_0,gdp_cap,population
cont,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,160629.695446,929539692
Americas,275075.790634,898871184
Asia,411609.886714,3811953827
Europe,751634.449078,586098529
Oceania,59620.37655,24549947


In [117]:
# Number of rows per continent, ordered by count.
df['cont'].value_counts(sort=True)   # normalize=True would return proportions instead of counts

Africa      52
Asia        33
Europe      30
Americas    25
Oceania      2
Name: cont, dtype: int64

In [175]:
# Assignment
# Pre-defined lists
country = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
drives_right = [True, False, False, False, True, True, True]
cars_per_cap = [809, 731, 588, 18, 200, 70, 45]
# row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']   (for step 4)

# Step 1:  Create dictionary my_dict with three key:value pairs
# Step 2:  Build a DataFrame cars from my_dict
# Step 3:  Print cars
# Step 4:  Specify the row labels of cars
# Step 5:  Print cars again
# Step 6:  Print out country column as Pandas Series
# Step 7:  Print out country column as Pandas DataFrame
# Step 8:  Print out DataFrame with country and drives_right columns
# Step 9:  Print out first 3 observations
# Step 10: Print out fourth, fifth and sixth observation
# Step 11: Print out observation for Japan
# Step 12: Print out observations for Australia and Egypt
# Step 13: Print out drives_right value of Morocco

# Steps 1-3. The dictionary is named my_dict, as step 1 asks, rather than
# reusing new_dict, which still holds the sales data from the earlier cells.
my_dict = {
    'country': country,
    'drives_right': drives_right,
    'cars_per_cap': cars_per_cap,
}
cars = pd.DataFrame(my_dict)
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45


In [176]:
# Name the index "row_labels".
# NOTE(review): step 4 of the assignment asks for the row labels
# ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG'] to be applied; this line only
# names the existing 0-6 index. Applying the labels would also mean updating
# the label-based lookups in later cells (e.g. cars.loc[2]) - confirm intent.
cars.index.name="row_labels"

In [177]:
# Step 6: the country column as a Series.
cars['country']

row_labels
0    United States
1        Australia
2            Japan
3            India
4           Russia
5          Morocco
6            Egypt
Name: country, dtype: object

In [178]:
# Attribute access on a column gives a Series.
type(cars.country)



pandas.core.series.Series

In [179]:
# Step 7: selecting with a list of labels gives a DataFrame.
type(cars[['country']])

pandas.core.frame.DataFrame

In [181]:
# Step 8: DataFrame with the country and drives_right columns.
cars[['country', 'drives_right']]

Unnamed: 0_level_0,country,drives_right
row_labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,United States,True
1,Australia,False
2,Japan,False
3,India,False
4,Russia,True
5,Morocco,True
6,Egypt,True


In [182]:
# Step 9: first three observations (positions 0, 1 and 2).
cars.iloc[:3]

Unnamed: 0_level_0,country,drives_right,cars_per_cap
row_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,United States,True,809
1,Australia,False,731
2,Japan,False,588


In [183]:
# Step 10: fourth, fifth and sixth observations (positions 3, 4 and 5).
cars.iloc[3:6]

Unnamed: 0_level_0,country,drives_right,cars_per_cap
row_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,India,False,18
4,Russia,True,200
5,Morocco,True,70


In [184]:
# Step 11: Japan is the row with index label 2, so look it up by that label.
cars.loc[2]

country         Japan
drives_right    False
cars_per_cap      588
Name: 2, dtype: object

In [185]:
# Step 12: keep the rows whose country is Australia or Egypt.
cars[cars.country.isin(['Australia', 'Egypt'])]

Unnamed: 0_level_0,country,drives_right,cars_per_cap
row_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Australia,False,731
6,Egypt,True,45


In [186]:
# Step 13: drives_right for Morocco. The column is given as a list, so the
# result is a one-cell DataFrame rather than a bare value.
cars.loc[cars.country== 'Morocco', ['drives_right']]

Unnamed: 0_level_0,drives_right
row_labels,Unnamed: 1_level_1
5,True


In [187]:
# First thing to do with new data: check its size as (rows, columns).
cars.shape

(7, 3)

In [188]:
# Size of the cars frame. The recorded (7, 3) result belongs to cars; on a
# fresh run df is bound to the gapminder data, so reference cars explicitly.
cars.shape

(7, 3)

In [190]:
# To see if there are null values: info() lists the non-null count per column.
# Uses cars (whose columns the recorded output lists) rather than df, which
# is bound to the gapminder data on a fresh run.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       7 non-null      object
 1   drives_right  7 non-null      bool  
 2   cars_per_cap  7 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 247.0+ bytes


In [192]:
# Summary statistics for the numeric columns. Uses cars (the recorded output
# is for cars_per_cap) rather than df, which is the gapminder data on a fresh run.
cars.describe()

Unnamed: 0,cars_per_cap
count,7.0
mean,351.571429
std,345.595552
min,18.0
25%,57.5
50%,200.0
75%,659.5
max,809.0


In [195]:
# Several Series can be combined into one DataFrame by passing them in a dict:
# each key becomes a column and the rows are aligned on the Series' labels.
# A label missing from one Series (Q4 in east) becomes NaN in that column.
east = pd.Series({'Q1': 1000, 'Q2': 1200, 'Q3': 3400})
west = pd.Series({'Q1': 1100, 'Q2': 1300, 'Q3': 2400, 'Q4': 3500})
df_region = pd.DataFrame(data={'East': east, 'West': west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


In [196]:
# Once we have a DataFrame, more columns can be attached to it; each list
# supplies one value per existing row.
df_region = df_region.assign(
    North=[2000, 3000, 2500, 4000],
    South=[1500, 2000, 1500, 4000],
)
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [200]:
# If we made a mistake and need a different index, we can add the desired
# labels as a new column and then set that column as the index.

years = ['2016','2017','2018','2019']
df_region['years'] = years
df_region


Unnamed: 0_level_0,East,West,North,South,years
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,1000.0,1100,2000,1500,2016
2017,1200.0,1300,3000,2000,2017
2018,3400.0,2400,2500,1500,2018
2019,,3500,4000,4000,2019


In [201]:
# We can use set_index to make an existing column the index of the DataFrame.
# NOTE(review): this rebinds df_region and moves 'years' out of the columns,
# so the cell is not idempotent - running it again without first re-running
# the cell that adds 'years' should raise KeyError.
df_region = df_region.set_index('years')
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [202]:
# reindex conforms the frame to a new list of index labels: labels that
# already exist keep their rows ('2016' is dropped because it is not listed),
# and new labels ('2020', '2021') are added as rows filled with NaN.
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,1200.0,1300.0,3000.0,2000.0
2018,3400.0,2400.0,2500.0,1500.0
2019,,3500.0,4000.0,4000.0
2020,,,,
2021,,,,


In [203]:
# reindex can also be used on columns: it keeps and reorders the listed
# columns ('West' is dropped because it is not listed), and a name that was
# not present before ('New') is added as a column of NaN.
re_indexed = new_df.reindex(columns=['North','East','South','New'])
re_indexed



Unnamed: 0_level_0,North,East,South,New
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,3000.0,1200.0,2000.0,
2018,2500.0,3400.0,1500.0,
2019,4000.0,,4000.0,
2020,,,,
2021,,,,
