In [2]:
import pandas as pd
import numpy as np

In [3]:
#A series is a 1d labelled array.
#Create series using pd.Series(data, index)

series = pd.Series(data = [1, 2, 3], index = ['A', 'B', 'C'])
print(series)
print(type(series))

series2 = pd.Series([1, 2, 3], ['A', 'B', 'C'])
print(series2)
print(type(series2))

A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>
A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>


In [4]:
#Now we can access array element using 2 different ways:
print(f"Acessing using series[0]: {series[0]}")
print(f"Accessing using series['A']: {series['A']}")

Acessing using series[0]: 1
Accessing using series['A']: 1


In [5]:
#Data in the series can be passed as either:
# A python list
# A numpy array
# A python dictionary

#Python list
series5 = pd.Series(data = [1, 2, 3], index = ['A', 'B', 'C'])
print(series5)

#Numpy array
print("From Numpy array")
series4 = pd.Series(data = np.array([1, 2, 3]), index = ['A', 'B', 'C'])
print(series4)

#Python Dictionary
print("From Python Dictionary")
dict_ = {'A' : 1, 'B' : 2, 'C' : 3}
print(type(dict_))

series3 = pd.Series(dict_)
print(series3)
print(type(series3))

A    1
B    2
C    3
dtype: int64
From Numpy array
A    1
B    2
C    3
dtype: int32
From Python Dictionary
<class 'dict'>
A    1
B    2
C    3
dtype: int64
<class 'pandas.core.series.Series'>


In [6]:
#If there are no label, series will default to [0, n) where n is the number of data values
series6 = pd.Series(data = [1, 2, 3])
print(series6)

0    1
1    2
2    3
dtype: int64


In [7]:
#We can perform operation on series (done base on index)

week_one = pd.Series(data=[100, 50, 300], index=['Bob', 'Sally', 'Jess'])
print(week_one)


week_two = pd.Series(data=[500, 30, 20], index=['Bob', 'Sally', 'Jess'])
print(week_two)

total = week_one + week_two
print(total)

Bob      100
Sally     50
Jess     300
dtype: int64
Bob      500
Sally     30
Jess      20
dtype: int64
Bob      600
Sally     80
Jess     320
dtype: int64


In [8]:
week_one = pd.Series(data=[100, 50, 300], index=['Bob', 'Peter', 'Jess'])
print(week_one)


week_two = pd.Series(data=[500, 30, 20], index=['Bob', 'Sally', 'Jess'])
print(week_two)

#Can perform operation if both don't have exactly the same column label but will result in NaN for the missing column 
total2 = week_one + week_two
print(total2)

Bob      100
Peter     50
Jess     300
dtype: int64
Bob      500
Sally     30
Jess      20
dtype: int64
Bob      600.0
Jess     320.0
Peter      NaN
Sally      NaN
dtype: float64


In [9]:
#Data Frame is a 2d labeled data structure with columns of potentially differnt type. Essentially a spreadsheet
# pd.DataFrame(data, index, columns) columns are an additional way to access individual series/column

df = pd.DataFrame(data = np.arange(0, 20).reshape(4, 5), index = ['A', 'B', 'C', 'D'], columns = ['col1', 'col2', 'col3', 'col4', 'col5'])
print(df)
print(type(df))

   col1  col2  col3  col4  col5
A     0     1     2     3     4
B     5     6     7     8     9
C    10    11    12    13    14
D    15    16    17    18    19
<class 'pandas.core.frame.DataFrame'>


In [10]:
print(df['col3'])
print(type(df['col3']))

A     2
B     7
C    12
D    17
Name: col3, dtype: int32
<class 'pandas.core.series.Series'>


In [11]:
#Same as before default for both series and column will be [0, n)
#P.S Data Frame is visually better without print statement in Jupyter Notebook
my_df = pd.DataFrame(data=np.arange(0,20).reshape(4,5))
my_df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [12]:
my_df = pd.DataFrame(data=np.arange(0,20).reshape(4,5), index=['A', 'B', 'C', 'D'], columns=['col1', 'col2', 'col3', 'col4', 'col5'])
#retrieve list of columns
my_df[['col2', 'col3']]

Unnamed: 0,col2,col3
A,1,2
B,6,7
C,11,12
D,16,17


In [13]:
#To access row information: specify the index location using .iloc, or the index name/label using .loc
my_df.iloc[0]

col1    0
col2    1
col3    2
col4    3
col5    4
Name: A, dtype: int32

In [14]:
my_df.loc['A']

col1    0
col2    1
col3    2
col4    3
col5    4
Name: A, dtype: int32

In [15]:
#look up bunch of rows and columns
my_df.loc['B':'D', 'col1':'col3']

Unnamed: 0,col1,col2,col3
B,5,6,7
C,10,11,12
D,15,16,17


In [16]:
#Since we built the Data Frame from Numpy arrays, we can use the same technique of selecting elements based off some condtions
my_df % 2 == 0

Unnamed: 0,col1,col2,col3,col4,col5
A,True,False,True,False,True
B,False,True,False,True,False
C,True,False,True,False,True
D,False,True,False,True,False


In [17]:
#Select the values that meet the conditions. The one that does not will be replaced with NaN
my_df[my_df % 2 == 0]

Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,,2.0,,4.0
B,,6.0,,8.0,
C,10.0,,12.0,,14.0
D,,16.0,,18.0,


In [18]:
#To fill in NaN values use fillna(value)
my_df[my_df & 2 == 0].fillna(value = 0)

Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,1.0,0.0,0.0,4.0
B,5.0,0.0,0.0,8.0,9.0
C,0.0,0.0,12.0,13.0,0.0
D,0.0,16.0,17.0,0.0,0.0


In [19]:
#filling all NaN values with whatever the mean of my_df's original col2 is: ﴾1+6+11+16﴿/4 = 8.5
my_df[my_df % 2 == 0].fillna(value=my_df['col2'].mean())


Unnamed: 0,col1,col2,col3,col4,col5
A,0.0,8.5,2.0,8.5,4.0
B,8.5,6.0,8.5,8.0,8.5
C,10.0,8.5,12.0,8.5,14.0
D,8.5,16.0,8.5,18.0,8.5


In [20]:
#To add new columns to Data Frame
my_df['newCol'] = [10, 20, 30, 40] #must have the same amount of rows
my_df

Unnamed: 0,col1,col2,col3,col4,col5,newCol
A,0,1,2,3,4,10
B,5,6,7,8,9,20
C,10,11,12,13,14,30
D,15,16,17,18,19,40


In [21]:
my_df['col1 + col2'] = my_df['col1'] + my_df['col2']
my_df

Unnamed: 0,col1,col2,col3,col4,col5,newCol,col1 + col2
A,0,1,2,3,4,10,1
B,5,6,7,8,9,20,11
C,10,11,12,13,14,30,21
D,15,16,17,18,19,40,31


In [22]:
# To drop columns
my_df.drop(columns = ['newCol']) #This is done in place so changes are not stored unless specified to a new value
testing = my_df.drop(columns = ['newCol'])
testing

Unnamed: 0,col1,col2,col3,col4,col5,col1 + col2
A,0,1,2,3,4,1
B,5,6,7,8,9,11
C,10,11,12,13,14,21
D,15,16,17,18,19,31


In [23]:
# Or we can use additional parameter inplace = True to save the changes to the same Data Frame
my_df.drop(columns = ['newCol', 'col1 + col2'], inplace = True)
my_df

Unnamed: 0,col1,col2,col3,col4,col5
A,0,1,2,3,4
B,5,6,7,8,9
C,10,11,12,13,14
D,15,16,17,18,19


In [5]:
#Read in csv dataset using pandas 
# pd.read_csv(filepath_or_buffer = 'Pokemon.csv') or just do pd.read_csv(filepath)
pokemon_df = pd.read_csv('Pokemon.csv')
pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [6]:
#To print a concise summary of our dataframe, we can use info()

pokemon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
#             800 non-null int64
Name          800 non-null object
Type 1        800 non-null object
Type 2        414 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [8]:
#There are many ways to take care of missing data, some of which are: remove the observations with null,
#remove the entire variable/column if not neccssary, develop other models to predict missing values,
#or replace missing values with most frequent

#Example of replacing with corresponding type 1 type
pokemon_df['Type 2'].fillna(pokemon_df['Type 1'], inplace = True)
pokemon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
#             800 non-null int64
Name          800 non-null object
Type 1        800 non-null object
Type 2        800 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [15]:
pokemon_df[pokemon_df['Legendary'] == True]
pokemon_df[pokemon_df['Legendary'] == True].shape

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
162,150,Mewtwo,Psychic,Psychic,680,106,110,90,154,90,130,1,True
163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [17]:
# Finding the pokemon with maxHP, we can do so by using idmax() which will tell us the index location of the max value

pokemon_df['HP'].idxmax()

261

In [18]:
pokemon_df.iloc[pokemon_df['HP'].idxmax()]

#                 242
Name          Blissey
Type 1         Normal
Type 2         Normal
Total             540
HP                255
Attack             10
Defense            10
Sp. Atk            75
Sp. Def           135
Speed              55
Generation          2
Legendary       False
Name: 261, dtype: object

In [20]:
#Another way to find the Pokemon with max HP in this example is using sort_values(by a value) and using head(1) to display top observation
pokemon_df.sort_values(by = 'HP', ascending = False).head(1)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
261,242,Blissey,Normal,Normal,540,255,10,10,75,135,55,2,False


In [21]:
#To find unique value in a column and their repetitions
pokemon_df['Type 1'].value_counts()

Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Electric     44
Rock         44
Dragon       32
Ghost        32
Ground       32
Dark         31
Poison       28
Fighting     27
Steel        27
Ice          24
Fairy        17
Flying        4
Name: Type 1, dtype: int64

In [25]:
pokemon_df.groupby(by = 'Type 1').mean()['HP']
#pokemon_df['HP'].mean()

Type 1
Bug         56.884058
Dark        66.806452
Dragon      83.312500
Electric    59.795455
Fairy       74.117647
Fighting    69.851852
Fire        69.903846
Flying      70.750000
Ghost       64.437500
Grass       67.271429
Ground      73.781250
Ice         72.000000
Normal      77.275510
Poison      67.250000
Psychic     70.631579
Rock        65.363636
Steel       65.222222
Water       72.062500
Name: HP, dtype: float64