# Pandas Library

### Pandas is a tool for data processing which helps in data analysis
### Pandas provides functions and methods to efficiently manipulate large datasets

## Data Structures in Pandas
### Series(One-Dimensional Array)
### DataFrame(Two-Dimensional Array)

In [1]:
# Series: is a one-dimensional array with labels. It can contain any data type including integers,
#         strings,floats, python objects and more

# Example,
#       Index: 1,2,3,4,5
#       Data: 'A','B','C','D','E'

In [2]:
# DataFrame: is a two-dimensional data structure with labels. We can use labels to locate data

In [3]:
# Pandas Tutorial

import pandas as pd

In [4]:
# check pandas version

print(pd.__version__)

1.5.3


In [5]:
# Series create, manipulate, query, delete

# Creating a Series from a list

my_array=[0,1,2,3,4]

Series1=pd.Series(my_array)
Series1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [6]:
# Create second Series, where order is [1,2,3,4,5]

order=[1,2,3,4,5]

In [7]:
Series2=pd.Series(my_array, index=order)
Series2

1    0
2    1
3    2
4    3
5    4
dtype: int64

In [8]:
# Where index is 1,2,3,4,5 and Data is 0,1,2,3,4

In [9]:
# Create Series with data as float, where index is ('a','b','c','d','e')


import numpy as np

In [10]:
# create a random Ndarray
n=np.random.randn(5)
index=['a','b','c','d','e']
Series2=pd.Series(n, index=index)

Series2

a   -1.224483
b    0.443447
c   -1.534844
d    0.300227
e    0.971604
dtype: float64

In [11]:
# Create Series from Dictionary

d={'a':1, 'b':2, 'c':3, 'd':4, 'e':5}      # d is dictionary
Series3=pd.Series(d)

Series3

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [12]:
# Create Series from Dictionary by Using Chelsea players and their jersey numbers

d={'Cole Palmer':20, 'Enzo Fernandes':8, 'Ben Chilwell':21, 'Nicholas Jackson':15, 'Moesis Caiseido':25}
Series4=pd.Series(d)

Series4

Cole Palmer         20
Enzo Fernandes       8
Ben Chilwell        21
Nicholas Jackson    15
Moesis Caiseido     25
dtype: int64

In [13]:
d={'Raheem Sterling':7, 'Christopher Nkunku':18, 'Wesley Fofana':24, 'Dewsbury Hall':22, 'Axel Disasi':2}

Series5=pd.Series(d)

Series5


Raheem Sterling        7
Christopher Nkunku    18
Wesley Fofana         24
Dewsbury Hall         22
Axel Disasi            2
dtype: int64

In [14]:
# You can modify the index of series
# Modify or change Series5 to Series4 and assign S4 with new index which is ['Noni Madueke', 'Levi Colwil', Mukailo Mudryk, 
# Amando Broja, 'Reece James']

Series4.index=['Noni Madueke', 'Levi Colwil', 'Mukailo Mudryk','Amando Broja','Reece James']

Series4

Noni Madueke      20
Levi Colwil        8
Mukailo Mudryk    21
Amando Broja      15
Reece James       25
dtype: int64

In [15]:
# Basic Slicing in Pandas

# Slice or print the first 3 elements of series4 index

Series4[:3]

Noni Madueke      20
Levi Colwil        8
Mukailo Mudryk    21
dtype: int64

In [16]:
# Slice or print the last 3 elements of series4 index

Series4[2:]

Mukailo Mudryk    21
Amando Broja      15
Reece James       25
dtype: int64

In [17]:
# Slice or print the first 3 elements of series4 index

Series4[:-2]

Noni Madueke      20
Levi Colwil        8
Mukailo Mudryk    21
dtype: int64

In [18]:
# Slice or print the last 2 elements of series4 index

Series4[-2:]

Amando Broja    15
Reece James     25
dtype: int64

In [19]:
# Appending Series in Pandas

# Append Series4 with Series5

Series6=Series4.append(Series5)

Series6

  Series6=Series4.append(Series5)


Noni Madueke          20
Levi Colwil            8
Mukailo Mudryk        21
Amando Broja          15
Reece James           25
Raheem Sterling        7
Christopher Nkunku    18
Wesley Fofana         24
Dewsbury Hall         22
Axel Disasi            2
dtype: int64

In [20]:
# We can also use concatenate to combine Series4 and Series5

Series6=pd.concat([Series4, Series5])

Series6

Noni Madueke          20
Levi Colwil            8
Mukailo Mudryk        21
Amando Broja          15
Reece James           25
Raheem Sterling        7
Christopher Nkunku    18
Wesley Fofana         24
Dewsbury Hall         22
Axel Disasi            2
dtype: int64

In [21]:
# Drop or Delete an element in Series

# Drop element 'Levi Colwil' from the index series

Series6.drop('Axel Disasi')


Noni Madueke          20
Levi Colwil            8
Mukailo Mudryk        21
Amando Broja          15
Reece James           25
Raheem Sterling        7
Christopher Nkunku    18
Wesley Fofana         24
Dewsbury Hall         22
dtype: int64

In [22]:
# As observed above, 'Axel Disasi has been droped or removed from the Series6'

## Series Operations (append,add,substract,multiply,divide) in Pandas

In [23]:
# Assigning data to your Series
# Assign [14,20,10,9,19] to Series7

import pandas as pd

Westham = ['Mohammed Kudus', 'Jarrod Bowen', 'Lucas Paqueta', 'Michail Antonio', 'Edilson Alvares']
my_array = [14, 20, 10, 9, 19]

# Create Series7 from My_Array with Westham as index
Series7 = pd.Series(my_array, index=Westham)

# Display the result
print(Series7)


Mohammed Kudus     14
Jarrod Bowen       20
Lucas Paqueta      10
Michail Antonio     9
Edilson Alvares    19
dtype: int64


In [24]:
# OR

import pandas as pd

Westham = ['Mohammed Kudus', 'Jarrod Bowen', 'Lucas Paqueta', 'Michail Antonio', 'Edilson Alvares']
my_array = [14, 20, 10, 9, 19]

# Create Series7 from My_Array with Westham as index
Series7 = pd.Series(my_array, index=Westham)

# Display the result
Series7

Mohammed Kudus     14
Jarrod Bowen       20
Lucas Paqueta      10
Michail Antonio     9
Edilson Alvares    19
dtype: int64

In [25]:
# Assign [6,7,8,9,5]

array1=[0,1,2,3,4,5,7]
array2=[6,7,8,9,5]

Series8=pd.Series(array2)

Series8

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [26]:
# Assign [0,1,2,3,4,5,7] to Series9

Series9=pd.Series(array1)

Series9

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [27]:
# Adding Series

# Add Series 6 to Series 7

Series6.add(Series7)

Amando Broja         NaN
Axel Disasi          NaN
Christopher Nkunku   NaN
Dewsbury Hall        NaN
Edilson Alvares      NaN
Jarrod Bowen         NaN
Levi Colwil          NaN
Lucas Paqueta        NaN
Michail Antonio      NaN
Mohammed Kudus       NaN
Mukailo Mudryk       NaN
Noni Madueke         NaN
Raheem Sterling      NaN
Reece James          NaN
Wesley Fofana        NaN
dtype: float64

In [28]:
# Add Series8 to Series9

Series8.add(Series9)

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [29]:
# Substracting Series

# Substract Series9 from Series8(ie Series8-Series9abs)

Series8.sub(Series9)


0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [30]:
# Multiplying Series

# Multiply Series8 by Series9

Series8.mul(Series9)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [31]:
# Divide Series8 by Series9

Series8.div(Series9)

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

In [32]:
# Find the median of Series9

Series9.median()

3.0

In [33]:
# Find the mean of Series9

Series9.mean()

3.142857142857143

In [34]:
# Find the max of Series9

Series9.max()

7

In [35]:
# Find the min of Series9

Series9.min()

0

In [36]:
# Find the mode of Series9

Series9.mode()

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [37]:
# Print median, mean, max, min of Series9 simultaneosly

print(Series9.median())
print(Series9.mean())
print(Series9.max())
print(Series9.min())

3.0
3.142857142857143
7
0


In [38]:
# OR 
# Print median, mean, max, min, and mode of Series9 simultaneosly

print('median', Series9.median())
print('mean', Series9.mean())
print('max', Series9.max())
print('min', Series9.min())
print('mode', Series9.mode())

median 3.0
mean 3.142857142857143
max 7
min 0
mode 0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64


## Creating a DataFrame in Pandas

In [39]:
# Creating Dates

import pandas as pd

# Define time sequence as index

dates=pd.date_range('today', periods=6)

dates

DatetimeIndex(['2024-08-09 01:29:56.851112', '2024-08-10 01:29:56.851112',
               '2024-08-11 01:29:56.851112', '2024-08-12 01:29:56.851112',
               '2024-08-13 01:29:56.851112', '2024-08-14 01:29:56.851112'],
              dtype='datetime64[ns]', freq='D')

In [40]:
# DataFrame for dates and time for 2024-08-07 13:10 has been printed

In [41]:
# Add columns to the DataFrame


import pandas as pd
import numpy as np

# Generate a range of dates starting from today
dates = pd.date_range('today', periods=6)

# Generate a 6x4 array of random numbers
numpy_array = np.random.randn(6, 4)

# Define column names
columns = ['A', 'B', 'C', 'D']

# Create the DataFrame
df1 = pd.DataFrame(numpy_array, index=dates, columns=columns)

# Display the DataFrame
print(df1)


                                   A         B         C         D
2024-08-09 01:29:56.894135  1.490819  1.171902 -0.776356 -1.051720
2024-08-10 01:29:56.894135  1.015243 -0.211851  0.278530  2.200282
2024-08-11 01:29:56.894135 -0.802009 -0.487988  0.062470 -1.141457
2024-08-12 01:29:56.894135 -1.072665 -0.288889 -0.031201  0.524881
2024-08-13 01:29:56.894135  0.113252 -0.075220 -0.189184  0.767861
2024-08-14 01:29:56.894135 -1.685910 -1.033641 -1.435580  0.109049


In [42]:
# To make it physically appealing  

import pandas as pd
import numpy as np

# Generate a range of dates starting from today
dates = pd.date_range('today', periods=6)

# Generate a 6x4 array of random numbers
numpy_array = np.random.randn(6, 4)

# Define column names
columns = ['A', 'B', 'C', 'D']

# Create the DataFrame
df1 = pd.DataFrame(numpy_array, index=dates, columns=columns)

# Display the DataFrame
df1


Unnamed: 0,A,B,C,D
2024-08-09 01:29:56.915677,-0.860147,1.634124,-0.666685,0.69121
2024-08-10 01:29:56.915677,1.424836,-0.035901,0.593676,0.198952
2024-08-11 01:29:56.915677,1.555052,-1.57213,-1.986922,0.682734
2024-08-12 01:29:56.915677,0.073716,0.549217,-0.698802,-0.557588
2024-08-13 01:29:56.915677,-0.553364,0.38959,1.470599,-1.084005
2024-08-14 01:29:56.915677,-0.188265,1.049921,0.880794,1.15616


In [43]:
# Create a DataFrame for dates as index, and its corresponding  temperature, humidity, rainfall, snow
import pandas as pd
import numpy as np

dates=pd.date_range('today', periods=6)

# Create a numpy array
numpy_array=np.random.randn(6,4)

# Define column names
columns=['Temperature','Humidity','Rainfall','Snow']

# Create a DataFrame
df2=pd.DataFrame(numpy_array, index=dates, columns=columns) # where df2 is dataframe2

# print
df2

Unnamed: 0,Temperature,Humidity,Rainfall,Snow
2024-08-09 01:29:56.948679,1.734407,-0.930588,-2.034847,0.808431
2024-08-10 01:29:56.948679,-1.593212,1.345937,-0.225759,-1.20014
2024-08-11 01:29:56.948679,0.456003,0.957076,-2.118491,0.670177
2024-08-12 01:29:56.948679,0.746918,1.12013,0.994574,-0.254293
2024-08-13 01:29:56.948679,-0.768137,0.279072,-0.85685,0.907132
2024-08-14 01:29:56.948679,-2.393037,0.396331,0.189374,1.308875


In [44]:
# Create a DataFrame with Dictionary array


data={'Animal':['cat','cat','rabbit','dog','dog','cat','rabbit','cat','dog','dog'],
      'Age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],
      'Visits':[1,3,2,3,2,3,1,1,2,1],
      'Priority':['yes','yes','no','yes','no','no','no','yes','no','no']}

labels=['a','b','c','d','e','f','g','h','i','j']
      
df3=pd.DataFrame(data,index=labels)
      
df3      

Unnamed: 0,Animal,Age,Visits,Priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,rabbit,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,rabbit,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [45]:
# Create a DataFrame for Chelsea Players for 2024/2025 Season with Dictionary array

# Print their nationality, ages, jersey numbers, position of play, and determine whether they will start matches or not

data={'Nationality':['England','England','Ukraine','France','Equador','Argentina','England','France','England','France'],
      
      'Age':[22,29,23,26,22,23,25,20,27,26],
      
      'Position of play':['Attacking Midfielder','Winger','Winger','Central Defender','Defensive Midfielder',
                         'Central Midfielder','Attacking Midfielder','Right Back','Left Back','Striker'],
      
      'Will start matches':['Yes','Yes','No','No','Yes','Yes','No','No','Yes','Yes']}

labels=['Cole Palmer','Raheem Sterling','Mukailo Mudryk','Axel Disasi','Moesis Caiseido','Enzo Fernandes',
 'Dewsbury Hall','Malo Gusto','Ben Chilwell','Christopher Nkunku']

df4=pd.DataFrame(data, index=labels)

df4

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [46]:
# Print the datatypes of Array

df4.dtypes

Nationality           object
Age                    int64
Position of play      object
Will start matches    object
dtype: object

In [47]:
# Print the datatype of age

print(df4['Age'].dtype)

int64


In [48]:
# Print the first 5 columns of the data

df4.head()

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes


In [49]:
# Print the top 2 columns of the data

df4.head(2)

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes


In [50]:
# Print the top 3 columns of the data

df4.head(3)

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [51]:
# Print the last 5 columns of the data

df4.tail()

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [52]:
# OR
# Print the last 5 columns of the data

df4.tail(5)

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [53]:
# Print the last 3 columns of the data

df4.tail(3)

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [54]:
# Print both index and columns of the dataframe

df4.index

Index(['Cole Palmer', 'Raheem Sterling', 'Mukailo Mudryk', 'Axel Disasi',
       'Moesis Caiseido', 'Enzo Fernandes', 'Dewsbury Hall', 'Malo Gusto',
       'Ben Chilwell', 'Christopher Nkunku'],
      dtype='object')

In [55]:
df4.values

array([['England', 22, 'Attacking Midfielder', 'Yes'],
       ['England', 29, 'Winger', 'Yes'],
       ['Ukraine', 23, 'Winger', 'No'],
       ['France', 26, 'Central Defender', 'No'],
       ['Equador', 22, 'Defensive Midfielder', 'Yes'],
       ['Argentina', 23, 'Central Midfielder', 'Yes'],
       ['England', 25, 'Attacking Midfielder', 'No'],
       ['France', 20, 'Right Back', 'No'],
       ['England', 27, 'Left Back', 'Yes'],
       ['France', 26, 'Striker', 'Yes']], dtype=object)

In [56]:
# Print both index and columns of the dataframe

print(df4.index)

df4.columns

Index(['Cole Palmer', 'Raheem Sterling', 'Mukailo Mudryk', 'Axel Disasi',
       'Moesis Caiseido', 'Enzo Fernandes', 'Dewsbury Hall', 'Malo Gusto',
       'Ben Chilwell', 'Christopher Nkunku'],
      dtype='object')


Index(['Nationality', 'Age', 'Position of play', 'Will start matches'], dtype='object')

In [57]:
df4.columns

Index(['Nationality', 'Age', 'Position of play', 'Will start matches'], dtype='object')

In [58]:
# Describing the Data

df4.describe()

Unnamed: 0,Age
count,10.0
mean,24.3
std,2.750757
min,20.0
25%,22.25
50%,24.0
75%,26.0
max,29.0


## Manipulating DataFrame

In [59]:
# Transpose(): Transpose is used to flip or swap columns and rows

# Flip or Swap the columns and rows of Chelsea Data

df4.T

Unnamed: 0,Cole Palmer,Raheem Sterling,Mukailo Mudryk,Axel Disasi,Moesis Caiseido,Enzo Fernandes,Dewsbury Hall,Malo Gusto,Ben Chilwell,Christopher Nkunku
Nationality,England,England,Ukraine,France,Equador,Argentina,England,France,England,France
Age,22,29,23,26,22,23,25,20,27,26
Position of play,Attacking Midfielder,Winger,Winger,Central Defender,Defensive Midfielder,Central Midfielder,Attacking Midfielder,Right Back,Left Back,Striker
Will start matches,Yes,Yes,No,No,Yes,Yes,No,No,Yes,Yes


In [60]:
# Sorting the Data

# sort the data by age
df4.sort_values(by='Age')

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Malo Gusto,France,20,Right Back,No
Cole Palmer,England,22,Attacking Midfielder,Yes
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Axel Disasi,France,26,Central Defender,No
Christopher Nkunku,France,26,Striker,Yes
Ben Chilwell,England,27,Left Back,Yes
Raheem Sterling,England,29,Winger,Yes


In [61]:
# Sorting the Data

# sort the data by Nationality
df4.sort_values(by='Nationality')

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Ben Chilwell,England,27,Left Back,Yes
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Axel Disasi,France,26,Central Defender,No
Malo Gusto,France,20,Right Back,No
Christopher Nkunku,France,26,Striker,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [62]:
# Sorting the Data

# sort the data by Position of Play
df4.sort_values(by='Position of play')

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Axel Disasi,France,26,Central Defender,No
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Ben Chilwell,England,27,Left Back,Yes
Malo Gusto,France,20,Right Back,No
Christopher Nkunku,France,26,Striker,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [63]:
# Sorting the Data

# sort the data by Position of Play
df4.sort_values(by='Will start matches')

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [64]:
# Slicing the DataFrame

# Slice the data between 1 and 3 columns of the data
# Note that numbers between 1 and 3. only positional index 1 and 2 are printed

df4[1:3]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [65]:
# Slice the first 3 columns of the data

df4[:3]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [66]:
# Slice the first 9 columns of the data

df4[:-1]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes


In [67]:
# Slice the first 9 columns of the data

df4[1:]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [68]:
# Slice the data between 1 and 3 by sorting age

df4.sort_values(by='Age')[1:3]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes


In [69]:
# Query the DataFrame

# Print the columns Nationality and Age only

df4[['Nationality','Age']]

Unnamed: 0,Nationality,Age
Cole Palmer,England,22
Raheem Sterling,England,29
Mukailo Mudryk,Ukraine,23
Axel Disasi,France,26
Moesis Caiseido,Equador,22
Enzo Fernandes,Argentina,23
Dewsbury Hall,England,25
Malo Gusto,France,20
Ben Chilwell,England,27
Christopher Nkunku,France,26


In [70]:
# Print the columns 'Position of play' and 'Will start matches' only

df4[['Position of play','Will start matches']]

Unnamed: 0,Position of play,Will start matches
Cole Palmer,Attacking Midfielder,Yes
Raheem Sterling,Winger,Yes
Mukailo Mudryk,Winger,No
Axel Disasi,Central Defender,No
Moesis Caiseido,Defensive Midfielder,Yes
Enzo Fernandes,Central Midfielder,Yes
Dewsbury Hall,Attacking Midfielder,No
Malo Gusto,Right Back,No
Ben Chilwell,Left Back,Yes
Christopher Nkunku,Striker,Yes


In [71]:
# Print the column Will start matches' only

df4[['Will start matches']]

Unnamed: 0,Will start matches
Cole Palmer,Yes
Raheem Sterling,Yes
Mukailo Mudryk,No
Axel Disasi,No
Moesis Caiseido,Yes
Enzo Fernandes,Yes
Dewsbury Hall,No
Malo Gusto,No
Ben Chilwell,Yes
Christopher Nkunku,Yes


In [72]:
# ilocation or integer location(iloc) can be used in similar way as slicing

# Query rows 2, amd 3

df4.iloc[1:3]

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No


In [73]:
# Generating a copy of a dataframe

df5=df4.copy()

df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22,Attacking Midfielder,Yes
Raheem Sterling,England,29,Winger,Yes
Mukailo Mudryk,Ukraine,23,Winger,No
Axel Disasi,France,26,Central Defender,No
Moesis Caiseido,Equador,22,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23,Central Midfielder,Yes
Dewsbury Hall,England,25,Attacking Midfielder,No
Malo Gusto,France,20,Right Back,No
Ben Chilwell,England,27,Left Back,Yes
Christopher Nkunku,France,26,Striker,Yes


In [74]:
# df4 has been copied and pasted to df5

In [75]:
# Print their nationality, ages, jersey numbers, position of play, and determine whether they will start matches or not

data={'Nationality':['England','England','Ukraine','France','Equador','Argentina','England','France','England',
                     'France','Spain','Belgium'],
      
      'Age':[22,29,23,np.nan,22,23,25,20,27,np.nan,26,20],
      
      'Position of play':['Attacking Midfielder','Winger','Winger','Central Defender','Defensive Midfielder',
                         'Central Midfielder','Attacking Midfielder','Right Back','Left Back','Striker',
                          'Left Back','Central Midfielder'],
      
      'Will start matches':['Yes','Yes','No','No','Yes','Yes','No','No','Yes','Yes','No','Yes']}

labels=['Cole Palmer','Raheem Sterling','Mukailo Mudryk','Axel Disasi','Moesis Caiseido','Enzo Fernandes',
 'Dewsbury Hall','Malo Gusto','Ben Chilwell','Christopher Nkunku','Mark Cucurella','Romeo Lavia']

df5=pd.DataFrame(data, index=labels)

df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,29.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Attacking Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,,Striker,Yes


In [76]:
# Looking for Null values in the DataFrame

df5.isnull()

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,False,False,False,False
Raheem Sterling,False,False,False,False
Mukailo Mudryk,False,False,False,False
Axel Disasi,False,True,False,False
Moesis Caiseido,False,False,False,False
Enzo Fernandes,False,False,False,False
Dewsbury Hall,False,False,False,False
Malo Gusto,False,False,False,False
Ben Chilwell,False,False,False,False
Christopher Nkunku,False,True,False,False


In [77]:
# NOTE: False means that the value is not null
#       True means that the value is null

In [78]:
# Modify/change the locations of the dataframe

# Change the of Raheem Sterling to 25

df5.loc['Raheem Sterling','Age']=25

df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Attacking Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,,Striker,Yes


In [79]:
# The age of Raheem Sterling has been changed from 29 to 25. 
# He is now younger than before. HAHAHA!

In [80]:
# Change the the position of Dewsbury Hall to a central midfielder

df5.loc['Dewsbury Hall','Position of play']='Cental Midfielder'

df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Cental Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,,Striker,Yes


In [81]:
# Find the mean of the age 

df5.mean()

  df5.mean()


Age    23.3
dtype: float64

In [82]:
# Print the sum of ages of Chelsea players

df5.sum()

Nationality           EnglandEnglandUkraineFranceEquadorArgentinaEng...
Age                                                               233.0
Position of play      Attacking MidfielderWingerWingerCentral Defend...
Will start matches                      YesYesNoNoYesYesNoNoYesYesNoYes
dtype: object

In [83]:
# Print the minmimun age of the players

df5.min()

Nationality                      Argentina
Age                                   20.0
Position of play      Attacking Midfielder
Will start matches                      No
dtype: object

In [84]:
# Print the maximum age of player

df5.max()

Nationality           Ukraine
Age                      27.0
Position of play       Winger
Will start matches        Yes
dtype: object

## Strings in DataFrame

In [85]:
# Print String in Series

liverpool=pd.Series(['Trend Alexander Anold','Mohammed Salah','Andy Roberson','Vigil Van Dyke',np.nan, 'Darwin Nunex',
                     'Dominic Soboslai', 'Ruben Diaz','Endo','Tsimikas'])

liverpool

0    Trend Alexander Anold
1           Mohammed Salah
2            Andy Roberson
3           Vigil Van Dyke
4                      NaN
5             Darwin Nunex
6         Dominic Soboslai
7               Ruben Diaz
8                     Endo
9                 Tsimikas
dtype: object

In [86]:
# Print the lowercase of liverpool

liverpool.str.lower()

0    trend alexander anold
1           mohammed salah
2            andy roberson
3           vigil van dyke
4                      NaN
5             darwin nunex
6         dominic soboslai
7               ruben diaz
8                     endo
9                 tsimikas
dtype: object

In [87]:
# All names has been printed in lowercase letters

In [88]:
# Print the uppercase of string liverpool

liverpool.str.upper()

0    TREND ALEXANDER ANOLD
1           MOHAMMED SALAH
2            ANDY ROBERSON
3           VIGIL VAN DYKE
4                      NaN
5             DARWIN NUNEX
6         DOMINIC SOBOSLAI
7               RUBEN DIAZ
8                     ENDO
9                 TSIMIKAS
dtype: object

In [89]:
# All names has been printed in uppercase letters

## Operations for DataFrame missing values

In [90]:
df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Cental Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,,Striker,Yes


In [91]:
# Changing the Null values

# Fill the NaN values with 26

# df5.fillna(26)

In [92]:
df5

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Cental Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,,Striker,Yes


In [93]:
df5.fillna(26)

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Axel Disasi,France,26.0,Central Defender,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Cental Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Christopher Nkunku,France,26.0,Striker,Yes


In [94]:
# All the Null values has been replaced with 26

In [95]:
# Print the mean of age

meanAge=df5['Age'].mean

df5['Age'].fillna(meanAge)

Cole Palmer                                                        22.0
Raheem Sterling                                                    25.0
Mukailo Mudryk                                                     23.0
Axel Disasi           <bound method NDFrame._add_numeric_operations....
Moesis Caiseido                                                    22.0
Enzo Fernandes                                                     23.0
Dewsbury Hall                                                      25.0
Malo Gusto                                                         20.0
Ben Chilwell                                                       27.0
Christopher Nkunku    <bound method NDFrame._add_numeric_operations....
Mark Cucurella                                                     26.0
Romeo Lavia                                                        20.0
Name: Age, dtype: object

In [96]:
# OR You can drop any Null value

df5.dropna()

Unnamed: 0,Nationality,Age,Position of play,Will start matches
Cole Palmer,England,22.0,Attacking Midfielder,Yes
Raheem Sterling,England,25.0,Winger,Yes
Mukailo Mudryk,Ukraine,23.0,Winger,No
Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
Dewsbury Hall,England,25.0,Cental Midfielder,No
Malo Gusto,France,20.0,Right Back,No
Ben Chilwell,England,27.0,Left Back,Yes
Mark Cucurella,Spain,26.0,Left Back,No
Romeo Lavia,Belgium,20.0,Central Midfielder,Yes


## DataFrame File Operations

In [97]:
# Save Chelsea to hardrive

df5.to_csv('chelsea.csv')

In [98]:
# Print the csv file of the first 3 lines

df_chelsea=pd.read_csv('chelsea.csv')

df_chelsea.head(3)

Unnamed: 0.1,Unnamed: 0,Nationality,Age,Position of play,Will start matches
0,Cole Palmer,England,22.0,Attacking Midfielder,Yes
1,Raheem Sterling,England,25.0,Winger,Yes
2,Mukailo Mudryk,Ukraine,23.0,Winger,No


In [99]:
# Save chelsea to hardrive as excel file

df5.to_excel('chelsea.xlsx',sheet_name='sheet1')
df_chelsea2=pd.read_excel('chelsea.xlsx','sheet1',index_col=None,na_values=['NA'])


df_chelsea2

Unnamed: 0.1,Unnamed: 0,Nationality,Age,Position of play,Will start matches
0,Cole Palmer,England,22.0,Attacking Midfielder,Yes
1,Raheem Sterling,England,25.0,Winger,Yes
2,Mukailo Mudryk,Ukraine,23.0,Winger,No
3,Axel Disasi,France,,Central Defender,No
4,Moesis Caiseido,Equador,22.0,Defensive Midfielder,Yes
5,Enzo Fernandes,Argentina,23.0,Central Midfielder,Yes
6,Dewsbury Hall,England,25.0,Cental Midfielder,No
7,Malo Gusto,France,20.0,Right Back,No
8,Ben Chilwell,England,27.0,Left Back,Yes
9,Christopher Nkunku,France,,Striker,Yes
