In [2]:
import numpy as np
import pandas as pd

## a) Creating Series

### Series through list

In [2]:
# A plain Python list is the simplest input; pandas supplies the default 0..n-1 index.
lst = list(range(1, 6))
pd.Series(data=lst)

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Series through Numpy array

In [4]:
# A one-dimensional NumPy array converts directly into a Series with the default index.
arr = np.arange(1, 6)
pd.Series(data=arr)

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Providing a Custom Index

In [6]:
# Passing `index` replaces the default 0..n-1 labels with the given ones (1, 2, 3 here).
pd.Series(data = ['Sanket', 'Sahil', 'Suyash'], index = [1,2,3])

1    Sanket
2     Sahil
3    Suyash
dtype: object

### Accessing Elements

In [7]:
# Repeat 10 five times and 20 twice, then give the result a fresh 0..6 index.
s = (
    pd.Series([10, 20])
    .repeat([5, 2])
    .reset_index(drop=True)
)
s

0    10
1    10
2    10
3    10
4    10
5    20
6    20
dtype: int64

In [10]:
# Plain [] with an integer looks up the index label 5; with the 0..6 index this is the first 20.
s[5]

np.int64(20)

In [11]:
# An integer slice with [] is end-exclusive: rows 2 and 3 are returned, row 4 is not.
s[2:4]

2    10
3    10
dtype: int64

### b) Aggregate function on pandas Series

In [12]:
# Sample values for the aggregation example.
sr = pd.Series(data=[10, 20, 30, 40, 50, 60, 100, 80])
sr

0     10
1     20
2     30
3     40
4     50
5     60
6    100
7     80
dtype: int64

In [13]:
# Name the reductions as strings: passing the Python builtins min/max/sum is deprecated in
# recent pandas and emits a FutureWarning per function. The result is the same Series
# indexed by 'min', 'max' and 'sum'.
sr.agg(['min', 'max', 'sum'])

  sr.agg([min,max,sum])
  sr.agg([min,max,sum])
  sr.agg([min,max,sum])


min     10
max    100
sum    390
dtype: int64

### c.) Series absolute function

In [14]:
# abs() returns the magnitudes as a new Series; the result is displayed, not stored.
sr = pd.Series(data=[10, -30, 40, -80, -100])
sr.abs()

0     10
1     30
2     40
3     80
4    100
dtype: int64

### d) Appending Series

In [17]:
# pd.concat stacks the inputs end to end and keeps each input's own labels,
# so the labels 0-3 appear twice in the result.
sr1 = pd.Series(data=[1, -2, -3, -4, 5])
sr2 = pd.Series(data=[10, 30, 40, 60])

result = pd.concat(objs=[sr1, sr2])
print(result)

0     1
1    -2
2    -3
3    -4
4     5
0    10
1    30
2    40
3    60
dtype: int64


### e) astype

In [18]:
# astype returns a converted Series (float64 here); the result is displayed, not assigned back.
sr1.astype('float')

0    1.0
1   -2.0
2   -3.0
3   -4.0
4    5.0
dtype: float64

### f) Between Function

In [19]:
# Element-wise range test with both bounds inclusive by default; only the value 5 lies in [2, 5].
sr1.between(2,5)

0    False
1    False
2    False
3    False
4     True
dtype: bool

### g) All String Functions that can be used to extract or modify texts in a Series

In [20]:
# Sample strings shared by the .str accessor examples that follow.
ser = pd.Series(
    data=[
        'Sanket Kadam',
        'Sahil Kshirsagar',
        'Suyash Agre',
        'Data Science',
    ]
)

##### Upper and Lower Function

In [24]:
# Show the upper-cased and lower-cased variants, separated by a 20-dash rule.
print(ser.str.upper(), '-' * 20, ser.str.lower(), sep='\n')

0        SANKET KADAM
1    SAHIL KSHIRSAGAR
2         SUYASH AGRE
3        DATA SCIENCE
dtype: object
--------------------
0        sanket kadam
1    sahil kshirsagar
2         suyash agre
3        data science
dtype: object


##### Length Function

In [25]:
# Character count of each string, spaces included ('Sanket Kadam' -> 12).
ser.str.len()

0    12
1    16
2    11
3    12
dtype: int64

##### Strip Function

In [26]:
# strip() trims leading and trailing whitespace; these names have none, so the
# values come back unchanged.
ser.str.strip()

0        Sanket Kadam
1    Sahil Kshirsagar
2         Suyash Agre
3        Data Science
dtype: object

##### Split Function

In [27]:
# With no separator given, split() breaks each string on whitespace and returns
# a list of words per row.
ser.str.split()

0        [Sanket, Kadam]
1    [Sahil, Kshirsagar]
2         [Suyash, Agre]
3        [Data, Science]
dtype: object

##### Contains Function

In [28]:
# Boolean per row: does the string contain "@"? None of these do.
ser.str.contains("@")

0    False
1    False
2    False
3    False
dtype: bool

##### Replace Function

In [29]:
# Replaces every '@' with the empty string; there are none here, so the values are unchanged.
ser.str.replace('@','')

0        Sanket Kadam
1    Sahil Kshirsagar
2         Suyash Agre
3        Data Science
dtype: object

##### Count Function

In [30]:
# Counts occurrences of lowercase 'a' per row; the match is case-sensitive, so the
# capital 'A' in 'Suyash Agre' is not counted.
ser.str.count('a')

0    3
1    3
2    1
3    2
dtype: int64

##### Startswith and Endswith Function

In [31]:
# True only for rows that begin with the exact prefix "Sanke".
ser.str.startswith("Sanke")

0     True
1    False
2    False
3    False
dtype: bool

##### Find Function

In [32]:
# Position of the first match in each string (0 for row 0), or -1 when the substring is absent.
ser.str.find("Sanket")

0    0
1   -1
2   -1
3   -1
dtype: int64

### h) Converting a Series to List

In [33]:
# Returns the values as a plain Python list.
ser.to_list()

['Sanket Kadam', 'Sahil Kshirsagar', 'Suyash Agre', 'Data Science']

## Dataframe

### a) Creating Data Frames

##### Creating Dataframe using List:

In [34]:
# A flat list becomes a single column, labelled 0 by default.
lst = [
    'Sanket',
    'Sahil',
    'Suyash',
    'Swapnil',
    'Pratik',
]
pd.DataFrame(data=lst)

Unnamed: 0,0
0,Sanket
1,Sahil
2,Suyash
3,Swapnil
4,Pratik


In [35]:
# Each inner list becomes one row; the columns get the default labels 0 and 1.
lst = [
    ['Sanket', 10],
    ['Sahil', 20],
    ['Suyash', 30],
]
pd.DataFrame(data=lst)

Unnamed: 0,0,1
0,Sanket,10
1,Sahil,20
2,Suyash,30


##### A DataFrame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. We can perform basic operations on rows/columns such as selecting, deleting, adding, and renaming.

In [36]:
# Each Series becomes one column; the dictionary keys become the column names.
data = dict(
    one=pd.Series([1, 2, 3, 4]),
    two=pd.Series([10, 20, 30, 40]),
    three=pd.Series([100, 200, 300, 400]),
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


### b) Slicing DataFrame Using iloc and loc

##### Basic loc Operation

In [37]:
# loc slices by label and includes both endpoints, so rows 1 and 2 are returned.
df.loc[1:2]

Unnamed: 0,one,two,three
1,2,20,200
2,3,30,300


##### Basic iloc Operation

In [39]:
# iloc slices by integer position and excludes the stop, so only row 1 is returned.
df.iloc[1:2]

Unnamed: 0,one,two,three
1,2,20,200


In [40]:
# A list of positions selects exactly those rows (0 and 2).
df.iloc[[0,2]]

Unnamed: 0,one,two,three
0,1,10,100
2,3,30,300


### c) Slicing Using Conditions

In [41]:
# Boolean mask: keep only the rows where column 'two' is greater than 20.
df[df['two'] > 20]

Unnamed: 0,one,two,three
2,3,30,300
3,4,40,400


In [42]:
# Select column 'three' for the rows where 'two' > 20. Passing the column as a
# list (['three']) keeps the result a DataFrame.
df.loc[df['two'] > 20, ['three']]

Unnamed: 0,three
2,300
3,400


### c) Column Addition in DataFrame

In [43]:
# Assigning a list to a new column label adds it as the last column; the list
# supplies one value per row (4 values for 4 rows).
l = [22,33,44,55]
df['four'] = l
df

Unnamed: 0,one,two,three,four
0,1,10,100,22
1,2,20,200,33
2,3,30,300,44
3,4,40,400,55


### d) Column Deletion in DataFrame

- Using del

In [44]:
# del removes the column from df itself; the frame is redisplayed to confirm.
del df['four']
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


- Using pop

In [48]:
# pop removes the column from df and returns it as a Series (named 'three').
df.pop('three')

0    100
1    200
2    300
3    400
Name: three, dtype: int64

### f) Pandas drop Function

In [49]:
# Rebuild the three-column frame: the previous cells removed 'four' and 'three' from df.
data = dict(
    one=pd.Series([1, 2, 3, 4]),
    two=pd.Series([10, 20, 30, 40]),
    three=pd.Series([100, 200, 300, 400]),
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


- axis = 0  -> Rows
- axis = 1  -> Columns

In [50]:
# axis=0 drops rows by index label. A new frame is returned; df itself still has all four rows.
df.drop([0,1],  axis = 0)

Unnamed: 0,one,two,three
2,3,30,300
3,4,40,400


In [51]:
# axis=1 drops the named columns, leaving only 'two' in the returned frame.
df.drop(['one', 'three'], axis = 1)

Unnamed: 0,two
0,10
1,20
2,30
3,40


### g) Transposing a DataFrame

In [52]:
# Fresh three-column frame for the transpose example.
data = dict(
    one=pd.Series([1, 2, 3, 4]),
    two=pd.Series([10, 20, 30, 40]),
    three=pd.Series([100, 200, 300, 400]),
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


In [53]:
# .T swaps rows and columns: the column names become the index and the row labels become columns.
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400


### h) A set of more DataFrame Functionalities

In [54]:
# Redisplay the frame used by the attribute examples that follow.
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


##### 1. axes attribute

In [56]:
# List holding the two axis objects: the row index first, then the column labels.
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three'], dtype='object')]

##### 2. ndim attribute

In [57]:
# Number of dimensions of the frame (2 here: rows and columns).
df.ndim

2

##### 3. dtypes attribute

In [58]:
# Data type of each column, returned as a Series indexed by column name.
df.dtypes

one      int64
two      int64
three    int64
dtype: object

##### 4. shape attribute

In [59]:
# (rows, columns) tuple: 4 rows by 3 columns.
df.shape

(4, 3)

##### 5. head func

In [62]:
# First 2 rows.
df.head(2)

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200


##### 6. tail func

In [63]:
# Last 2 rows.
df.tail(2)

Unnamed: 0,one,two,three
2,3,30,300
3,4,40,400


##### 7. empty attribute

In [64]:
# Boolean flag for an empty frame; False here because df holds data.
df.empty

False

## Statistical or Mathematical Functions

In [65]:
# Small numeric frame used by the statistics examples below.
data = dict(
    one=pd.Series([1, 2, 3, 4]),
    two=pd.Series([10, 20, 30, 40]),
    three=pd.Series([100, 200, 300, 400]),
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


In [66]:
# Column-wise totals.
df.sum()

one        10
two       100
three    1000
dtype: int64

In [67]:
# Column-wise arithmetic mean; the result is float64 even for integer columns.
df.mean()

one        2.5
two       25.0
three    250.0
dtype: float64

In [68]:
# Column-wise median; with four values it is the average of the two middle ones.
df.median()

one        2.5
two       25.0
three    250.0
dtype: float64

In [69]:
# Most frequent value(s) per column. Every value occurs once here, so all four
# values of each column are returned.
df.mode()

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


In [70]:
# Sample variance (divides by n-1 by default): 5/3 = 1.666667 for column 'one'.
df.var()

one          1.666667
two        166.666667
three    16666.666667
dtype: float64

In [71]:
# Smallest value in each column.
df.min()

one        1
two       10
three    100
dtype: int64

In [72]:
# Largest value in each column.
df.max()

one        4
two       40
three    400
dtype: int64

In [73]:
# Sample standard deviation (n-1 denominator by default), i.e. the square root of var().
df.std()

one        1.290994
two       12.909944
three    129.099445
dtype: float64

### j) Describe Function

In [74]:
# Same small numeric frame, rebuilt for the describe() example.
data = dict(
    one=pd.Series([1, 2, 3, 4]),
    two=pd.Series([10, 20, 30, 40]),
    three=pd.Series([100, 200, 300, 400]),
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two,three
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400


In [75]:
# Summary statistics per column: count, mean, std, min, the 25/50/75% quantiles and max.
df.describe()

Unnamed: 0,one,two,three
count,4.0,4.0,4.0
mean,2.5,25.0,250.0
std,1.290994,12.909944,129.099445
min,1.0,10.0,100.0
25%,1.75,17.5,175.0
50%,2.5,25.0,250.0
75%,3.25,32.5,325.0
max,4.0,40.0,400.0


## Working with csv files and basic data Analysis Using Pandas

### a) Reading csv

In [3]:
# Load the dataset. The path is relative, so Football.csv must be reachable from
# the kernel's working directory.
df = pd.read_csv("Football.csv")
df

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.40,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,Netherlands,Eredivisie,(UTR),Gyrano Kerk,24,0,2155,10,7.49,0.33,50,18,2.20,0.79,2020
656,Netherlands,Eredivisie,(AJA),Quincy Promes,18,2,1573,12,9.77,0.59,56,30,3.38,1.81,2020
657,Netherlands,Eredivisie,(PSV),Denzel Dumfries,25,0,2363,7,5.72,0.23,45,14,1.81,0.56,2020
658,Netherlands,Eredivisie,,Cyriel Dessers,26,0,2461,15,14.51,0.56,84,43,3.24,1.66,2020


### b) Pandas Info Func

In [4]:
# Prints the index range plus each column's non-null count and dtype;
# 'Club' has only 626 non-null values out of 660 rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country                  660 non-null    object 
 1   League                   660 non-null    object 
 2   Club                     626 non-null    object 
 3   Player Names             660 non-null    object 
 4   Matches_Played           660 non-null    int64  
 5   Substitution             660 non-null    int64  
 6   Mins                     660 non-null    int64  
 7   Goals                    660 non-null    int64  
 8   xG                       660 non-null    float64
 9   xG Per Avg Match         660 non-null    float64
 10  Shots                    660 non-null    int64  
 11  OnTarget                 660 non-null    int64  
 12  Shots Per Avg Match      660 non-null    float64
 13  On Target Per Avg Match  660 non-null    float64
 14  Year                     6

### c) isnull() func

In [5]:
# isnull() flags missing cells; summing the booleans gives the number of missing
# values per column (34 for 'Club').
df.isnull().sum()

Country                     0
League                      0
Club                       34
Player Names                0
Matches_Played              0
Substitution                0
Mins                        0
Goals                       0
xG                          0
xG Per Avg Match            0
Shots                       0
OnTarget                    0
Shots Per Avg Match         0
On Target Per Avg Match     0
Year                        0
dtype: int64

### d) Quantile function to get specific percentile value

In [6]:
# percentiles=[0.80] replaces the default quartile rows with an 80% row; the 50%
# (median) row is still included. Only the numeric columns are summarised.
df.describe(percentiles = [0.80])

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
80%,32.0,6.0,2915.8,15.0,14.076,0.61,90.0,39.0,3.6,1.63,2020.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0


In [7]:
# 0.50 quantile (median) of 'Mins'; it matches the 50% row that describe() reports for this column.
df['Mins'].quantile(0.50)

np.float64(2245.5)

### e) Copy Function

In [9]:
# copy() makes a deep copy by default, giving a separate frame to modify without touching df.
de = df.copy()
de.head(3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016


### g) unique and nunique Functions

In [11]:
# Array of the distinct player names. Some entries carry trailing spaces
# (e.g. 'Neymar '), which str.strip() would normalise.
df['Player Names'].unique()

array(['Juanmi Callejon', 'Antoine Griezmann', 'Luis Suarez',
       'Ruben Castro', 'Kevin Gameiro', 'Cristiano Ronaldo',
       'Karim Benzema', 'Neymar ', 'Iago Aspas', 'Sergi Enrich',
       'Aduriz ', 'Sandro Ramlrez', 'Lionel Messi', 'Gerard Moreno',
       'Morata', 'Wissam Ben Yedder', 'Willian Jose', 'Andone ',
       'Cedric Bakambu', 'Isco', 'Mohamed Salah', 'Gregoire Defrel',
       'Ciro Immobile', 'Nikola Kalinic', 'Dries Mertens',
       'Alejandro Gomez', 'Jose CallejOn', 'Iago Falque',
       'Giovanni Simeone', 'Mauro Icardi', 'Diego Falcinelli',
       'Cyril Thereau', 'Edin Dzeko', 'Lorenzo Insigne',
       'Fabio Quagliarella', 'Borriello ', 'Carlos Bacca',
       'Gonzalo Higuain', 'Keita Balde', 'Andrea Belotti', 'Fin Bartels',
       'Lars Stindl', 'Serge Gnabry', 'Wagner ', 'Andrej Kramaric',
       'Florian Niederlechner', 'Robert Lewandowski', 'Emil Forsberg',
       'Timo Werner', 'Nils Petersen', 'Vedad Ibisevic', 'Mario Gomez',
       'Maximilian Philipp',

In [12]:
# Number of distinct names: 444 across 660 rows, so some players appear more than once.
df['Player Names'].nunique()

444

### h) dropna() func

In [13]:
# Missing values per column before dropping: only 'Club' has any (34).
df.isnull().sum()

Country                     0
League                      0
Club                       34
Player Names                0
Matches_Played              0
Substitution                0
Mins                        0
Goals                       0
xG                          0
xG Per Avg Match            0
Shots                       0
OnTarget                    0
Shots Per Avg Match         0
On Target Per Avg Match     0
Year                        0
dtype: int64

In [16]:
# Drop every row that has a missing value (here the 34 rows without a 'Club').
# The returned frame is rebound to df instead of using inplace=True, which gives
# no benefit and prevents method chaining.
df = df.dropna()

In [17]:
# Confirm that no missing values remain after dropna.
df.isnull().sum()

Country                    0
League                     0
Club                       0
Player Names               0
Matches_Played             0
Substitution               0
Mins                       0
Goals                      0
xG                         0
xG Per Avg Match           0
Shots                      0
OnTarget                   0
Shots Per Avg Match        0
On Target Per Avg Match    0
Year                       0
dtype: int64

### i) Fillna func

In [18]:
# Reload the file so df again contains the rows that dropna removed earlier.
df = pd.read_csv("Football.csv")
df

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.40,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,Netherlands,Eredivisie,(UTR),Gyrano Kerk,24,0,2155,10,7.49,0.33,50,18,2.20,0.79,2020
656,Netherlands,Eredivisie,(AJA),Quincy Promes,18,2,1573,12,9.77,0.59,56,30,3.38,1.81,2020
657,Netherlands,Eredivisie,(PSV),Denzel Dumfries,25,0,2363,7,5.72,0.23,45,14,1.81,0.56,2020
658,Netherlands,Eredivisie,,Cyriel Dessers,26,0,2461,15,14.51,0.56,84,43,3.24,1.66,2020


In [19]:
# The reloaded frame has the 34 missing 'Club' values again.
# NOTE(review): this section is titled "Fillna" but no fillna call follows; something
# like df['Club'].fillna('Unknown') would demonstrate it. TODO confirm the intended example.
df.isnull().sum()

Country                     0
League                      0
Club                       34
Player Names                0
Matches_Played              0
Substitution                0
Mins                        0
Goals                       0
xG                          0
xG Per Avg Match            0
Shots                       0
OnTarget                    0
Shots Per Avg Match         0
On Target Per Avg Match     0
Year                        0
dtype: int64

## Pandas Profile Report

In [24]:
!pip install pandas-profiling

Collecting pandas-profiling
  Using cached pandas_profiling-3.2.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting joblib~=1.1.0 (from pandas-profiling)
  Using cached joblib-1.1.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting pydantic>=1.8.1 (from pandas-profiling)
  Using cached pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
Collecting visions==0.7.4 (from visions[type_image_path]==0.7.4->pandas-profiling)
  Using cached visions-0.7.4-py3-none-any.whl.metadata (5.9 kB)
Collecting htmlmin>=0.1.12 (from pandas-profiling)
  Using cached htmlmin-0.1.12.tar.gz (19 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting missingno>=0.4.2 (from pandas-profiling)
  Using cached missingno-0.5.2-py3-none

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikit-learn 1.5.1 requires joblib>=1.2.0, but you have joblib 1.1.1 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
!pip install pydantic==1.10.2

Collecting pydantic==1.10.2
  Downloading pydantic-1.10.2-py3-none-any.whl.metadata (140 kB)
     ---------------------------------------- 0.0/140.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/140.0 kB ? eta -:--:--
     ----- ---------------------------------- 20.5/140.0 kB ? eta -:--:--
     ------------- ----------------------- 51.2/140.0 kB 525.1 kB/s eta 0:00:01
     --------------------------------- ---- 122.9/140.0 kB 1.0 MB/s eta 0:00:01
     ------------------------------------ 140.0/140.0 kB 920.3 kB/s eta 0:00:00
Downloading pydantic-1.10.2-py3-none-any.whl (154 kB)
   ---------------------------------------- 0.0/154.6 kB ? eta -:--:--
   ---------------------------------------- 154.6/154.6 kB 4.7 MB/s eta 0:00:00
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.8.2
    Uninstalling pydantic-2.8.2:
      Successfully uninstalled pydantic-2.8.2
Successfully installed pydantic-1.10.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pydantic-settings 2.4.0 requires pydantic>=2.7.0, but you have pydantic 1.10.2 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
# pandas-profiling has been renamed ydata-profiling upstream. Prefer the new module
# name when it is installed and fall back to the legacy one otherwise, so the alias
# `pp` works with either installation.
try:
    import ydata_profiling as pp
except ImportError:
    import pandas_profiling as pp
import matplotlib

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.8/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.8/u/import-error

In [32]:
# Load the dataset again as the input for the profiling report.
df = pd.read_csv("Football.csv")
df

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.40,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,Netherlands,Eredivisie,(UTR),Gyrano Kerk,24,0,2155,10,7.49,0.33,50,18,2.20,0.79,2020
656,Netherlands,Eredivisie,(AJA),Quincy Promes,18,2,1573,12,9.77,0.59,56,30,3.38,1.81,2020
657,Netherlands,Eredivisie,(PSV),Denzel Dumfries,25,0,2363,7,5.72,0.23,45,14,1.81,0.56,2020
658,Netherlands,Eredivisie,,Cyriel Dessers,26,0,2461,15,14.51,0.56,84,43,3.24,1.66,2020


In [33]:
# Build the profiling report for the loaded frame. The class is spelled ProfileReport;
# the previous spelling "ProfileREport" does not exist on the module.
report = pp.ProfileReport(df)

NameError: name 'pp' is not defined

In [None]:
# Evaluate the report object as the last expression so it is rendered as the cell output.
report