In [1]:
import sys

import numpy as np
import pandas as pd

In [2]:
# Kohli's runs per IPL match, as a Series indexed by match number
vk = pd.read_csv('kohli_ipl.csv').set_index('match_no')['runs']
vk.head(3)

match_no
1     1
2    23
3    13
Name: runs, dtype: int64

In [3]:
# Lead actor of each Bollywood movie, as a Series indexed by movie title
movies = pd.read_csv('bollywood.csv').set_index('movie')['lead']
movies.head(3)

movie
Uri: The Surgical Strike                Vicky Kaushal
Battalion 609                             Vicky Ahuja
The Accidental Prime Minister (film)      Anupam Kher
Name: lead, dtype: object

In [4]:
# Subscribers gained per row of subs.csv; squeeze turns the single-column frame into a Series
subs = (
    pd.read_csv('subs.csv')
    .squeeze('columns')
)
subs.head(3)

0    48
1    57
2    40
Name: Subscribers gained, dtype: int64

In [7]:
# Downcasting the dtype can save a lot of memory: compare the size of the
# default int64 Series with the same data stored as int8.
# Caveat: int8 only covers -128..127, so this is safe only while every score
# fits that range — presumably true for these runs; verify with vk.max().
print(sys.getsizeof(vk))
sys.getsizeof(vk.astype('int8'))

3472


1967

In [11]:
# Count the innings where Kohli scored between 51 and 99 runs (both ends inclusive).
# between() yields a boolean mask, so summing it counts the matches.
vk.between(51, 99).sum()

43

In [12]:
# clip(lower, upper) caps values: anything below `lower` becomes `lower` and
# anything above `upper` becomes `upper`. Show the raw series first for comparison.
subs

0       48
1       57
2       40
3       43
4       44
      ... 
360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [13]:
# Cap the subscriber gains to the range [100, 200]
subs.clip(lower=100, upper=200)

0      100
1      100
2      100
3      100
4      100
      ... 
360    200
361    200
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [14]:
# Small series with repeated values, used below to demonstrate duplicate handling
ser = pd.Series(data=[1, 1, 1, 2, 2, 3, 3, 4, 4])
ser

0    1
1    1
2    1
3    2
4    2
5    3
6    3
7    4
8    4
dtype: int64

In [15]:
# Remove repeated values, which is often a necessary data-cleaning step.
# Only the first occurrence of each value is kept (the default).
ser.drop_duplicates(keep='first')

0    1
3    2
5    3
7    4
dtype: int64

In [16]:
# `keep` selects which occurrence survives: 'first' (the default) or 'last'.
ser.drop_duplicates(keep='last')

2    1
4    2
6    3
8    4
dtype: int64

In [18]:
# duplicated() flags repeats: the first occurrence of a value is False and
# every later occurrence of the same value is True.
ser.duplicated()

0    False
1     True
2     True
3    False
4     True
5    False
6     True
7    False
8     True
dtype: bool

In [20]:
# Summing the boolean flags counts how many entries are duplicates
ser.duplicated().sum()

5

In [21]:
# Float series with three missing entries (positions 2, 5 and 8) for the NaN demos
temp = pd.Series(
    [1, 2, np.nan, 4, 5, np.nan, 7, 8, np.nan, 10]
)
temp

0     1.0
1     2.0
2     NaN
3     4.0
4     5.0
5     NaN
6     7.0
7     8.0
8     NaN
9    10.0
dtype: float64

In [22]:
# Flag missing values: True wherever the entry is NaN
temp.isna()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
9    False
dtype: bool

In [23]:
# Total number of missing values (isna() and isnull() are aliases)
temp.isna().sum()

3

In [24]:
# isna() and isnull() are interchangeable.
# Missing values can also be removed entirely; dropna() returns a new Series
# and leaves `temp` itself unchanged.
temp.dropna()

0     1.0
1     2.0
3     4.0
4     5.0
6     7.0
7     8.0
9    10.0
dtype: float64

In [25]:
# Alternatively, replace each missing value with a chosen value —
# here the mean of the non-missing entries.
temp.fillna(value=temp.mean())

0     1.000000
1     2.000000
2     5.285714
3     4.000000
4     5.000000
5     5.285714
6     7.000000
7     8.000000
8     5.285714
9    10.000000
dtype: float64

In [26]:
# isin() tests membership against several values at once, which is much
# shorter than chaining one == comparison per value.
vk.loc[vk.isin([20, 30, 40, 50, 60])]

match_no
15     50
21     20
43     30
124    20
138    20
146    30
182    50
210    30
212    20
Name: runs, dtype: int64

In [27]:
# apply() runs a function on every value of a Series.
# Example: keep only the lead actor's first name, upper-cased.
def first_name_upper(full_name):
    """Return the first whitespace-separated word of `full_name` in upper case."""
    return full_name.split()[0].upper()

movies.apply(first_name_upper)

movie
Uri: The Surgical Strike                  VICKY
Battalion 609                             VICKY
The Accidental Prime Minister (film)     ANUPAM
Why Cheat India                          EMRAAN
Evening Shadows                            MONA
                                         ...   
Hum Tumhare Hain Sanam                     SHAH
Aankhen (2002 film)                     AMITABH
Saathiya (film)                           VIVEK
Company (film)                             AJAY
Awara Paagal Deewana                     AKSHAY
Name: lead, Length: 1500, dtype: object

In [29]:
# Label each entry 'Good day' when the subscribers gained exceed the average,
# otherwise 'bad day', then count how many entries fall in each group.
# The mean is computed once up front: calling subs.mean() inside the lambda
# would recompute it for every single element.
avg_subs = subs.mean()
subs.apply(lambda x: 'Good day' if x > avg_subs else 'bad day').value_counts()

Subscribers gained
bad day     202
Good day    163
Name: count, dtype: int64

In [30]:
# Difference between a view and a copy.
# Take the first five rows of vk; no explicit copy is made here.
new = vk.head()
new

match_no
1     1
2    23
3    13
4    12
5     1
Name: runs, dtype: int64

In [31]:
# head() gave us a view here, so a change made through `new` also changes the
# original `vk`, because both share the same underlying data.
# The key 1 is the match_no label, not a position.
# NOTE(review): this propagation is version-dependent — with pandas Copy-on-Write
# (opt-in in 2.x, documented as the planned default for 3.0) `vk` would stay
# unchanged. Confirm against the pandas version in use.
new[1] = 100
new

match_no
1    100
2     23
3     13
4     12
5      1
Name: runs, dtype: int64

In [32]:
# The original series changed as well: match 1 now shows 100
vk

match_no
1      100
2       23
3       13
4       12
5        1
      ... 
211      0
212     20
213     73
214     25
215      7
Name: runs, Length: 215, dtype: int64

In [33]:
# To work on the slice independently of vk, take an explicit (deep) copy.
new = vk.head().copy(deep=True)
new

match_no
1    100
2     23
3     13
4     12
5      1
Name: runs, dtype: int64

In [34]:
# Assign through the copy (label 1 again); only `new` is affected
new[1] = 0
new

match_no
1     0
2    23
3    13
4    12
5     1
Name: runs, dtype: int64

In [35]:
# vk is untouched by the edit to the copy: match 1 still shows the 100 set
# earlier through the view, not the 0 just assigned to `new`.
vk

match_no
1      100
2       23
3       13
4       12
5        1
      ... 
211      0
212     20
213     73
214     25
215      7
Name: runs, Length: 215, dtype: int64