#### Pandas Series Methods (Continuation)

In [1]:
import numpy as np
import pandas as pd
import sys

In [2]:
# Load each CSV and reduce the resulting one-column DataFrame to a Series.
# squeeze('columns') collapses only the column axis, so the result is still a
# Series when a file contains a single row (a bare squeeze() would collapse
# that case all the way down to a scalar).
subs = pd.read_csv('subs.csv').squeeze('columns')
vk = pd.read_csv('kohli_ipl.csv', index_col='match_no').squeeze('columns')
movies = pd.read_csv('bollywood.csv', index_col='movie').squeeze('columns')

### Method 1: astype() - Type Conversion
# Used to convert data types to optimize memory usage

In [3]:
# Measure the footprint of the Series before the conversion
size_before = sys.getsizeof(vk)
print(f"Original dtype: {vk.dtype}")
print(f"Original size: {size_before} bytes")

# Downcast to a narrower integer dtype and measure again
vk_converted = vk.astype('int16')
size_after = sys.getsizeof(vk_converted)
print(f"\nConverted dtype: {vk_converted.dtype}")
print(f"Converted size: {size_after} bytes")
print(f"Memory saved: {size_before - size_after} bytes")

Original dtype: int64
Original size: 3472 bytes

Converted dtype: int16
Converted size: 2182 bytes
Memory saved: 1290 bytes


In [4]:
# astype() also converts between kinds of numbers: integer runs -> 64-bit floats
vk_float = vk.astype(np.float64)
print(f"\nFloat dtype: {vk_float.dtype}")
print(vk_float.head())


Float dtype: float64
match_no
1     1.0
2    23.0
3    13.0
4    12.0
5     1.0
Name: runs, dtype: float64


### Method 2: between() - Range Checking
# Returns boolean Series showing which values fall within a range (inclusive)

In [21]:
# Flag every score that lies between 51 and 99, both ends included (half-centuries)
half_centuries_bool = vk.between(51, 99)
print("Boolean Series (first 10):")
print(half_centuries_bool.head(10))

# between() returns booleans, so sum() counts only the True entries;
# size would count every element, the False ones included.
half_century_total = half_centuries_bool.sum()
print(f"\nTotal half-centuries: {half_century_total}")

Boolean Series (first 10):
match_no
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
Name: runs, dtype: bool

Total half-centuries: 43


In [6]:
# Use the boolean mask as a filter to pull out the half-century scores themselves
half_century_mask = vk.between(51, 99)
half_centuries = vk[half_century_mask]
print("\nActual half-century scores:")
print(half_centuries)


Actual half-century scores:
match_no
34     58
41     71
44     56
45     67
52     70
57     57
68     73
71     51
73     58
74     65
80     57
81     93
82     99
85     56
97     67
99     73
103    51
104    62
110    82
116    75
117    79
119    80
122    52
127    75
129    54
131    54
132    62
134    64
137    55
141    58
144    57
145    92
148    68
152    70
160    84
162    67
175    72
178    90
188    72
197    51
198    53
209    58
213    73
Name: runs, dtype: int64


In [7]:
# More examples of between(): split the scores into bands, then report each band

low_scores = vk[vk.between(0, 20)]       # 0 to 20
medium_scores = vk[vk.between(21, 50)]   # 21 to 50
high_scores = vk[vk.between(51, 99)]     # 51 to 99
# Centuries have no upper bound, so a plain comparison replaces between()
centuries = vk[vk >= 100]

print(f"\nLow scores (0-20): {low_scores.count()}")
print(f"Medium scores (21-50): {medium_scores.count()}")
print(f"High scores (51-99): {high_scores.count()}")
print(f"Centuries (100+): {centuries.count()}")


Low scores (0-20): 97
Medium scores (21-50): 70
High scores (51-99): 43
Centuries (100+): 5


In [8]:
# Practical example: treat scores from 40 to 80 (inclusive) as "match-winning"
match_winning = vk.between(40, 80)
match_winning_count = match_winning.sum()
print(f"\nMatch-winning scores (40-80): {match_winning_count}")
print(vk.loc[match_winning].head(10))


Match-winning scores (40-80): 54
match_no
15    50
32    42
34    58
41    71
44    56
45    67
52    70
57    57
63    45
68    73
Name: runs, dtype: int64


In [15]:
## clip: limits every value to a given range
# E.g. clipping to the range 100-200 raises values below 100 up to 100 and lowers values above 200 down to 200;
# values that are already between 100 and 200 are left unchanged
subs

0       48
1       57
2       40
3       43
4       44
      ... 
360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [16]:
# Anything below 100 becomes 100 and anything above 200 becomes 200
subs.clip(lower=100, upper=200)

0      100
1      100
2      100
3      100
4      100
      ... 
360    200
361    200
362    155
363    144
364    172
Name: Subscribers gained, Length: 365, dtype: int64

In [22]:
## drop_duplicates: the keep argument decides which occurrence of a repeated value is retained
## keep='first' (the default) retains the first occurrence, keep='last' retains the last one
repeated_values = [1, 1, 2, 2, 3, 3, 4, 4]
temp = pd.Series(repeated_values)
print(temp)
temp.drop_duplicates(keep='last')

0    1
1    1
2    2
3    2
4    3
5    3
6    4
7    4
dtype: int64


1    1
3    2
5    3
7    4
dtype: int64

In [27]:
temp.duplicated().sum() # counts how many values are repeats of an earlier value
## duplicated() marks the first occurrence of a value as False (e.g. index 0)
## and every later repeat of it as True (e.g. index 1); sum() adds up those True flags

np.int64(4)

In [26]:
# Keep only the first entry for each distinct lead value (keep='first' is the default)
movies.drop_duplicates(keep='first')

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Sssshhh...                              Tanishaa Mukerji
Rules: Pyaar Ka Superhit Formula                  Tanuja
Right Here Right Now (film)                        Ankit
Talaash: The Hunt Begins...                Rakhee Gulzar
The Pink Mirror                          Edwin Fernandes
Name: lead, Length: 566, dtype: object

In [28]:
## isnull: build a small Series with three missing values (np.nan) to experiment with
values_with_gaps = [1, 2, 3, np.nan, 5, 6, np.nan, 8, np.nan, 10]
temp = pd.Series(values_with_gaps)
temp

0     1.0
1     2.0
2     3.0
3     NaN
4     5.0
5     6.0
6     NaN
7     8.0
8     NaN
9    10.0
dtype: float64

In [32]:
## size vs count
## size reports the total number of elements, missing values included,
## whereas count() reports only the non-null elements

total_values = temp.size        # every element
non_null_values = temp.count()  # non-null elements only
print(total_values)
print(non_null_values)

10
7


In [34]:
# Count the missing values in vk -- there are none
vk.isnull().sum()

np.int64(0)

In [35]:
# temp holds three NaN entries, so the count of missing values is 3
temp.isnull().sum()

np.int64(3)

In [36]:
## dropna: returns a Series with all the missing values removed
temp.dropna() # every NaN entry is dropped from the result; temp itself is not modified


0     1.0
1     2.0
2     3.0
4     5.0
5     6.0
7     8.0
9    10.0
dtype: float64

In [38]:
## fillna: replace missing values
# Fill every missing entry with the mean of the Series
mean_value = temp.mean()
temp.fillna(mean_value)

0     1.0
1     2.0
2     3.0
3     5.0
4     5.0
5     6.0
6     5.0
7     8.0
8     5.0
9    10.0
dtype: float64

In [49]:
## isin
## Has Virat Kohli ever finished on a specific score?
## The usual way is to combine boolean conditions with |
scored_49 = vk == 49
scored_99 = vk == 99
vk_data = vk[scored_49 | scored_99]
print(vk_data)  # this works fine

## With many candidate scores, isin() is simpler than chaining more conditions
target_scores = [49, 99, 69, 0]
vk[vk.isin(target_scores)]  # per this dataset he has finished on 49 only once

match_no
82    99
86    49
Name: runs, dtype: int64


match_no
8       0
82     99
86     49
87      0
91      0
93      0
130     0
135     0
206     0
207     0
211     0
Name: runs, dtype: int64

In [None]:
## apply: lets us run custom logic (any function) on every value of a Series

In [50]:
# Display the movies Series (indexed by movie; the values are in the 'lead' column)
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

In [52]:
## Extract each actor's first name, converted to upper case
def first_name_upper(full_name):
    """Return the first whitespace-separated word of full_name in upper case."""
    return full_name.split()[0].upper()

movies.apply(first_name_upper)

movie
Uri: The Surgical Strike                  VICKY
Battalion 609                             VICKY
The Accidental Prime Minister (film)     ANUPAM
Why Cheat India                          EMRAAN
Evening Shadows                            MONA
                                         ...   
Hum Tumhare Hain Sanam                     SHAH
Aankhen (2002 film)                     AMITABH
Saathiya (film)                           VIVEK
Company (film)                             AJAY
Awara Paagal Deewana                     AKSHAY
Name: lead, Length: 1500, dtype: object

In [56]:
## 'good' day if subs gained were at or above the average, 'bad' day if they were below it
# Compute the mean once up front: calling subs.mean() inside the lambda
# would recompute it for every single element of the Series.
subs_mean = subs.mean()
subs.apply(lambda x: 'bad' if x < subs_mean else 'good')

0       bad
1       bad
2       bad
3       bad
4       bad
       ... 
360    good
361    good
362    good
363    good
364    good
Name: Subscribers gained, Length: 365, dtype: object

In [61]:
## copy
## copy vs. view
# NOTE(review): whether head() shares memory with vk depends on the pandas version / Copy-on-Write setting -- confirm for your environment
new_data = vk.head()

In [67]:
# Overwrite the first element of new_data by position
new_data.iloc[0] = 100

In [68]:
# Inspect the original Series after the assignment made through new_data
vk

match_no
1      100
2       23
3       13
4       12
5        1
      ... 
211      0
212     20
213     73
214     25
215      7
Name: runs, Length: 215, dtype: int64

In [None]:
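## head() and tail() return a slice of the original Series; in pandas without Copy-on-Write that slice is a view of the data, not a copy,
## so changes made through the result of head() or tail() also change the original (as seen above). With Copy-on-Write enabled (the default from pandas 3.0) the original is left untouched.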

In [71]:
new_copy = vk.head().copy() # copy() creates an independent copy of the first five rows

In [75]:
# Assign to index label 1 on the independent copy only
new_copy.loc[1] = 1
vk  # the original is still unaffected by changes to the copy

match_no
1      100
2       23
3       13
4       12
5        1
      ... 
211      0
212     20
213     73
214     25
215      7
Name: runs, Length: 215, dtype: int64