1. How to import pandas and check the version?

In [1]:
import pandas as pd
print(pd.__version__)

2.2.2


2. How to create a series from a list, numpy array and dict?

In [10]:
import numpy as np
mylist=list("abcdefghijklmnopqrstuvwxyz")
myarr=np.arange(26)
mydict=dict(zip(mylist,myarr))

print(pd.Series(mylist).head())
print(pd.Series(myarr).head())
print(pd.Series(mydict).head())


0    a
1    b
2    c
3    d
4    e
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int64
a    0
b    1
c    2
d    3
e    4
dtype: int64


3. How to convert the index of a series into a column of a dataframe?

In [11]:
ser=pd.Series(mydict)
df=ser.to_frame().reset_index()
print(df)

   index   0
0      a   0
1      b   1
2      c   2
3      d   3
4      e   4
5      f   5
6      g   6
7      h   7
8      i   8
9      j   9
10     k  10
11     l  11
12     m  12
13     n  13
14     o  14
15     p  15
16     q  16
17     r  17
18     s  18
19     t  19
20     u  20
21     v  21
22     w  22
23     x  23
24     y  24
25     z  25


4. How to combine many series to form a dataframe?

In [15]:
ser1=pd.Series(list("abcdefghijklmnopqrstuvwxyz"))
ser2=pd.Series(np.arange(26))
df=pd.DataFrame({'c1':ser1,"c2":ser2})
print(df.head())

  c1  c2
0  a   0
1  b   1
2  c   2
3  d   3
4  e   4


5. How to assign name to the series’ index?

In [20]:
ser=pd.Series(list("abcdefghijklmnopqrstuvwxyz"))
ser.name="alphabets"
print(ser.head())

0    a
1    b
2    c
3    d
4    e
Name: alphabets, dtype: object


7. How to get the items not common to both series A and series B?

In [24]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
s1=set(ser1)
s2=set(ser2)
res=s1.symmetric_difference(s2)
print(res)

{1, 2, 3, 6, 7, 8}


8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [28]:
ser = pd.Series(np.random.normal(10, 5, 25))
print("minimum:",ser.min())
print("25th percentile:",ser.quantile(0.25))
print("median:",ser.median())
print("75th percentile:",ser.quantile(0.75))
print("maximum:",ser.max())

minimum: 0.12966039212199476
25th percentile: 6.862687426527527
median: 8.426893079761792
75th percentile: 11.728333904128037
maximum: 23.169906832067227


9. How to get frequency counts of unique items of a series?

In [31]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
print(ser.value_counts())

b    7
h    5
a    4
d    4
g    3
f    3
c    2
e    2
Name: count, dtype: int64


10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [34]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
top_2=ser.value_counts().nlargest(2).index
res=ser.where(ser.isin(top_2),other='other')
print(res)

0         4
1         4
2     other
3         3
4         3
5         4
6         3
7         4
8     other
9         4
10    other
11        3
dtype: object


11. How to bin a numeric series to 10 groups of equal size?

In [39]:
ser = pd.Series(np.random.random(20))
# print(ser.head())
print(pd.qcut(ser,10,labels=[f'{i+1}th' for i in range(10)]))

0      4th
1      3th
2      2th
3      3th
4     10th
5      7th
6      5th
7      5th
8      1th
9      4th
10     8th
11     9th
12     6th
13     8th
14     1th
15     9th
16     2th
17     7th
18    10th
19     6th
dtype: category
Categories (10, object): ['1th' < '2th' < '3th' < '4th' ... '7th' < '8th' < '9th' < '10th']


12. How to convert a numpy array to a dataframe of given shape? (L1)

In [45]:
ser = pd.Series(np.random.randint(1, 10, 35))
reshape_ser=ser.values.reshape((7,5))
print(reshape_ser)

[[2 2 2 9 7]
 [8 1 4 6 6]
 [1 9 7 8 7]
 [7 8 3 2 1]
 [9 7 3 7 1]
 [6 6 1 1 7]
 [3 2 9 2 7]]


13. How to find the positions of numbers that are multiples of 3 from a series?

In [51]:
import numpy as np
ser = pd.Series(np.random.randint(1, 10, 7))
print(np.argwhere(ser%3==0))

[[1]
 [5]]


14. How to extract items at given positions from a series

In [53]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
for i in pos:
    print(ser[i])
    #or
ser.take(pos)

a
e
i
o
u


0     a
4     e
8     i
14    o
20    u
dtype: object

15. How to stack two series vertically and horizontally ?

In [59]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
df=pd.DataFrame({"index":ser1,"value":ser2})
print(df)
# ser1.append(ser2)
df=pd.concat([ser1,ser2],axis=1)
print(df)

   index value
0      0     a
1      1     b
2      2     c
3      3     d
4      4     e
   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


16. How to get the positions of items of series A in another series B?

In [70]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
ind=[pd.Index(ser1).get_loc(i) for i in ser2]
print(ind)

[5, 4, 0, 8]


17. How to compute the mean squared error on a truth and predicted series?

In [72]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
# print(truth.head())
# print(pred.head())
print((sum(truth-pred)**2)/len(truth))

0    0
1    1
2    2
3    3
4    4
dtype: int64
0    0.497334
1    1.226948
2    2.058249
3    3.818912
4    4.475112
dtype: float64
1.9133179446881925


18. How to convert the first character of each element in a series to uppercase?

In [75]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
l1=[]
for i in range(len(ser)):
    l1.append(ser[i][0].upper()+ser[i][1:])
print(l1)

['How', 'To', 'Kick', 'Ass?']


19. How to calculate the number of characters in each word in a series?

In [79]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
res=ser.map(lambda val :len(val))
print(res)


0    3
1    2
2    4
3    4
dtype: int64


20. How to compute difference of differences between consequtive numbers of a series?

In [89]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
print(ser.diff().diff())

0    NaN
1    NaN
2    1.0
3    1.0
4    1.0
5    1.0
6    0.0
7    2.0
dtype: float64


21. How to convert a series of date-strings to a timeseries?

In [88]:
from dateutil.parser import parse
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser_parse=ser.map(lambda x:parse(x))
print(pd.to_datetime(ser_parse))

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]


22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [97]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))
print("Date: ", ser_ts.dt.day.tolist())
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())#learned
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())
print("Day of week: ", ser_ts.dt.day_name().tolist())#learned

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


23. How to convert year-month string to dates corresponding to the 4th day of the month?

In [103]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
from dateutil.parser import parse 
set_set=ser.map(lambda x:parse(x).replace(day=4))
print("dates taht start with 4th of each month:",set_set)

dates taht start with 4th of each month: 0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]


24. How to filter words that contain atleast 2 vowels from a series?

In [104]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
res= ser[ser.str.findall(r'[aeiouAEIOU]').str.len()>=2]
print(res)

0     Apple
1    Orange
4     Money
dtype: object


25. How to filter valid emails from a series?

In [107]:
import re

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
l=[]
for i in range(len(emails)):
    if "@" in emails[i]:
        l.append(emails[i])
print(l)
#or
print(emails.str.findall(pattern,re.IGNORECASE))

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']
0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object


26. How to get the mean of a series grouped by another series?

In [114]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())
fruits_weights_df=pd.DataFrame({"fruits":fruit,"weights":weights})
print(fruits_weights_df)
fruits_weights_df.groupby("fruits").mean("weights")


[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'apple', 'banana', 'banana', 'apple', 'apple', 'carrot', 'apple', 'carrot', 'apple']
   fruits  weights
0   apple      1.0
1   apple      2.0
2  banana      3.0
3  banana      4.0
4   apple      5.0
5   apple      6.0
6  carrot      7.0
7   apple      8.0
8  carrot      9.0
9   apple     10.0


Unnamed: 0_level_0,weights
fruits,Unnamed: 1_level_1
apple,5.333333
banana,3.5
carrot,8.0


27. How to compute the euclidean distance between two series?WITHOUT USING PACKAGED FORMULA(SQRT(X2-X1)**2+(Y2-Y1)**2)

In [121]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
e_dist=np.linalg.norm(p-q)
print(e_dist)

18.16590212458495


28. How to find all the local maxima (or peaks) in a numeric series?

In [123]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
# res=[]
# for i in range(len(ser)):
#     if ser[i-1]<ser[i] and ser[i+1]<ser[i]:
#         res.append(ser.index(i))
# print(res)
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs


array([1, 5, 7])

29. How to replace missing spaces in a string with the least frequent character?

In [132]:

from collections import Counter
my_str = 'dbc deb abed gade'
res= Counter(my_str)
res=min(res,key=res.get)
print(my_str.replace(" ",res))


dbccdebcabedcgade


30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [139]:
dates=pd.date_range("2000-01-01",periods=10,freq="W-SAT")
values=np.random.randint(0,10,size=10)

df=pd.Series(values,index=dates)
print(df)

2000-01-01    3
2000-01-08    8
2000-01-15    2
2000-01-22    7
2000-01-29    7
2000-02-05    0
2000-02-12    9
2000-02-19    6
2000-02-26    7
2000-03-04    5
Freq: W-SAT, dtype: int64
