1. How to import pandas and check the version?

In [1]:
import pandas as pd
import numpy as np

In [2]:
print(pd.__version__)

0.25.1


2. How to create a series from a list, numpy array and dict?

In [3]:
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

In [5]:
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)

In [8]:
print(ser1)
print(ser2)
print(ser3)

0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object
0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
dtype: int64
a     0
b     1
c     2
e     3
d     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int64


3. How to convert the index of a series into a column of a dataframe?

In [9]:
# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
df = ser.to_frame().reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


4. How to combine many series to form a dataframe?

In [None]:
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [13]:
df = pd.DataFrame({'col1':ser1, 'col2': ser2})
df

Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


5. How to assign name to the series’ index?

In [14]:
# Input
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# output
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

## 6. How to get the items of series A not present in series B?

In [19]:
# input
ser1 = pd.Series([1,2,3,4,5])
ser2 = pd.Series([4,5,6,7,8])

# output
ser1[~ser1.isin(ser2)]


0    1
1    2
2    3
dtype: int64

## 7. How to get the items not common to both series A and series B?

In [26]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# output
ser_u = pd.Series(np.union1d(ser1,ser2)) # union
ser_i = pd.Series(np.intersect1d(ser1, ser2)) # intersect

ser1[~ser_u.isin(ser_i)]

0    1
1    2
2    3
dtype: int64

## 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [27]:
ser = pd.Series(np.random.normal(10, 5, 25))

np.percentile(ser, q=[0,25,50,75,100])

array([ 4.83139266,  6.38250245,  8.64618273, 12.77014726, 17.6913769 ])

# 9. How to get frequency counts of unique items of a series?

In [31]:
# Input
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser

# output
ser.value_counts()

b    6
f    6
h    4
c    4
g    3
d    3
e    2
a    2
dtype: int64

## 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [23]:

ser = pd.Series([1,1,1,2,2,2,2,2,4,5,6,4,5,6])
#print(ser)

# Output
#print(ser.value_counts())
top2 = ser.value_counts()[0:2]
print(top2)
ser[~ser.isin(ser.value_counts().index[:2])] ='other'
ser

2    5
1    3
dtype: int64


0         1
1         1
2         1
3         2
4         2
5         2
6         2
7         2
8     other
9     other
10    other
11    other
12    other
13    other
dtype: object

# 11. How to convert a numpy array to a dataframe of given shape?

In [25]:
# Input
ser = pd.Series(np.random.randint(1, 10, 35))

# Output
pd.DataFrame(ser.values.reshape(7,5))

Unnamed: 0,0,1,2,3,4
0,8,7,8,8,5
1,2,7,7,1,1
2,1,7,9,3,7
3,1,9,2,9,7
4,5,3,3,8,8
5,6,2,6,4,8
6,2,6,5,2,9


# 12. How to find the positions of numbers that are multiples of 3 from a series?

In [31]:
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)
# Output
index = ser[ser%3==0].index
index

0    6
1    3
2    2
3    4
4    2
5    5
6    4
dtype: int64


Int64Index([0, 1], dtype='int64')

## 14. How to extract items at given positions from a series

In [33]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

ser[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

## 15. How to stack two series vertically and horizontally ?

In [39]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcdef'))

# Vertically
ser1.append(ser2)
# or
print(pd.concat([ser1, ser2], axis=0))

# Horizantaly
pd.concat([ser1, ser2], axis=1)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


Unnamed: 0,0,1
0,0.0,a
1,1.0,b
2,2.0,c
3,3.0,d
4,4.0,e
5,,f


## 16. How to get the positions of items of series A in another series B?

In [58]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

ser1[ser1.isin(ser2)==True].index

Int64Index([0, 4, 5, 8], dtype='int64')

## 17. How to compute the mean squared error on a truth and predicted series?

In [59]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)

0.339613541859732

## 18. How to convert the first character of each element in a series to uppercase?

In [69]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

ser.str.title()

# OR

ser.map(lambda x : x.title())


0     How
1      To
2    Kick
3    Ass?
dtype: object

## 19. How to calculate the number of characters in each word in a series?

In [75]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

## 20. How to compute difference of differences between consequtive numbers of a series?



In [76]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

ser.diff()

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    6.0
7    8.0
dtype: float64

# 21. How to convert a series of date-strings to a timeseries?

In [3]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
print(ser)
pd.to_datetime(ser)

0         01 Jan 2010
1          02-02-2011
2            20120303
3          2013/04/04
4          2014-05-05
5    2015-06-06T12:20
dtype: object


0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

# 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [14]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
date = pd.to_datetime(ser)

# day of month
date.dt.day.tolist()

# week number
date.dt.weekofyear.tolist()

# day of year
date.dt.dayofyear.tolist()

# day of week
date.dt.dayofweek.tolist()

[4, 2, 5, 3, 0, 5]

# 23. How to convert year-month string to dates corresponding to the 4th day of the month?

In [19]:
from dateutil.parser import parse
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

date = ser.map(lambda x: parse(x))
#date

output = date.dt.year.astype('str')+'-'+date.dt.month.astype('str')+'-'+'4'
output

0    2010-1-4
1    2011-2-4
2    2012-3-4
dtype: object

# 25. How to filter valid emails from a series?



In [20]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
emails

import re
pattern = '[a-zA-Z0-9]+@[a-z]+.[a-z]+'
regex = re.compile(pattern, flags=re.I)

mask = emails.map(lambda x:bool(regex.match(x)))
emails[mask]


1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

# 26. How to get the mean of a series grouped by another series?

In [22]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'banana', 'carrot', 'carrot', 'apple', 'carrot', 'carrot', 'carrot', 'apple', 'banana']


In [25]:
weights.groupby(fruit).mean()

apple     5.0
banana    6.0
carrot    5.6
dtype: float64

## 27. How to compute the euclidean distance between two series?

In [26]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

sum((p-q)**2)**0.5

18.16590212458495

## 29. How to replace missing spaces in a string with the least frequent character?

In [None]:
my_str = 'dbc deb abed gade'

# Solution
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

## 36. How to import only specified columns from a csv file?

In [None]:
df = pd.read_csv('filename', usecols=['c1', c2])

## 37. How to get the nrows, ncolumns, datatype, summary stats of each column of a dataframe? Also get the array and list equivalent.

In [None]:
#  number of rows and columns
print(df.shape)

# datatypes
print(df.dtypes)

# how many columns under each dtype
print(df.get_dtype_counts())
print(df.dtypes.value_counts())

# summary statistics
df_stats = df.describe()

# numpy array 
df_arr = df.values

# list
df_list = df.values.tolist()

## 38. How to extract the row and column number of a particular cell with given criterion?
### Which manufacturer, model and type has the highest Price? What is the row and column number of the cell with the highest Price value?

In [None]:
df.loc[df.Price==np.max(df.Price), ['Manufacturer', 'Model', 'Type']]

## 39. How to rename a specific columns in a dataframe?

In [None]:
df = df.rename(columns={"oldCol":"NewCol"})

# OR

df.columns.values[2] = "carType"


## 40. How to check if a dataframe has any missing values?

In [None]:
df.isnull().values.any()

## 41. How to count the number of missing values in each column?