# Pandas Questions

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Build a Series that maps each lowercase letter to its 0-based alphabet position.
# The letters are spelled in true alphabetical order so that 'd' -> 3 and 'e' -> 4.
mylist = list('abcdefghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

#### How to convert the index of a series into a column of a dataframe?

In [None]:
# Move the index into a regular column: to_frame() keeps the values as a
# column, and reset_index() turns the former index labels into a column too.
ser.to_frame().reset_index()

#### Using List operator for strings

In [None]:
['abc']

In [None]:
list('abc')

#### How to create lags and leads of a column in a dataframe?

In [None]:
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))

In [None]:
df

In [None]:
df.a.shift(1)

#### How to get the n’th largest value of a column when grouped by another column?

In [None]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

In [None]:
df

In [None]:
df[df.groupby('fruit',).cumcount() == 1]

#### How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [None]:
# Draw from a dedicated, seeded generator so the sample is reproducible.
# (Constructing RandomState(100) and discarding it does not seed anything.)
rng = np.random.RandomState(100)
ser = pd.Series(rng.randint(1, 5, [12]))

In [None]:
ser.value_counts()[:2]

In [None]:
np.where(ser.isin(ser.value_counts().index[:2].tolist()),ser,"other")

#### How to find the positions of numbers that are multiples of 3 from a series?

In [None]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

In [None]:
np.where(ser%3 == 0)

In [None]:
# Pass the underlying boolean ndarray: handing a Series to np.argwhere can
# fail on some NumPy/pandas combinations when NumPy tries to re-wrap the result.
np.argwhere((ser % 3 == 0).values)

#### How to reshape a dataframe

- `reshape` is a NumPy array method, so reshape `df.values` and wrap the result in a new DataFrame

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10,-1))
df.head(2)

In [None]:
pd.DataFrame(df.values.reshape(5,20))

#### How to replace both the diagonals of dataframe with 0

In [None]:
df

In [None]:
for x,y in list(zip(df.index,df.columns)):
    df.iloc[x,y] = 0
    
df

In [None]:
for x,y in list(zip(df.index,-df.columns-1)):
    df.iloc[x,y] = 0
df

In [None]:
for i in range(df.shape[0]):
    df.iat[i, i] = 0
    df.iat[df.shape[0]-i-1, i] = 0

#### How to create a column that contains the penultimate value in each row?

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

In [None]:
df.apply(lambda x:x.sort_values().iloc[-2],axis = 1)

#### Which column contains the highest number of row-wise maximum values?

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
df

In [None]:
# Boolean mask of the cells that equal their row's maximum. Comparing against
# df.max(axis=1) still flags the maximum when a row has ties (an averaged rank
# would fall below df.shape[1]) and avoids shadowing the builtin `bool`.
is_row_max = df.eq(df.max(axis=1), axis=0)
# Column-wise count of how many rows have their maximum in that column.
is_row_max.sum()

In [None]:
df.apply(np.argmax,axis =1).value_counts()

#### Which column contains the highest row-wise maximum values?

In [None]:
### np.argmax is a numpy function 
x = df[[1,2]]
np.argmax(x.values,axis=0)

In [None]:
df.apply(np.argmax,axis =1)

#### Which column (label, not value) holds the 2nd highest value in each row?

In [None]:
np.random.seed(123)

df = pd.DataFrame(np.random.choice(100,50).reshape(5,10),columns=list('abcdefghij'))
df

In [None]:
df.rank(method='first',axis =1)

In [None]:
df.columns[np.where(df.rank(method='first',axis =1) == 9)[1]]

In [None]:
# alternative
df.apply(lambda x:x.sort_values(ascending = False).iloc[[1]].index[0],axis = 1)

#### How to reverse the rows of a dataframe?

In [None]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))
df

In [None]:
df.iloc[::-1, :]

#### How to reshape a dataframe to the largest possible square after removing the negative values? ----- LEFT

In [None]:
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
df

- flatten makes a 2d array to 1d array

In [None]:
 df[df > 0].values

In [None]:
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]
arr_qualified

In [None]:
ser = pd.Series(np.logspace(-2, 2, 30))
ser

#### How to get the last n rows of a dataframe with row sum > 100?

In [None]:
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
df

In [None]:
df[df.apply(sum,axis=1) > 100]

#### How to find the position of the nth largest value greater than a given value?

In [None]:
ser = pd.Series(np.random.randint(1, 100, 15))
ser

In [None]:
ser[ser > ser.mean()].sort_values().index[1]

In [None]:
np.argwhere(ser > ser.mean())[1]

#### How to get the row number of the nth largest value in a column?

In [None]:
np.random.seed(123)
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
df

In [None]:
df.apply(np.argsort,axis = 1)

In [None]:
### Argsort() -- imp
df[df.a.argsort() ==5]

In [None]:
df.apply(lambda x:x.sort_values(ascending = False).iloc[[1]].index[0],axis = 0)

In [None]:
df.a.argsort()[::-1][5]

#### How to create a primary key index by combining relevant columns?

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv',sep="," ,usecols=[0,1,2,3,5])
df.head(2)

In [None]:
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
df.head(2)

#### How to filter every nth row in a dataframe?

In [None]:
range(20)

In [None]:
df = pd.DataFrame(range(20),columns=['A'])
df.head(2)

In [None]:
df.iloc[np.where(df.index%20 == 0)].head(2)

- using :: in pandas

https://stackoverflow.com/questions/3453085/what-is-double-colon-in-python-when-subscripting-sequences

s[i:j:k] is, according to the documentation, "slice of s from i to j with step k". When i and j are absent, the whole sequence is assumed and thus s[::k] means "every k-th item".

In [None]:
df.iloc[::-2,:]

#### How to convert the first character of each element in a series to uppercase?

In [None]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

In [None]:
ser.str.title()

####  How to get the positions of items of series A in another series B?

In [None]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

# Solution 2
[pd.Index(ser1).get_loc(i) for i in ser2]

In [None]:
for index,value in zip(ser1.index,ser1):
    if value in ser2.values:
        print(index,value)

In [None]:
for x in set(ser1) & set(ser2):
    print(x)
    #print(ser1[ser1 == x].index[0])

In [None]:
li = []
for x in ser2:
    t = pd.Index(ser1).get_loc(x)
    li.append(t)

In [None]:
li

#### How to compute difference of differences between consecutive numbers of a series?

In [None]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
ser

In [None]:
# First-order differences, then the difference of those differences.
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

#### How to replace missing spaces in a string with the least frequent character?

In [None]:
y_str = 'dbc deb abed gade'
y_str

In [None]:
x = pd.Series(list(y_str))
y = np.where(x == ' ',x.value_counts().index[-1],x).tolist()
np.hstack(y)

#### How to import only specified columns from a csv file?

In [None]:
# Load only the two requested columns. The result is bound to `df`, the name
# printed on the next line, so the preview shows the freshly read data rather
# than whatever DataFrame an earlier cell left behind.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', usecols=['crim', 'medv'])
print(df.head())

#### How to change the order of columns of a dataframe?

In [None]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df

In [None]:
def switch_columns(df, col1=None, col2=None):
    """Return ``df`` selected with the positions of two columns exchanged.

    Args:
        df: DataFrame whose column order is to be rearranged.
        col1: Label of the first column to move.
        col2: Label of the second column to move.

    Returns:
        The result of indexing ``df`` with the reordered list of labels.

    Raises:
        ValueError: If ``col1`` or ``col2`` is not among the column labels.
    """
    order = df.columns.tolist()
    pos1 = order.index(col1)
    pos2 = order.index(col2)
    # Classic three-step swap of the two labels inside the ordering list.
    moved = order[pos1]
    order[pos1] = order[pos2]
    order[pos2] = moved
    return df[order]

In [None]:
df1 = switch_columns(df, 'a', 'c')
df1

#### How to swap two rows of a dataframe?

In [None]:
np.random.seed(123)
df = pd.DataFrame(np.arange(25).reshape(5, -1))

df

In [None]:
np.random.seed(123)
def swap_rows(df, i1, i2):
    """Exchange two rows of ``df`` by position, in place.

    Args:
        df: DataFrame to modify; it is mutated and also returned.
        i1: Integer position of the first row.
        i2: Integer position of the second row.

    Returns:
        The same ``df`` object, with the two rows exchanged.
    """
    # Snapshot both rows first so the second write does not read overwritten data.
    first = df.iloc[i1, :].copy()
    second = df.iloc[i2, :].copy()
    df.iloc[i1, :] = second
    df.iloc[i2, :] = first
    return df

swap_rows(df, 1, 2)

#### How to create a column that contains the penultimate value in each row?

In [None]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

In [None]:
# using sort_values & iloc(not loc)
df.apply(lambda x:x.sort_values().iloc[-2],axis= 1)

In [None]:
# using np.sort
df.apply(lambda x:np.sort(x)[-2],axis= 1)

In [None]:
out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
df['penultimate'] = out
print(df)

#### How to get the positions where values of two columns match?

In [None]:
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})

# Solution
np.where(df.fruit1 == df.fruit2)

In [None]:
# Pass the underlying boolean ndarray: handing a Series to np.argwhere can
# fail on some NumPy/pandas combinations when NumPy tries to re-wrap the result.
np.argwhere((df.fruit1 == df.fruit2).values)

####  How to get the frequency of unique values in the entire dataframe?

In [None]:
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns=list('abcd'))
print(df)
# Solution: flatten every cell into a single Series and count occurrences.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
pd.Series(df.values.ravel()).value_counts()

# 100 pandas puzzles

https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles.ipynb

#### 27. A DataFrame has a column of groups 'grps' and a column of numbers 'vals'. For example:

In [None]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]})
df

In [None]:
grp = df.sort_values(['grps','vals']).groupby('grps').cumcount() 
df[grp > 1].groupby('grps').sum()

#### 30. Consider a DataFrame containing rows and columns of purely numerical data. Create a list of the row-column index locations of the 3 largest values.

In [None]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(1,500,50).reshape(10,5))
df

In [None]:
## column * row

df.unstack().sort_values()

In [None]:
df.unstack().sort_values()[-3:].index.tolist()

#### 32. Implement a rolling mean over groups with window size 3, which ignores NaN value. For example consider the following DataFrame:

In [None]:
df = pd.DataFrame({'group_': list('aabbabbbabab'),
                       'value': [1, 2, 3, np.nan, 2, 3, 
                                 np.nan, 1, 7, 3, np.nan, 8]})

In [None]:
df

In [None]:
# Rolling mean (not sum) computed separately inside each group over a 3-row
# window. min_periods=1 lets a window produce a value as soon as it holds one
# non-missing observation, so NaN entries are skipped rather than forward-filled.
grouped_mean = df.groupby('group_')['value'].rolling(3, min_periods=1).mean()
# Drop the group level added by groupby and restore the original row order.
grouped_mean.reset_index(level=0, drop=True).sort_index()

In [None]:
pd.DataFrame(np.where(df.group_ == 'a',df.value.fillna(3),np.where(df.group_ == 'b',df.value.fillna(3.6),"")),columns=['filler']).T

#### 42. In the Airline column, you can see some extra punctuation and symbols have appeared around the airline names. Pull out just the airline name. E.g. '(British Airways. )' should become 'British Airways'.

In [None]:
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 
                               'Budapest_PaRis', 'Brussels_londOn'],
              'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
              'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', 
                               '12. Air France', '"Swiss Air"']})

In [None]:
df

## ---------------------------------------- OTHER ---------------------------------------------------------------

#### Constructing pandas DataFrame from values in variables gives “ValueError: If using all scalar values, you must pass an index”

In [None]:
df = pd.DataFrame({'A': [2], 'B': [1]})
df

-  You must pass an index when all the values are scalars

In [None]:
df = pd.DataFrame({'A': 2, 'B': 1}, index=[0])
df

#### Re-ordering columns in pandas dataframe based on column name

In [None]:
df

In [None]:
df = pd.DataFrame(data = 1,index = [0],columns= list('eafvghtj'))
df

In [None]:
df.sort_index(axis=1)

#### Using the SQL partitioning concept in Python

In [None]:
df1 = pd.DataFrame( { 
    "Name" : ["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
    "City" : ["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"] } )

df1

In [None]:
df1.groupby(['Name','City']).nunique()

In [None]:
pd.DataFrame({'count' : df1.groupby( [ "Name", "City"] ).size()}).reset_index()

#### Concatenating based on column values

In [None]:
df1['concat'] = df1['Name'] + "-" + df1['City']
df1

#### How to view all columns in a pandas dataframe

In [None]:
import pandas as pd
#pd.set_option('display.max_rows',25)
pd.set_option('display.max_columns',100 )
pd.set_option('display.width', 25)

In [None]:
pd.DataFrame(dict(zip(np.arange(100),list('a'*100))),index = np.arange(5))

#### Dropping rows with NULL values

In [None]:
df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'), 
                   'vals': [12,345,3,1,45,np.nan,4,52,np.nan,23,235,21,57,3,87]})
df

In [None]:
df[df.isnull().any(axis = 1)]

#### `np.where` vs `apply(lambda x: ... if ... else ...)`

In [None]:
# NOTE(review): `cust2` is not defined anywhere in the visible notebook; this
# cell presumably expects a DataFrame with a numeric `age` column — confirm.
# Vectorised bucketing: nested np.where picks 'senior' (> 50), 'young' (> 30), else 'teen'.
ser = np.where(cust2.age> 50 , 'senior', np.where(cust2.age>30, 'young','teen'))
ser1=  pd.Series(ser)


cust2['dummy_np.where'] = ser

# Same bucketing rules, evaluated one element at a time with a conditional expression.
t = cust2.age.apply(lambda x: 'senior' if x > 50 else ( 'young' if x > 30 else 'teen'))
# print(t)
cust2['dummy_lambda']  = t

cust2.head()

#### Mapped Aggregation

In [None]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

In [None]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()

#### AND vs &

In [None]:
True and False # work for only 1 argument at a time

In [None]:
ser  = pd.Series([1,0,1])
ser
(ser > 1) & (ser < 2) # works for an array of booleans

In [None]:
try:
    # `and` needs a single truth value, which a multi-element Series cannot supply.
    (ser > 1) and (ser < 2)
except ValueError:
    # Catch only the ambiguity error rather than swallowing every exception.
    print('error')

#### Creating a mapper for each inflection value

In [None]:
df = pd.DataFrame({'col':list('aaaaabbbbaaaabbbb')})
df

In [None]:
df['mapper'] = df.ne(df.shift(1)).cumsum()

In [None]:
df

#### Bootstrapping in Pandas

In [None]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

In [None]:
df.sample(n=10,replace=True)

#### Iterating over rows - iterrows

In [None]:
df.iterrows()

In [None]:
df

In [None]:
for x in df.iterrows():
    print(x)

In [None]:
# iterrows() yields (index_label, row_Series) pairs, so read the column value
# from the row itself instead of using the row as a key into df.data1.
for idx, row in df.iterrows():
    print(idx, row['data1'])

#### Rolling mean with a window of 3

In [None]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'key': list('A'*3 + 'B'*4 + 'C'*3),
                   'data1': np.arange(10)})
df

In [None]:
df['data1'].rolling(window= 3).mean()

#### Ranking

In [None]:
df.rank()

#### Converting a Dataframe to a dictionary/Json

In [None]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'key': list('A'*3 + 'B'*4 + 'C'*3),
                   'data1': np.arange(10)})
df

In [None]:
df.to_dict()

In [None]:
d = df.to_dict()
for x,y in d.items():
    print(x)

In [None]:
d['key']

#### Imputing multiple columns @ once

In [None]:
df = pd.DataFrame({'A':[1,2,np.nan],'B':[1,4,np.nan],'C':[1,5,np.nan]})
df

In [None]:
df.mean()

In [None]:
df.fillna(df.mean())

#### Numpy -  Append / Concatenate / vstack / hstack / column_stack

- Append takes in argument directly as array or list

In [None]:
np.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0)

In [None]:
a = [[1, 2, 3], [4, 5, 6]]
b= [[7, 8, 9]]

- Takes argument as tuple

In [None]:
np.vstack((a,b))

In [None]:
np.hstack(([1,2,3],[5,6,7]))

In [None]:
np.column_stack(([1,2,3],[5,6,7]))

#### Equals vs ==

In [None]:
df1 = pd.DataFrame({'A':[1,2,np.nan]})
df2 = pd.DataFrame({'B':[1,2,np.nan]})

In [None]:
try:
    # Element-wise == requires identically labelled frames; here the column names differ.
    df1 == df2
except ValueError:
    # Catch only the label-mismatch error rather than swallowing every exception.
    print('please use df1.equals(df2)')

In [None]:
df1.equals(df2)

#### Enumerate

In [None]:
for x in enumerate([3,45,78]):
    print(x)

#### Stack & Unstack

In [None]:
rng = np.random.RandomState(0)

df = pd.DataFrame({'data2': np.arange(10)*2,
                   'data1': np.arange(10)},index = list('abcdefghij'))
df.head(2)

In [None]:
df.unstack()

In [None]:
df.stack()

#### 17. Expand a Series of lists into a DataFrame - Kevin Markham trick

In [None]:
df = pd.DataFrame({'col_one':['a', 'b', 'c'], 'col_two':[[10, 40], [20, 50], [30, 60]]})
df

In [None]:
df_new = df.col_two.apply(pd.Series)
df_new

In [None]:
a = pd.DataFrame(np.random.rand(6,4))
a

In [None]:
a['k'] = list('abacab')

In [None]:
a

In [None]:
a.groupby('k')[0].transform('sum') # works like partitiion

#### Stack vs Unstack

In [None]:
df = pd.DataFrame(np.random.rand(4,5),columns=list('abcde'))
df

In [None]:
df.corr()

In [None]:
df1 = df.corr().stack()
df1

In [None]:
df1.unstack()

#### Collections --> Counter

In [None]:
from collections import OrderedDict,Counter

Counter([1,2,3,2])

In [None]:
Counter('rohana')