In [34]:
import pandas as pd
import numpy as np
import yaml

## Table of Contents
***
#### [1. Replace column values based on another DataFrame (pandas)](#1)
#### [2. Drop NaN values](#2)
#### [3. Select elements of a list based on another True/False list](#3)
#### [4. Flatten a list of lists](#4)
#### [5. Filter column names](#5)
#### [6. Filter rows with str.match or str.contains](#6)
#### [7. Replace NaN in a column with values from another column](#7)
#### [8. Convert a string representation of a dictionary to a dictionary](#8)
#### [9. Flatten a dict-like column into multiple columns](#9)

<a id=1></a>
# 1. Replace column values based on another DataFrame (pandas)

In [49]:
# Frame to be patched: one row per record, with three 0/1 flag columns.
# Note that the name "Y" appears twice.
df1 = pd.DataFrame({
    "Name": ["X", "Y", "Z", "Y"],
    "Nonprofit": [1, 0, 0, 0],
    "Business": [1, 1, 0, 0],
    "Education": [0, 0, 0, 0],
})

# Replacement values keyed by "Name"; this frame has no "Business" column.
df2 = pd.DataFrame({
    "Name": ["Y", "Z"],
    "Nonprofit": [1, 1],
    "Education": [1, 1],
})

In [5]:
# Show df1 as this cell's output.
df1

Unnamed: 0,Name,Nonprofit,Business,Education
0,X,1,1,0
1,Y,0,1,0
2,Z,0,0,0
3,Y,0,0,0


In [6]:
# Show df2 as this cell's output.
df2

Unnamed: 0,Name,Nonprofit,Education
0,Y,1,1
1,Z,1,1


In [33]:
# 1.1_method1: using DataFrame.update()
# update() aligns on index and column labels, so both frames are keyed by
# 'Name' for the call. set_index() returns new frames, which leaves df1 and
# df2 untouched: the cell can be re-run, and the other method still starts
# from the unmodified inputs.
df1_updated = df1.set_index('Name')
df1_updated.update(df2.set_index('Name'))  # in place: overwrite with df2's non-NA values
df1_updated = df1_updated.reset_index()
df1_updated

In [40]:
# 1.2_method2: merge
# Left-join df2 onto df1. Columns present in both frames come back suffixed
# _x (df1's value) and _y (df2's value).
merged = df1.merge(df2, how="left", on='Name')
# Rows with no partner in df2 have NaN in the _y columns; fall back to the _x value there.
patched = merged.assign(
    Nonprofit_y=merged['Nonprofit_y'].fillna(merged['Nonprofit_x']),
    Education_y=merged['Education_y'].fillna(merged['Education_x']),
)
# Discard the superseded _x columns and give the patched ones their plain names back.
df1 = patched.drop(["Education_x", "Nonprofit_x"], axis=1).rename(
    columns={'Education_y': 'Education', 'Nonprofit_y': 'Nonprofit'}
)

<a id=2></a>
# 2. Drop NaN values

In [57]:
# 10x3 frame of standard-normal draws from a seeded generator, so the values
# are identical on every run of the notebook.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((10, 3)))
# Blank out values at regular strides so each column has a different NaN pattern.
df.iloc[::2, 0] = np.nan  # column 0: rows 0, 2, 4, 6, 8
df.iloc[::4, 1] = np.nan  # column 1: rows 0, 4, 8
df.iloc[::3, 2] = np.nan  # column 2: rows 0, 3, 6, 9

In [60]:
# Alternative ways of discarding rows with missing values. None of them
# modifies df; each expression evaluates to a new frame.

# Default (how='any'): drop every row that contains at least one NaN.
df.dropna()

# Drop only the rows in which all values are NaN.
df.dropna(how='all')

# Keep the rows that have at least 2 non-NaN values.
df.dropna(thresh=2) 

# Drop the rows whose column 1 is NaN; the other columns are not inspected.
df.dropna(subset=[1]) 
# Three boolean-mask spellings that keep the rows where column 2 is not NaN.
df[df[2].notnull()]
df[~df[2].isnull()]
df[~np.isnan(df[2])]

Unnamed: 0,0,1,2
1,-2.966789,1.179701,1.428787
2,,-0.886587,1.549733
3,-0.744843,0.823238,
5,-1.39845,1.236819,-0.324916
6,,0.627484,
7,-1.744728,-0.077998,-0.306381
9,1.655729,0.98749,


<a id=3></a>
# 3. Select elements of a list based on another True/False list

In [80]:
from itertools import compress

xs = ['sth1','sth2','sth3','sth4']
ys = [True, False, True, False]  # mask: keep xs[i] where ys[i] is true

# 3.1_method1: zip()
# Test the flag's truthiness directly (not `== True`); compress() below
# treats the mask the same way.
result = [x for x, keep in zip(xs, ys) if keep]
print(result)
# 3.2_method2: np.array()
# Indexing an array with a boolean array of the same length keeps the True positions.
np_xs = np.array(xs)
np_ys = np.array(ys)
print(np_xs[np_ys].tolist())
# 3.3_method3: compress
print(list(compress(xs, ys)))

['sth1', 'sth3']
['sth1', 'sth3']
['sth1', 'sth3']


<a id=4></a>
# 4. Flatten a list of lists

In [4]:
import itertools

# Two ranges stand in for the inner lists.
list_of_lists = [range(4), range(7)]
print(list_of_lists)

# Approach 1: append each inner sequence's items to one running list.
flattened_list = []
for inner in list_of_lists:
    flattened_list.extend(inner)
print(flattened_list)

# Approach 2: let itertools chain the inner sequences end to end.
flattened_list = list(itertools.chain.from_iterable(list_of_lists))
print(flattened_list)

[range(0, 4), range(0, 7)]
[0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6]
[0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6]


<a id=5></a> 
# 5. Filter column names

In [18]:
import pandas as pd
# DataFrame.filter reference:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html
data = {
    'spike-2': [1, 2, 3],
    'hey spke': [4, 5, 6],
}
df = pd.DataFrame(data)

# like='spike' keeps the columns whose label contains the substring 'spike'.
# 'hey spke' does not contain it, so only 'spike-2' is printed.
spike_columns = df.filter(like='spike')
print(spike_columns)

   spike-2
0        1
1        2
2        3


<a id=6></a>
# 6. Filter rows with str.match or str.contains

In [10]:
data = ["1345677+@gmail.com", "2345678+556@gmail.com",
        "Testing+22@gmail.com", "test223+22@gmail.com"]
df = pd.DataFrame(data, columns=["email"])

# Raw strings keep the regex backslashes (\d, \+) out of Python's own
# string-escape processing.
# str.match anchors at the start of each value: digits, a literal '+', then anything.
by_match = df[df.email.str.match(r"\d+\+.*")]
# str.contains searches anywhere in the value, so '^' is needed for the same anchoring.
by_contains = df[df.email.str.contains(r'^\d+\+.*\@')]

# Both filters keep the two addresses that start with digits followed by '+'.
assert by_match.equals(by_contains)
by_contains

<a id=7></a>
# 7. Replace NaN in a column with values from another column

In [51]:
# The first two rows have no Temp reading. Write the gap explicitly as NaN so
# every row supplies one value per column.
ts_df = pd.DataFrame([[1, "YesQ", 75, np.nan],
                      [1, "NoR", 115, np.nan],
                      [1, "NoT", 63, 13],
                      [2, "YesT", 43, 71]],
                     columns=['File','heat','Farheit','Temp'])
ts_df

Unnamed: 0,File,heat,Farheit,Temp
0,1,YesQ,75,
1,1,NoR,115,
2,1,NoT,63,13.0
3,2,YesT,43,71.0


In [16]:
# 7.1_method1: .fillna()
# fillna() with a Series fills each missing Temp from the Farheit value at the
# same index label. The result goes into a new frame, so ts_df keeps its
# 'Farheit' column and stays usable as input for the alternative methods.
ts_fillna = ts_df.assign(Temp=ts_df['Temp'].fillna(ts_df['Farheit'])).drop(columns='Farheit')
ts_fillna

In [21]:
# 7.2_method2: .loc replace
# Work on a copy so ts_df itself is not modified by this cell.
ts_loc = ts_df.copy()
missing_temp = ts_loc['Temp'].isna()
# Overwrite only the rows whose Temp is missing with that row's Farheit value.
ts_loc.loc[missing_temp, 'Temp'] = ts_loc.loc[missing_temp, 'Farheit']
ts_loc = ts_loc.drop(columns='Farheit')
ts_loc

In [26]:
# 7.3_method3: .apply() with np.isnan()
# Row by row (axis=1): take Farheit when Temp is NaN, otherwise keep Temp.
temp_or_farheit = ts_df.apply(
    lambda row: row.Farheit if np.isnan(row.Temp) else row.Temp, axis=1)
# Build a new frame instead of editing ts_df, so this cell does not depend on
# (or break) the other method cells.
ts_apply = ts_df.assign(Temp=temp_or_farheit).drop(columns='Farheit')
ts_apply

<a id=8></a>
# 8. Convert a string representation of a dictionary to a dictionary

In [52]:
# A dict written out as text: at this point it is a plain str, not a dict.
s = "{'muffin' : 'lolz', 'foo' : 'kitty'}"
print(type(s), s, sep='\n')

<class 'str'>
{'muffin' : 'lolz', 'foo' : 'kitty'}


In [30]:
# 8.1_method1: ast.literal_eval()
import ast

# literal_eval() accepts only Python literal syntax (strings, numbers, dicts,
# lists, ...), so unlike eval() it does not execute code found in the text.
# Parse once and keep the result instead of parsing the same literal twice.
dict_literal = "{'muffin' : 'lolz', 'foo' : 'kitty'}"
parsed_ast = ast.literal_eval(dict_literal)
print(type(parsed_ast), parsed_ast)

print('\n')

<class 'dict'> {'muffin': 'lolz', 'foo': 'kitty'}




In [41]:
# 8.2_method2: json.loads()
import json
# JSON only accepts double-quoted strings, so swap the quote characters first.
# NOTE: this blanket replace also rewrites any apostrophe inside a key or value.
json_acceptable_string = s.replace("'", "\"")
print(json_acceptable_string)
print(type(json_acceptable_string), json_acceptable_string)
parsed_json = json.loads(json_acceptable_string)
print(type(parsed_json), parsed_json)

print('\n')

{"muffin" : "lolz", "foo" : "kitty"}
<class 'str'> {"muffin" : "lolz", "foo" : "kitty"}
<class 'dict'> {'muffin': 'lolz', 'foo': 'kitty'}




In [44]:
# 8.3_method3: yaml.load()
# The dict literal is also accepted by the YAML parser, which returns a dict.
# NOTE(review): for text from an untrusted source prefer yaml.safe_load().
parsed_yaml = yaml.load(s, Loader=yaml.FullLoader)
print(type(parsed_yaml), parsed_yaml)

<class 'dict'> {'muffin': 'lolz', 'foo': 'kitty'}


In [None]:
# 8.4_method4: manual parsing with str.split()
# Splitting on the quote character yields the quoted words interleaved with the
# punctuation between them ('{', ' : ', ', ', '}'). Keeping only the
# alphanumeric pieces leaves key, value, key, value, ...
# NOTE: a key or value containing any non-alphanumeric character is dropped too.
elems = [piece for piece in s.split("'") if piece.isalnum()]
print(elems)
keys = elems[0::2]     # even positions: keys
values = elems[1::2]   # odd positions: values
d = dict(zip(keys, values))
print(type(d), d)

<a id=9></a>
# 9. Flatten a dict-like column into multiple columns

In [None]:
# 1. pd.Series
# Small self-contained example: `output` holds a dict-valued column 'ab'.
# Substitute your own frame here.
output = pd.DataFrame({'id': [1, 2],
                       'ab': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
df = output.drop(columns='ab')
# apply(pd.Series) turns each dict into a row whose columns are the dict's keys.
df[['a', 'b']] = output['ab'].apply(pd.Series)[['a', 'b']]
df

In [None]:
# 2. json_normalize
# Small self-contained example: `output` holds a dict-valued column 'ab'.
# Substitute your own frame here.
output = pd.DataFrame({'id': [1, 2],
                       'ab': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
df = output.drop(columns='ab')
# json_normalize is reached through the pandas namespace; it builds one row per
# dict, with one column per key.
normalized = pd.json_normalize(output['ab'].tolist())
normalized.index = output.index  # line the rows up with `output` before assigning
df[['a', 'b']] = normalized[['a', 'b']]
df