###Make_df function

In [6]:
import pandas as pd
import numpy as np


In [7]:
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    #print(data)
    return pd.DataFrame(data, ind)
 

##Combining DataSets

In [8]:
# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [9]:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)
    

## Concating Series

In [10]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
print(ser1)
print(ser2)
pd.concat([ser1, ser2])

1    A
2    B
3    C
dtype: object
4    D
5    E
6    F
dtype: object


1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

## Concating DataFrames

In [11]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
#display('df1', 'df2', 'pd.concat([df1, df2])')
print(df1)
print()
print(df2)
pd.concat([df1, df2])   #By default concatenation takes place row-wise within the DataFrame (i.e., axis=0)

    A   B
1  A1  B1
2  A2  B2

    A   B
3  A3  B3
4  A4  B4


Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


## Columnwise concatination

In [12]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0,1])
print(df3)
print()
print(df4)
pd.concat([df3, df4], axis=1)   #Columnwise concatination

    A   B
0  A0  B0
1  A1  B1

    C   D
0  C0  D0
1  C1  D1


Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


## Dealing with Duplicate Indices

In [13]:
x = make_df('AB', [0, 1])
y = make_df('AB', [0, 1])
print(x)
print()
print(y)
print()


    A   B
0  A0  B0
1  A1  B1

    A   B
0  A0  B0
1  A1  B1



In [14]:
#y.index = x.index  # make duplicate indices!
#print(y)
pd.concat([x,y])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


## Duplicate indices is undesirable

### Use verify_integrity to check integrity of indices else raise error

In [15]:
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


### Create a new integer index in case of duplication

In [16]:
pd.concat([x, y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


# Concatenation with joins

## Outer join = having non common + common columns

In [17]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print()
print(df6)
print()
pd.concat([df5,df6],sort=False)   #default behaviour is outer join, NaN will be used to represent missing data

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4



Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


## Inner join = common columns

In [18]:
pd.concat([df5,df6],join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


## Specifying target columns

In [19]:
print(df5)
print()
print(df6)
print()
#pd.concat([df5,df6],axis=1,join_axes=[df5.columns])
pd.concat([df5,df6.reindex(columns = df5.columns)],axis=1,ignore_index = True)



    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4



Unnamed: 0,0,1,2,3,4,5
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,,B3,C3
4,,,,,B4,C4


In [None]:
pd.concat([df5,df6],axis=1,join_axes=[df5.columns])

# Append()

In [20]:
print(df1)
print()
print(df2)
df1.append(df2)

    A   B
1  A1  B1
2  A2  B2

    A   B
3  A3  B3
4  A4  B4


Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


*   append() method in Pandas does not modify the original object–instead it **creates a new object** with the combined data
*   It also is **not a very efficient method**, because it **involves creation of a new index and data buffer**.
*   In case of multiple appends, better to use concat()



