<a href="https://colab.research.google.com/github/Saifullah785/python-data-science-handbook-notes/blob/main/03_06_Concat_And_Append.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Combining Datasets: Concat and append**

In [64]:
# Import necessary libraries: pandas for data manipulation and numpy for numerical operations
import pandas as pd
import numpy as np

In [65]:
# Define a function to quickly create a DataFrame
def make_df(cols, ind):
  """Quickly make a DataFrame"""
  # Create a dictionary where keys are column names and values are lists of strings
  data = {c: [str(c) + str(i) for i in ind]
          for c in cols}
  # Create and return a pandas DataFrame from the dictionary
  return pd.DataFrame(data, ind)

# example DataFrame: create a DataFrame with columns 'A', 'B', 'C' and index 0, 1, 2
make_df('ABC', range(3))
#

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [66]:
# Define a class to display HTML representation of multiple objects
class display(object):
    """Display HTML representation of multiple objects"""
    # HTML template for displaying objects side by side
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    # Initialize the class with a variable number of arguments (object names as strings)
    def __init__(self, *args):
        self.args = args

    # Method for displaying HTML representation in environments like Jupyter/Colab
    def _repr_html_(self):
        # Generate HTML for each object by evaluating its string name and getting its HTML representation
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    # Method for displaying the standard representation of the objects
    def __repr__(self):
        # Generate standard representation for each object
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

# **Recall: Concatenation of Numpy Arrays**

In [67]:
# Create three lists
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]

# Concatenate the three lists using numpy.concatenate
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [68]:
# Create a 2x2 numpy array
x = [[1, 2],
     [3, 4]]

# Concatenate the array with itself along axis 1 (columns)
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

# Simple Concatenation with pd.concat

The pd.concat function provides a similar syntax to np.concatenate but contains a number of options that we'll discuss momentarily:

# Signature in Pandas v1.3.5
pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False,
          sort=False, copy=True)
          
pd.concat can be used for a simple concatenation of Series or DataFrame objects, just as np.concatenate can be used for simple concatenations of arrays:

In [69]:
# Create two pandas Series with specified indices
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
# Concatenate the two Series using pd.concat
pd.concat([ser1, ser2])

Unnamed: 0,0
1,A
2,B
3,C
4,D
5,E
6,F


In [70]:
# Create two DataFrames using the make_df function
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
# Display the two DataFrames and the result of concatenating them
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [71]:
# Create two DataFrames with different columns but the same index
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
# Display the two DataFrames and the result of concatenating them along axis 1 (columns)
display('df3', 'df4', "pd.concat([df3, df4], axis=1)")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


# **Duplicate Indices**

In [72]:
# Create two DataFrames
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
# Assign the index of x to y, creating duplicate indices
y.index = x.index  # make duplicate indices!
# Display the two DataFrames and the result of concatenating them with duplicate indices
display('x', 'y', 'pd.concat([x, y])')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


**Treating repeated indices as an error**

In [73]:
# Attempt to concatenate DataFrames with duplicate indices and verify integrity
try:
  pd.concat([x, y], verify_integrity=True)
# Catch the ValueError that occurs due to duplicate indices
except ValueError as e:
  # Print the error message
  print("ValueError:", e)

ValueError: Indexes have overlapping values: Index([0, 1], dtype='int64')


# **Ignoring the index**

In [74]:
# Display the two DataFrames and the result of concatenating them while ignoring the original index
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


# **Adding MultiIndex Keys**

In [75]:
# Display the two DataFrames and the result of concatenating them with multi-index keys
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


#**Concatenation with Joins**

In [76]:
# Create two DataFrames with some overlapping and some different columns
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
# Display the two DataFrames and the result of concatenating them (outer join by default)
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [77]:
# Display the two DataFrames and the result of concatenating them using an inner join
display('df5', 'df6',
        "pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [78]:
# Concatenate df5 and df6 after reindexing df6 to match the columns of df6 (no change in this case)
pd.concat([df5, df6.reindex(df6.columns, axis=1)])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


# **The append Method**

In [79]:
# Display df1, df2, and the result of concatenating them using pd.concat
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
