In [1]:
# %load command1.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


In [2]:
df1 = pd.DataFrame({
    'name': ['A', 'B', 'C', 'D'],
    'math': [60,89,82,70],
    'physics': [66,95,83,66],
    'chemistry': [61,91,77,70]
})
df1

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70


In [3]:
df2 = pd.DataFrame({
    'name': ['E', 'F', 'G', 'H'],
    'math': [66,95,83,66],
    'physics': [60,89,82,70],
    'chemistry': [90,81,78,90]
})
df2

Unnamed: 0,name,math,physics,chemistry
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


In [4]:
# delaing with index and axis
pd.concat([df1, df2])
pd.concat([df1, df2], ignore_index=True)
pd.concat([df1, df2], axis=1)

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70
4,E,66,60,90
5,F,95,89,81
6,G,83,82,78
7,H,66,70,90


Unnamed: 0,name,math,physics,chemistry,name.1,math.1,physics.1,chemistry.1
0,A,60,66,61,E,66,60,90
1,B,89,95,91,F,95,89,81
2,C,82,83,77,G,83,82,78
3,D,70,66,70,H,66,70,90


In [5]:
# avoid duplicate indicies
try:
    pd.concat([df1, df2], verify_integrity=True)
except ValueError as e:
    print('ValueError:', e)

ValueError: Indexes have overlapping values: Int64Index([0, 1, 2, 3], dtype='int64')


In [6]:
# adding a hierarchical index with keys and names options
res=pd.concat([df1, df2], keys=['Year 1', 'Year 2'])
res
res.loc['Year 1']

Unnamed: 0,Unnamed: 1,name,math,physics,chemistry
Year 1,0,A,60,66,61
Year 1,1,B,89,95,91
Year 1,2,C,82,83,77
Year 1,3,D,70,66,70
Year 2,0,E,66,60,90
Year 2,1,F,95,89,81
Year 2,2,G,83,82,78
Year 2,3,H,66,70,90


Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70


In [7]:
pd.concat([df1, df2],
         keys=['Year 1', 'Year 2'],
         names=['Class', None])

pd.concat([df1, df2],
         keys=['Year 1', 'Year 2'],
         names=['Class', None]).reset_index(level=0)

# Pass a string to level
pd.concat(
    [df1, df2], 
    keys=['Year 1', 'Year 2'],
    names=['Class', None]).reset_index(level='Class')

Unnamed: 0_level_0,Unnamed: 1_level_0,name,math,physics,chemistry
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Year 1,0,A,60,66,61
Year 1,1,B,89,95,91
Year 1,2,C,82,83,77
Year 1,3,D,70,66,70
Year 2,0,E,66,60,90
Year 2,1,F,95,89,81
Year 2,2,G,83,82,78
Year 2,3,H,66,70,90


Unnamed: 0,Class,name,math,physics,chemistry
0,Year 1,A,60,66,61
1,Year 1,B,89,95,91
2,Year 1,C,82,83,77
3,Year 1,D,70,66,70
0,Year 2,E,66,60,90
1,Year 2,F,95,89,81
2,Year 2,G,83,82,78
3,Year 2,H,66,70,90


Unnamed: 0,Class,name,math,physics,chemistry
0,Year 1,A,60,66,61
1,Year 1,B,89,95,91
2,Year 1,C,82,83,77
3,Year 1,D,70,66,70
0,Year 2,E,66,60,90
1,Year 2,F,95,89,81
2,Year 2,G,83,82,78
3,Year 2,H,66,70,90


In [8]:
# columns matching and sorting
df1 = pd.DataFrame({
    'name': ['A', 'B', 'C', 'D'],
    'chemistry': [61,91,77,70],
    'physics': [66,95,83,66],
    'math': [60,89,82,70],
})
df1

df2 = pd.DataFrame({
    'name': ['E', 'F', 'G', 'H'],
    'math': [66,95,83,66],
    'physics': [60,89,82,70],
    'chemistry': [90,81,78,90]
})
df2

pd.concat([df1, df2])
pd.concat([df1, df2], sort=True)

Unnamed: 0,name,chemistry,physics,math
0,A,61,66,60
1,B,91,95,89
2,C,77,83,82
3,D,70,66,70


Unnamed: 0,name,math,physics,chemistry
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


Unnamed: 0,name,chemistry,physics,math
0,A,61,66,60
1,B,91,95,89
2,C,77,83,82
3,D,70,66,70
0,E,90,60,66
1,F,81,89,95
2,G,78,82,83
3,H,90,70,66


Unnamed: 0,chemistry,math,name,physics
0,61,60,A,66
1,91,89,B,95
2,77,82,C,83
3,70,70,D,66
0,90,66,E,60
1,81,95,F,89
2,78,83,G,82
3,90,66,H,70


In [9]:
# custom sort
custom_sort = ['math', 'chemistry', 'physics', 'name']
res = pd.concat([df1, df2])
res[custom_sort]

Unnamed: 0,math,chemistry,physics,name
0,60,61,66,A
1,89,91,95,B
2,82,77,83,C
3,70,70,66,D
0,66,90,60,E
1,95,81,89,F
2,83,78,82,G
3,66,90,70,H


### Loading and concatenating datasets from a bunch of CSV files

In [10]:
import os

os.listdir('sp3')

['Albany.csv',
 'Atlanta.csv',
 'BaltimoreWashington.csv',
 'Boise.csv',
 'Boston.csv',
 'BuffaloRochester.csv',
 'California.csv',
 'Charlotte.csv',
 'Chicago.csv',
 'CincinnatiDayton.csv',
 'Columbus.csv',
 'DallasFtWorth.csv',
 'Denver.csv',
 'Detroit.csv',
 'GrandRapids.csv',
 'GreatLakes.csv',
 'HarrisburgScranton.csv',
 'HartfordSpringfield.csv',
 'Houston.csv',
 'Indianapolis.csv',
 'Jacksonville.csv',
 'LasVegas.csv',
 'LosAngeles.csv',
 'Louisville.csv',
 'MiamiFtLauderdale.csv',
 'Midsouth.csv',
 'Nashville.csv',
 'NewOrleansMobile.csv',
 'NewYork.csv',
 'Northeast.csv',
 'NorthernNewEngland.csv',
 'Orlando.csv',
 'Philadelphia.csv',
 'PhoenixTucson.csv',
 'Pittsburgh.csv',
 'Plains.csv',
 'Portland.csv',
 'RaleighGreensboro.csv']

In [12]:
for dirpath, dirname, filename in os.walk('sp3'):
    print(f'{dirpath}/{dirname}/{filename}', end='')

sp3/[]/['Albany.csv', 'Atlanta.csv', 'BaltimoreWashington.csv', 'Boise.csv', 'Boston.csv', 'BuffaloRochester.csv', 'California.csv', 'Charlotte.csv', 'Chicago.csv', 'CincinnatiDayton.csv', 'Columbus.csv', 'DallasFtWorth.csv', 'Denver.csv', 'Detroit.csv', 'GrandRapids.csv', 'GreatLakes.csv', 'HarrisburgScranton.csv', 'HartfordSpringfield.csv', 'Houston.csv', 'Indianapolis.csv', 'Jacksonville.csv', 'LasVegas.csv', 'LosAngeles.csv', 'Louisville.csv', 'MiamiFtLauderdale.csv', 'Midsouth.csv', 'Nashville.csv', 'NewOrleansMobile.csv', 'NewYork.csv', 'Northeast.csv', 'NorthernNewEngland.csv', 'Orlando.csv', 'Philadelphia.csv', 'PhoenixTucson.csv', 'Pittsburgh.csv', 'Plains.csv', 'Portland.csv', 'RaleighGreensboro.csv']

In [24]:
from glob import glob

res=glob('sp3/*.csv')
res

['sp3\\Albany.csv',
 'sp3\\Atlanta.csv',
 'sp3\\BaltimoreWashington.csv',
 'sp3\\Boise.csv',
 'sp3\\Boston.csv',
 'sp3\\BuffaloRochester.csv',
 'sp3\\California.csv',
 'sp3\\Charlotte.csv',
 'sp3\\Chicago.csv',
 'sp3\\CincinnatiDayton.csv',
 'sp3\\Columbus.csv',
 'sp3\\DallasFtWorth.csv',
 'sp3\\Denver.csv',
 'sp3\\Detroit.csv',
 'sp3\\GrandRapids.csv',
 'sp3\\GreatLakes.csv',
 'sp3\\HarrisburgScranton.csv',
 'sp3\\HartfordSpringfield.csv',
 'sp3\\Houston.csv',
 'sp3\\Indianapolis.csv',
 'sp3\\Jacksonville.csv',
 'sp3\\LasVegas.csv',
 'sp3\\LosAngeles.csv',
 'sp3\\Louisville.csv',
 'sp3\\MiamiFtLauderdale.csv',
 'sp3\\Midsouth.csv',
 'sp3\\Nashville.csv',
 'sp3\\NewOrleansMobile.csv',
 'sp3\\NewYork.csv',
 'sp3\\Northeast.csv',
 'sp3\\NorthernNewEngland.csv',
 'sp3\\Orlando.csv',
 'sp3\\Philadelphia.csv',
 'sp3\\PhoenixTucson.csv',
 'sp3\\Pittsburgh.csv',
 'sp3\\Plains.csv',
 'sp3\\Portland.csv',
 'sp3\\RaleighGreensboro.csv']

In [27]:
all_csvfile=[pd.read_csv(file, encoding='utf8') for file in res]

In [28]:
dfs=pd.concat(all_csvfile)
dfs

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.00,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.00,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.00,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.00,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.00,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,7,2018-02-04,2.25,14635.63,82.78,5175.05,151.37,9226.43,9000.42,226.01,0.00,organic,2018,RaleighGreensboro
334,8,2018-01-28,1.90,13996.05,145.91,6494.37,156.02,7199.75,6924.72,275.03,0.00,organic,2018,RaleighGreensboro
335,9,2018-01-21,1.86,14627.48,167.64,5921.25,176.32,8362.27,8026.25,336.02,0.00,organic,2018,RaleighGreensboro
336,10,2018-01-14,1.53,23359.01,119.42,6900.26,219.92,16119.41,15798.76,320.65,0.00,organic,2018,RaleighGreensboro
