#**Pandas**

**Pandas is an open source Python library for highly specialized data analysis. Currently it is the reference point that all professionals using the Python language need to study and analyze data sets for statistical purposes of analysis and decision making.**

In [2]:
import numpy as np
import pandas as pd

In [3]:
# series() for declaring series.
a = pd.Series([12, -3, 23, 9])
a


0    12
1    -3
2    23
3     9
dtype: int64

As you can see from the output of the Series, on the left there are the values in the Index, which is a
series of labels, and on the right the corresponding values.

In [4]:
# including index option for assigning an array of string containing the labels.
a = pd.Series([12, -3, 23, 9], index=['a', 'b', 'c', 'd'])
a

a    12
b    -3
c    23
d     9
dtype: int64

In [5]:
a.values

array([12, -3, 23,  9])

In [6]:
a.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
# selecting the internal element
a[2]

23

In [8]:
a['c']

23

In [9]:
a[0:2]

a    12
b    -3
dtype: int64

In [10]:
a[-1]

9

In [11]:
a[['b', 'd']]

b   -3
d    9
dtype: int64

In [12]:
# assignning Values to the element.
a[1] = 98
a

a    12
b    98
c    23
d     9
dtype: int64

In [13]:
a['c'] = -54
a

a    12
b    98
c   -54
d     9
dtype: int64

In [14]:
# defining series from numpy arrays and other series.
arr = np.array([1, 2, 3, 4, 5])
s1 = pd.Series(arr)
s1


0    1
1    2
2    3
3    4
4    5
dtype: int64

In [15]:
arr[2] = -33
arr

array([  1,   2, -33,   4,   5])

In [16]:
s1

0     1
1     2
2   -33
3     4
4     5
dtype: int64

In [17]:
# filtering values
a

a    12
b    98
c   -54
d     9
dtype: int64

In [18]:
a[a > 44]

b    98
dtype: int64

In [19]:
a[a <= 44]

a    12
c   -54
d     9
dtype: int64

Operations and mathematical functions

In [20]:
a / 2

a     6.0
b    49.0
c   -27.0
d     4.5
dtype: float64

In [21]:
np.log(a)

  result = getattr(ufunc, method)(*inputs, **kwargs)


a    2.484907
b    4.584967
c         NaN
d    2.197225
dtype: float64

In [22]:
b = pd.Series([1, 0, 2, 1, 2, 3], index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
b

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [23]:
b.unique()

array([1, 0, 2, 3])

In [24]:
b.value_counts()

2    2
1    2
3    1
0    1
dtype: int64

In [25]:
b.isin([0, 3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [26]:
b[b.isin([0, 3])]

white     0
yellow    3
dtype: int64

In [27]:
# np.NaN whenever we want to define a missing value.
b = pd.Series([5, -3, np.NaN, 14])
b

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [28]:
# isnull() and notnull() functions are very useful to identify the indexes without a value.
b.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [29]:
b.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [30]:
b[b.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [31]:
b[b.isnull()]

2   NaN
dtype: float64

###**Series as Dictionaries**

In [32]:
mydict = {'red' : 2000, 'blue': 100, 'yellow': 50000, 'orange': 1435}
myseries = pd.Series(mydict)
myseries

red        2000
blue        100
yellow    50000
orange     1435
dtype: int64

In [33]:
colors = ['red', 'yellow', 'orange', 'blue', 'green']
myseries = pd.Series(mydict, index=colors)
mydict

{'blue': 100, 'orange': 1435, 'red': 2000, 'yellow': 50000}

In [34]:
# defining dataframe using DataFrame() function
data = {'color' : ['blue', 'green', 'yellow', 'red', 'white'],
        'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
        'price' : [1.2, 3.4, 5.6, 0.8, 2.4]}

In [35]:
frame = pd.DataFrame(data)
frame


Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,3.4
2,yellow,pencil,5.6
3,red,paper,0.8
4,white,mug,2.4


In [36]:
# column option for selecting columns to show.
pd.DataFrame(data, columns=['object', 'price'])

Unnamed: 0,object,price
0,ball,1.2
1,pen,3.4
2,pencil,5.6
3,paper,0.8
4,mug,2.4


In [37]:
# if the labels are not explicitly specified within the Index array use index option.
pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,3.4
three,yellow,pencil,5.6
four,red,paper,0.8
five,white,mug,2.4


In [38]:
# using np.arange with index and column.
frame1 = pd.DataFrame(np.arange(16).reshape((4, 4)),
                     index=['red', 'blue', 'yellow', 'white'],
                     columns=['ball', 'pen', 'pencil', 'paper'])
frame1

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [39]:
frame.columns

Index(['color', 'object', 'price'], dtype='object')

In [40]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [41]:
frame.values

array([['blue', 'ball', 1.2],
       ['green', 'pen', 3.4],
       ['yellow', 'pencil', 5.6],
       ['red', 'paper', 0.8],
       ['white', 'mug', 2.4]], dtype=object)

###**Selecting Elements**

In [42]:

data['price']

[1.2, 3.4, 5.6, 0.8, 2.4]

In [43]:
frame[0:2]

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,3.4


In [44]:
frame[1:3]

Unnamed: 0,color,object,price
1,green,pen,3.4
2,yellow,pencil,5.6


In [45]:
data['object'] [3]

'paper'

###**Assigning Values**

In [46]:
# name attribute for assigning a label.
frame.index.name = 'id'; frame.columns.name = 'item'
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,3.4
2,yellow,pencil,5.6
3,red,paper,0.8
4,white,mug,2.4


In [47]:
frame['new'] = 12
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,3.4,12
2,yellow,pencil,5.6,12
3,red,paper,0.8,12
4,white,mug,2.4,12


In [48]:
frame['new'] = np.random.randint(0, 12)
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,10
1,green,pen,3.4,10
2,yellow,pencil,5.6,10
3,red,paper,0.8,10
4,white,mug,2.4,10


In [49]:
frame['new'] = [3.9, 12.3, 9.9, 65.4, 5.5]
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,3.9
1,green,pen,3.4,12.3
2,yellow,pencil,5.6,9.9
3,red,paper,0.8,65.4
4,white,mug,2.4,5.5


In [50]:
ser = pd.Series(np.arange(5))
ser

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [51]:
frame['new'] = ser
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,3.4,1
2,yellow,pencil,5.6,2
3,red,paper,0.8,3
4,white,mug,2.4,4


In [52]:
frame['price'] [2]

5.6

In [53]:
frame['price'] [2] = 3.3
frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,3.4,1
2,yellow,pencil,3.3,2
3,red,paper,0.8,3
4,white,mug,2.4,4


In [54]:
# membership of a value.
frame.isin([1.0, 'pen'])

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,False,True,False,True
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [55]:
frame[frame.isin([1.0, 'pen'])]

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,pen,,1.0
2,,,,
3,,,,
4,,,,


In [56]:
# deleting a column
del frame['new']

In [57]:
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,3.4
2,yellow,pencil,3.3
3,red,paper,0.8
4,white,mug,2.4


In [58]:
# filtering 
frame1[frame1 < 10]

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,2.0,3.0
blue,4.0,5.0,6.0,7.0
yellow,8.0,9.0,,
white,,,,


In [59]:
frame1[frame1 < 12]

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,2.0,3.0
blue,4.0,5.0,6.0,7.0
yellow,8.0,9.0,10.0,11.0
white,,,,


In [60]:
frame1[frame1 > 5]

Unnamed: 0,ball,pen,pencil,paper
red,,,,
blue,,,6.0,7.0
yellow,8.0,9.0,10.0,11.0
white,12.0,13.0,14.0,15.0


In [61]:
frame1[frame1 >= 5]

Unnamed: 0,ball,pen,pencil,paper
red,,,,
blue,,5.0,6.0,7.0
yellow,8.0,9.0,10.0,11.0
white,12.0,13.0,14.0,15.0


In [62]:
frame1[frame1 != 5]

Unnamed: 0,ball,pen,pencil,paper
red,0,1.0,2,3
blue,4,,6,7
yellow,8,9.0,10,11
white,12,13.0,14,15


###**1.Reindexing 2.dropping 3. alignment**

In [63]:
# reindexing
ser = pd.Series([2, 3, 5, 7, 4], index=['one', 'two', 'three', 'four', 'five'])
ser

one      2
two      3
three    5
four     7
five     4
dtype: int64

In [64]:
ser.reindex(['three','four','five','one' ])

three    5
four     7
five     4
one      2
dtype: int64

In [65]:
ser1 = pd.Series([1,5,3,6], index=[0, 3, 5, 6])
ser1

0    1
3    5
5    3
6    6
dtype: int64

In [66]:
# reindexing with method option.
ser1.reindex(range(6), method='ffill')

0    1
1    1
2    1
3    5
4    5
5    3
dtype: int64

In [67]:
# If you want this index value to be assigned during the interpolation, you have to use the bfill method.
ser1.reindex(range(6), method='bfill')

0    1
1    5
2    5
3    5
4    3
5    3
dtype: int64

##**Dropping**

In [68]:
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [69]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [70]:
ser.drop(['blue', 'white'])

red       0.0
yellow    2.0
dtype: float64

In [71]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                     index=['red','blue','yellow','white'],
                    columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [72]:
frame.drop(['blue', 'yellow'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
white,12,13,14,15


In [73]:
# to delete column specify axis.
frame.drop(['pen', 'pencil'], axis=1)

Unnamed: 0,ball,paper
red,0,3
blue,4,7
yellow,8,11
white,12,15


###**Functions by Element**

In [74]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index = ['red', 'yellow', 'brown', 'black'],
                    columns = ['ball', 'pen', 'paper', 'pencil'])
frame

Unnamed: 0,ball,pen,paper,pencil
red,0,1,2,3
yellow,4,5,6,7
brown,8,9,10,11
black,12,13,14,15


In [75]:
f = lambda x: x.max() - x.min()

In [76]:
# it is possible to define the function also in this way.
def f(x):
  return x.max() - x.min()

# Using the apply() function you can apply the function just defined on the DataFrame.

frame.apply(f)

ball      12
pen       12
paper     12
pencil    12
dtype: int64

In [77]:
# t if you prefer to apply the function by row instead of by column, you have to specify the axis option set to 1.

frame.apply(f, axis=1)

red       3
yellow    3
brown     3
black     3
dtype: int64

In [78]:
# apply() can returns a scalar value.
def f(x):
  return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,ball,pen,paper,pencil
min,0,1,2,3
max,12,13,14,15


In [79]:
# Statistics Functions
frame.sum()

ball      24
pen       28
paper     32
pencil    36
dtype: int64

In [80]:
frame.mean()

ball      6.0
pen       7.0
paper     8.0
pencil    9.0
dtype: float64

In [81]:
frame.max()

ball      12
pen       13
paper     14
pencil    15
dtype: int64

In [82]:
frame.std()

ball      5.163978
pen       5.163978
paper     5.163978
pencil    5.163978
dtype: float64

In [83]:
frame.min()

ball      0
pen       1
paper     2
pencil    3
dtype: int64

In [84]:
# describe() to obtain a summary.
frame.describe()

Unnamed: 0,ball,pen,paper,pencil
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


###**Sorting and Ranking**

In [85]:
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [87]:
ser.sort_index()

blue      0
green     4
red       5
white     8
yellow    3
dtype: int64

In [88]:
ser.sort_index(ascending=False)

yellow    3
white     8
red       5
green     4
blue      0
dtype: int64

In [89]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                     index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [90]:
frame.sort_index()

Unnamed: 0,ball,pen,pencil,paper
blue,4,5,6,7
red,0,1,2,3
white,12,13,14,15
yellow,8,9,10,11


In [91]:
frame.sort_index(axis=1)

Unnamed: 0,ball,paper,pen,pencil
red,0,3,1,2
blue,4,7,5,6
yellow,8,11,9,10
white,12,15,13,14


In [99]:
# The ranking is an operation closely related to sorting. The rank will be assigned starting from the lowest value to the highest value.
ser.rank()

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [100]:
ser.rank(method='first')


red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [101]:
# By default, even the ranking follows an ascending sort. To reverse this criterion, set the ascending option to False.
ser.rank(ascending=False)

red       2.0
blue      5.0
yellow    4.0
white     1.0
green     3.0
dtype: float64

###**Correlation and Covariance***

**Two important statistical calculations are correlation and covariance, expressed in pandas by the corr() and
cov() functions**

In [105]:
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
                   index=['red','blue','yellow','white'],
                    columns=['ball','pen','pencil','paper'])
frame2

Unnamed: 0,ball,pen,pencil,paper
red,1,4,3,6
blue,4,5,6,1
yellow,3,3,1,5
white,4,1,6,4


In [106]:
frame2.corr()

Unnamed: 0,ball,pen,pencil,paper
ball,1.0,-0.276026,0.57735,-0.763763
pen,-0.276026,1.0,-0.079682,-0.361403
pencil,0.57735,-0.079682,1.0,-0.692935
paper,-0.763763,-0.361403,-0.692935,1.0


In [107]:
frame2.cov()

Unnamed: 0,ball,pen,pencil,paper
ball,2.0,-0.666667,2.0,-2.333333
pen,-0.666667,2.916667,-0.333333,-1.333333
pencil,2.0,-0.333333,6.0,-3.666667
paper,-2.333333,-1.333333,-3.666667,4.666667


Using the method corrwith(), you can calculate the pairwise correlations between the columns or rows
of a data frame with a Series or another DataFrame().

In [109]:
 frame2.corrwith(ser)

ball     -0.140028
pen      -0.869657
pencil    0.080845
paper     0.595854
dtype: float64

In [110]:
frame2.corrwith(frame)

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64

##**“Not a Number” Data**

In [112]:
# assigning a NaN Value using np.NaN.
ser = pd.Series([0, 1, 2, np.NaN, 9], index=['red', 'blue', 'purple', 'brown', 'black'])
ser

red       0.0
blue      1.0
purple    2.0
brown     NaN
black     9.0
dtype: float64

In [114]:
ser['brown'] = None
ser

red       0.0
blue      1.0
purple    2.0
brown     NaN
black     9.0
dtype: float64

In [115]:
# Filtering Out NaN Values.
# dropna() drop the NaN row.
ser.dropna()

red       0.0
blue      1.0
purple    2.0
black     9.0
dtype: float64

In [116]:
# notnull() for selecting not null values.
ser[ser.notnull()]

red       0.0
blue      1.0
purple    2.0
black     9.0
dtype: float64

In [121]:
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],
index = ['blue','green','red'],
columns = ['ball','mug','pen'])
frame3

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [123]:
frame3.dropna()

Unnamed: 0,ball,mug,pen


dropna() disappear entire row and columns to avoid this use "how" option.

In [124]:
frame3.dropna(how='all') 

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
red,2.0,,5.0


***Filling in NaN Occurrences***

In [125]:
frame3.fillna(0)

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,0.0,0.0,0.0
red,2.0,0.0,5.0


In [126]:
frame3.fillna({'ball': 1, 'mug': 0, 'pen': 99})

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,1.0,0.0,99.0
red,2.0,0.0,5.0


##**Reading Data in CSV or Text Files**

In [131]:
import pandas as pd
csvframe = pd.read_csv('a.csv')
csvframe

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
0,2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,733258,ANZSIC06 divisions A-S (excluding classes K633...
1,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,660630,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,54342,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,654872,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
19812,2016,Level 3,CC71,Primary Metal and Metal Product Manufacturing,Dollars,H34,Total income per employee count,Financial ratios,765400,"ANZSIC06 groups C211, C212, C213, and C214"
19813,2016,Level 3,CC71,Primary Metal and Metal Product Manufacturing,Dollars,H35,Surplus per employee count,Financial ratios,-66700,"ANZSIC06 groups C211, C212, C213, and C214"
19814,2016,Level 3,CC71,Primary Metal and Metal Product Manufacturing,Percentage,H36,Current ratio,Financial ratios,83,"ANZSIC06 groups C211, C212, C213, and C214"
19815,2016,Level 3,CC71,Primary Metal and Metal Product Manufacturing,Percentage,H37,Quick ratio,Financial ratios,50,"ANZSIC06 groups C211, C212, C213, and C214"


In [133]:
# reading csv file using read_table().
from pandas.io.parsers import read_table
read_table('a.csv', sep=',')

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
0,2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,733258,ANZSIC06 divisions A-S (excluding classes K633...
1,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,660630,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,54342,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,654872,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
37075,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H37,Quick ratio,Financial ratios,52,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37076,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H38,Margin on sales of goods for resale,Financial ratios,40,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37077,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H39,Return on equity,Financial ratios,12,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37078,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H40,Return on total assets,Financial ratios,5,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."


In [136]:
pd.read_csv('a.csv', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
1,2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,733258,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,660630,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,54342,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
37076,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H37,Quick ratio,Financial ratios,52,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37077,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H38,Margin on sales of goods for resale,Financial ratios,40,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37078,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H39,Return on equity,Financial ratios,12,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37079,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H40,Return on total assets,Financial ratios,5,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."


In [138]:
# change the column name
pd.read_csv('a.csv', names=['red', 'white', 'blue', 'black', 'purple', 'pink', 'yellow', 'brown', 'green', 'orange'])

Unnamed: 0,red,white,blue,black,purple,pink,yellow,brown,green,orange
0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
1,2020,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,733258,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,660630,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,54342,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
37076,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H37,Quick ratio,Financial ratios,52,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37077,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H38,Margin on sales of goods for resale,Financial ratios,40,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37078,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H39,Return on equity,Financial ratios,12,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37079,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H40,Return on total assets,Financial ratios,5,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."


In [142]:
pd.read_csv('a.csv', sep=',', skiprows=[0, 1, 3, 6])

Unnamed: 0,2020,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,"660,630","ANZSIC06 divisions A-S (excluding classes K6330, L6711, O7552, O760, O771, O772, S9540, S9601, S9602, and S9603)"
0,2020,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,18285,ANZSIC06 divisions A-S (excluding classes K633...
1,2020,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,654872,ANZSIC06 divisions A-S (excluding classes K633...
2,2020,Level 1,99999,All industries,Dollars (millions),H10,Indirect taxes,Financial performance,7509,ANZSIC06 divisions A-S (excluding classes K633...
3,2020,Level 1,99999,All industries,Dollars (millions),H11,Depreciation,Financial performance,26821,ANZSIC06 divisions A-S (excluding classes K633...
4,2020,Level 1,99999,All industries,Dollars (millions),H12,Salaries and wages paid,Financial performance,119387,ANZSIC06 divisions A-S (excluding classes K633...
...,...,...,...,...,...,...,...,...,...,...
37071,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H37,Quick ratio,Financial ratios,52,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37072,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H38,Margin on sales of goods for resale,Financial ratios,40,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37073,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H39,Return on equity,Financial ratios,12,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
37074,2013,Level 3,ZZ11,Food product manufacturing,Percentage,H40,Return on total assets,Financial ratios,5,"ANZSIC06 groups C111, C112, C113, C114, C115, ..."
