In [40]:
import pandas as pd

# Data Structures in Pandas

Pandas makes use of two different data structures:
1. Series -> represents data in 1D form
2. DataFrame -> represents data in 2D tabular form

## Pandas Series (1D arrays with labels)

In [8]:
# Obtain a Series from a Python dictionary (a data structure storing key-value pairs).
# The dictionary keys act as the index labels of the values.
# The mapping is deliberately not named `dict`: that would shadow the built-in
# dict type for every cell executed afterwards in the same kernel.
label_to_value = {'a' : 3, 'b' : 'cat ' , 'c ' : 2.5}
pd.Series(label_to_value)

a        3
b     cat 
c      2.5
dtype: object

In [7]:
# Build a labelled Series: `data` holds the values and `index` holds the
# label attached to each value, position by position.
# Note: the labels 'Don ' and ' Emma' contain whitespace, which is part of the label.
oneD = pd.Series(
    data=[100, 'cat', 310, 'gog', 500],
    index=['Amy', 'Bobby', 'Cat', 'Don ', ' Emma'],
)
oneD

Amy      100
Bobby    cat
Cat      310
Don      gog
 Emma    500
dtype: object

In [9]:
# Extract the entries whose labels are 'Cat' and ' Emma'.
oneD.loc[[ 'Cat', ' Emma' ]] # .loc is the label-based indexer: it selects entries by their index labels
# The label ' Emma' is written with its leading space, exactly as it appears in the index.

Cat      310
 Emma    500
dtype: object

In [11]:
# Select several entries by integer position; the result keeps both labels and data.
# .iloc is used instead of plain [] because integer keys passed to [] on a
# string-labelled Series are treated as positions only through a deprecated fallback.
oneD.iloc[[0, 3, 4]] # entries at positions 0, 3 and 4

Amy      100
Don      gog
 Emma    500
dtype: object

In [13]:
# Access a single value (not its label) by its integer position.
oneD.iloc[1] # .iloc is primarily integer-position based (from 0 to length-1 of the axis)
# Position 1 holds the value 'cat'; its label is 'Bobby'.

'cat'

In [14]:
# Is 'cat' one of the index labels? (`'cat' in oneD` performs this same check:
# membership on a Series looks at the index, not at the stored values.)
'cat' in oneD.index
# False: 'cat' occurs only as a value, never as a label

False

In [15]:
# 'Cat' (capital C) is one of the index labels, so this membership check succeeds.
'Cat' in oneD.index

True

## Pandas DataFrames (2D data structure) 
Stores data in tabular form (rows and columns)

<class 'pandas.core.frame.DataFrame'>

In [27]:
# Two float Series keyed by column name. They share the labels 'apple', 'pear'
# and 'orange'; 'melon' is present only in 'B'.
d = {
    'A': pd.Series(data=[100.0, 200.0, 300.0], index=['apple', 'pear', 'orange']),
    'B': pd.Series(data=[111.0, 222.0, 333.0, 4444.0], index=['apple', 'pear', 'orange', 'melon']),
}

print(d)

{'A': apple     100.0
pear      200.0
orange    300.0
dtype: float64, 'B': apple      111.0
pear       222.0
orange     333.0
melon     4444.0
dtype: float64}


In [28]:
# Combine the Series in `d` into one table: each dictionary key becomes a
# column and the rows are aligned on the index labels.
df = pd.DataFrame(data=d)
# A label that is missing from one of the Series shows up as NaN in that column.
print(df)

            A       B
apple   100.0   111.0
melon     NaN  4444.0
orange  300.0   333.0
pear    200.0   222.0


In [20]:
# Confirm the class of the object built above.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [26]:
# Row labels of the DataFrame.
df.index

Index(['apple', 'melon', 'orange', 'pear'], dtype='object')

In [29]:
# Column labels of the DataFrame.
df.columns

Index(['A', 'B'], dtype='object')

In [30]:
# Keep only the listed rows (in the given order) and the single column 'A'.
# reindex states this intent directly, instead of passing an existing
# DataFrame back through the constructor; the selected data is the same.
df.reindex(index=['orange', 'melon', 'apple'], columns=['A'])

Unnamed: 0,A
orange,300.0
melon,
apple,100.0


# Read in CSV Files

In [32]:
# Read in a very simple, comma-separated CSV file.
# The path is relative to the notebook's working directory, so it no longer
# depends on one user's Desktop location; forward slashes work on every OS.
file="Data/section2/Resp2.csv"
df=pd.read_csv(file)
df.head(5)

Unnamed: 0,experience,respiration
0,0,3.94
1,0,4.26
2,0,4.16
3,0,3.76
4,0,4.07


In [34]:
# Read in a CSV file that uses a non-default separator.
# The path is relative to the notebook's working directory, so it no longer
# depends on one user's Desktop location; forward slashes work on every OS.
file="Data/section2/winequality-red.csv"
df=pd.read_csv(file, sep=";") # fields are separated by semicolons; the default separator is a comma (,)
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [35]:
# Read in a tab-separated .txt file with the CSV reader.
# The path is relative to the notebook's working directory, so it no longer
# depends on one user's Desktop location; forward slashes work on every OS.
file="Data/section2/bostonTxt.txt"
df=pd.read_csv(file, sep="\t") # "\t" is the tab character
df.head(5)

Unnamed: 0,MV,INDUS,NOX,RM,TAX,PT,LSTAT
0,24.0,2.31,53.8,6.575,296,15.3,4.98
1,21.6,7.07,46.9,6.421,242,17.8,9.14
2,34.7,7.07,46.9,7.185,242,17.8,4.03
3,33.4,2.18,45.8,6.998,222,18.7,2.94
4,36.2,2.18,45.8,7.147,222,18.7,5.33


# Read in Excel Files
An Excel file can contain more than one sheet.

In [37]:
import os

In [38]:
# Show the current working directory; the result also shows how path separators are written on this system.
os.getcwd() 

'C:\\Users\\Seemab\\Desktop\\Udemy\\Practical Neural Networks and Deep Learning in Python'

In [43]:
# The path is relative to the notebook's working directory, so it no longer
# depends on one user's Desktop location; forward slashes work on every OS.
file="Data/section2/boston1.xls"

# Open the workbook once; its sheets can then be parsed from this handle.
xl=pd.ExcelFile(file)

# List the names of the sheets contained in the workbook.
print(xl.sheet_names) 

['Sheet1', 'Sheet2']


In [44]:
# Load one sheet of the opened workbook into a DataFrame, selecting it by name.
df=xl.parse('Sheet1')
# Preview the first rows. NOTE: in the preview, the sheet's extra columns to the
# right of the data (free-text notes) appear under 'Unnamed: N' headers.
df.head()

Unnamed: 0,MV,INDUS,NOX,RM,TAX,PT,LSTAT,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,24.0,2.31,53.8,6.575,296,15.3,4.98,,,Subset of Boston housing tract
1,21.6,7.07,46.9,6.421,242,17.8,9.14,,,data of Harrison and Rubinfeld
2,34.7,7.07,46.9,7.185,242,17.8,4.03,,,(1978). Each case is one U.S.
3,33.4,2.18,45.8,6.998,222,18.7,2.94,,,Census tract in the Boston area.
4,36.2,2.18,45.8,7.147,222,18.7,5.33,,,
