# Introduction to pandas Data Structures
To get started with pandas, you will need to get comfortable with its two workhorse
data structures: <b>Series and DataFrame</b>. While they are not a universal solution for
every problem, they provide a solid, easy-to-use basis for most applications.

# The pandas DataFrame Object

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string,
boolean, etc.). 

<h4>Creating a DataFrame from scratch</h4>

In [5]:
# Build a DataFrame directly from a 2x4 NumPy array.
# No labels are supplied, so both the rows and the columns receive the
# default integer labels 0, 1, 2, ...
df = pd.DataFrame(
    data=np.array([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
    ])
)
df

Unnamed: 0,0,1,2,3
0,10,11,12,13
1,20,21,22,23


In [8]:
# Try to build a DataFrame from rows of unequal length (5 values vs. 4 values).
# Ragged input like this cannot form a 2-d ndarray: NumPy >= 1.24 raises
# ValueError for it unless dtype=object is requested explicitly, in which case
# the result is a 1-d array whose two elements are Python lists.
df = pd.DataFrame(np.array([[10, 11, 12, 13, 45], [20, 21, 22, 23]], dtype=object))
df

# result: a single column (label 0) whose cells each hold a whole list

Unnamed: 0,0
0,"[10, 11, 12, 13, 45]"
1,"[20, 21, 22, 23]"


In [6]:
# Build a DataFrame from a list of Series objects: each Series becomes one
# row, and the Series index values (0..4) become the column labels.
df1 = pd.DataFrame(
    [pd.Series(np.arange(start, start + 5)) for start in (10, 15)]
)
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [7]:
# Same construction, but the two Series differ in length (6 vs. 5 values).
# The shorter row has no entry for column 5, so that cell is missing (NaN)
# and the values are displayed as floats.
df1 = pd.DataFrame(
    [pd.Series(np.arange(lo, hi)) for lo, hi in ((10, 16), (15, 20))]
)
df1

Unnamed: 0,0,1,2,3,4,5
0,10.0,11.0,12.0,13.0,14.0,15.0
1,15.0,16.0,17.0,18.0,19.0,


In [9]:
# Two equal-length Series combined column-wise through a dict:
# each key becomes a column name and each Series supplies that column.
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))
df2 = pd.DataFrame(dict(boys=s1, girls=s2))
df2

Unnamed: 0,boys,girls
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [10]:
# Supply explicit column labels; the rows keep the default 0, 1 labels.
df3 = pd.DataFrame(
    data=np.array([[10, 11], [20, 21]]),
    columns=['apples', 'oranges'],
)
df3

Unnamed: 0,apples,oranges
0,10,11
1,20,21


In [12]:
# Label both axes explicitly: `index` names the rows and `columns` names
# the columns of the 3x4 array.
df4 = pd.DataFrame(
    data=np.array([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [20, 21, 22, 23],
    ]),
    index=['apples', 'oranges', 'bananas'],
    columns=['Mon', 'Tue', 'Wed', 'Thu'],
)
df4

Unnamed: 0,Mon,Tue,Wed,Thu
apples,10,11,12,13
oranges,20,21,22,23
bananas,20,21,22,23


In [13]:
# Alignment during construction: s3 only carries the labels 1 and 2, so
# when the three Series are lined up on their index, column c3 is missing
# (NaN) for rows 0, 3 and 4.
s3 = pd.Series(data=np.arange(12, 14), index=[1, 2])
df5 = pd.DataFrame(dict(c1=s1, c2=s2, c3=s3))
df5

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,


In [None]:
# Examples of creating data frames

In [14]:
# A dict of equal-length lists: every key becomes a column and the rows
# receive the default 0..5 labels.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [33]:
# Passing `columns` selects the columns of frame in the requested order.
pd.DataFrame(data=frame, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [22]:
# Build frame2 from frame with the columns reordered plus one extra
# column, 'new', that frame does not contain.
frame2 = pd.DataFrame(
    data=frame,
    columns=['year', 'state', 'pop', 'new'],
)

In [30]:
# Display frame2 without the 'new' column. The result is not assigned,
# so the name frame2 still refers to the original object.
frame2.drop(labels=['new'], axis='columns')

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [37]:
# Attribute-style access: returns the 'year' column as a Series.
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [36]:
# Bracket-style access: returns the 'pop' column as a Series.
# NOTE(review): presumably brackets are used here because `pop` is also the
# name of a DataFrame method, so `frame.pop` would not give the column.
frame['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [39]:
# 'debt' is not a key of the `data` dict, so that column is created but
# holds only missing values; `index` replaces the default row labels.
frame2 = pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [42]:
# Add a new 'imports' column from a list with one value per row (2..7).
frame2['imports'] = list(range(2, 8))
frame2

Unnamed: 0,year,state,pop,debt,imports
one,2000,Ohio,1.5,,2
two,2001,Ohio,1.7,,3
three,2002,Ohio,3.6,,4
four,2001,Nevada,2.4,,5
five,2002,Nevada,2.9,,6
six,2003,Nevada,3.2,,7


In [43]:
# Overwrite every row of the existing 'debt' column with the string "100"
# (a scalar is broadcast to all rows).
# Bracket assignment is used because attribute assignment
# (frame2.debt = ...) only updates a column that already exists; for a new
# name it would set a plain Python attribute instead of creating a column.
frame2['debt'] = "100"
frame2

Unnamed: 0,year,state,pop,debt,imports
one,2000,Ohio,1.5,100,2
two,2001,Ohio,1.7,100,3
three,2002,Ohio,3.6,100,4
four,2001,Nevada,2.4,100,5
five,2002,Nevada,2.9,100,6
six,2003,Nevada,3.2,100,7


In [44]:
# Replace 'debt' with the integers 0..5, one value per row.
frame2['debt'] = np.arange(0, 6)
frame2

Unnamed: 0,year,state,pop,debt,imports
one,2000,Ohio,1.5,0,2
two,2001,Ohio,1.7,1,3
three,2002,Ohio,3.6,2,4
four,2001,Nevada,2.4,3,5
five,2002,Nevada,2.9,4,6
six,2003,Nevada,3.2,5,7


In [45]:
# A Series that covers only three of frame2's six row labels.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})

In [46]:
# Assigning a Series aligns it on the index labels: rows 'two', 'four' and
# 'five' receive values, every other row becomes missing (NaN).
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt,imports
one,2000,Ohio,1.5,,2
two,2001,Ohio,1.7,-1.2,3
three,2002,Ohio,3.6,,4
four,2001,Nevada,2.4,-1.5,5
five,2002,Nevada,2.9,-1.7,6
six,2003,Nevada,3.2,,7


In [47]:
# New boolean column: True for the rows whose state is 'Ohio'.
frame2['eastern'] = (frame2['state'] == 'Ohio')
frame2

Unnamed: 0,year,state,pop,debt,imports,eastern
one,2000,Ohio,1.5,,2,True
two,2001,Ohio,1.7,-1.2,3,True
three,2002,Ohio,3.6,,4,True
four,2001,Nevada,2.4,-1.5,5,False
five,2002,Nevada,2.9,-1.7,6,False
six,2003,Nevada,3.2,,7,False


In [48]:
# New boolean column: True for the rows whose 'pop' value exceeds 2.
frame2['greaterThan2'] = (frame2['pop'] > 2)
frame2

Unnamed: 0,year,state,pop,debt,imports,eastern,greaterThan2
one,2000,Ohio,1.5,,2,True,False
two,2001,Ohio,1.7,-1.2,3,True,False
three,2002,Ohio,3.6,,4,True,True
four,2001,Nevada,2.4,-1.5,5,False,True
five,2002,Nevada,2.9,-1.7,6,False,True
six,2003,Nevada,3.2,,7,False,True


In [49]:
# Remove the 'eastern' column from frame2 in place.
del frame2['eastern']

In [50]:
# Show frame2 after the deletion: the 'eastern' column is gone.
frame2

Unnamed: 0,year,state,pop,debt,imports,greaterThan2
one,2000,Ohio,1.5,,2,False
two,2001,Ohio,1.7,-1.2,3,False
three,2002,Ohio,3.6,,4,True
four,2001,Nevada,2.4,-1.5,5,True
five,2002,Nevada,2.9,-1.7,6,True
six,2003,Nevada,3.2,,7,True


In [51]:
# Another common input form is a dict of dicts: the outer keys become the
# columns and the inner keys become the row labels. Nevada has no entry
# for 2000, so that cell is missing.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
df3 = pd.DataFrame(data=pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


<b>If a nested dict is passed to the DataFrame constructor, pandas interprets the outer dict keys
as the columns and the inner keys as the row indices.</b>

In [52]:
# Transpose: swap rows and columns (states become rows, years become columns).
df3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [53]:
# Display df3 again: it still has years as rows and states as columns.
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [54]:
# With an explicit `index`, only the listed row labels are kept: 2000 is
# left out, and 2003 (absent from the inner dicts) is all missing.
pop1 = pd.DataFrame(data=pop, index=[2001, 2002, 2003])
pop1

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [55]:
# A dict of Series is handled the same way as a dict of dicts.
# Positional slices: all but the last Ohio value, and the first two
# Nevada values.
pdata = {
    'Ohio': df3['Ohio'].iloc[:-1],
    'Nevada': df3['Nevada'].iloc[:2],
}
pd.DataFrame(data=pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [56]:
# Give both axes a name; the names are shown when the frame is displayed.
df3.index.name = 'year'
df3.columns.name = 'state'

In [57]:
# The axis names now appear in the display: 'state' for the columns and
# 'year' for the index.
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


# Correlation

In [60]:
# Load the passenger table from 'test.csv' (path relative to the working
# directory).
# NOTE: this rebinds `data`, which earlier cells used for the
# state/year/pop dict.
data = pd.read_csv(filepath_or_buffer='test.csv')
data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [62]:
# Pairwise correlation matrix of the numeric columns only.
# pandas >= 2.0 no longer drops text columns (Name, Sex, Ticket, ...)
# automatically and raises on them, so the numeric columns are selected
# explicitly before calling corr(); this form also runs on older versions.
data.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.026751,-0.034102,0.003818,0.04308,0.008211
Pclass,-0.026751,1.0,-0.492143,0.001087,0.018721,-0.577147
Age,-0.034102,-0.492143,1.0,-0.091587,-0.061249,0.337932
SibSp,0.003818,0.001087,-0.091587,1.0,0.306895,0.171539
Parch,0.04308,0.018721,-0.061249,0.306895,1.0,0.230046
Fare,0.008211,-0.577147,0.337932,0.171539,0.230046,1.0


In [63]:
# Correlation between just the Pclass and Fare columns (a single float).
data['Pclass'].corr(other=data['Fare'])

-0.5771473123362404