# How do I create a pandas DataFrame from another object?

In [8]:
import pandas as pd

In [12]:
df = pd.DataFrame({'id':[100,101,102],'color':['red','blue','red']},columns={'id','color'},index=['a','b','c'])

In [13]:
df

Unnamed: 0,color,id
a,red,100
b,blue,101
c,red,102


In [15]:
pd.DataFrame([[100,'red'],[101,'blue'],[103,'red']],columns=['id','color'])

Unnamed: 0,id,color
0,100,red
1,101,blue
2,103,red


In [16]:
import numpy as np

In [18]:
arr = np.random.rand(4,2)
arr

array([[0.55268069, 0.1937831 ],
       [0.16789738, 0.93270996],
       [0.07283002, 0.3898988 ],
       [0.09015253, 0.57087105]])

In [20]:
pd.DataFrame(arr,columns =['one','two'])

Unnamed: 0,one,two
0,0.552681,0.193783
1,0.167897,0.93271
2,0.07283,0.389899
3,0.090153,0.570871


In [21]:
GColumns = ['student','testScore']

In [26]:
pd.DataFrame({'student':np.arange(100,110,1), 'testScore':np.random.randint(60,101,10)}).set_index('student')

Unnamed: 0_level_0,testScore
student,Unnamed: 1_level_1
100,83
101,97
102,75
103,80
104,99
105,72
106,97
107,61
108,99
109,96


In [27]:
# bonus

In [31]:
s = pd.Series(['round','square'],index=['c','b'],name = 'shape')
s

c     round
b    square
Name: shape, dtype: object

In [32]:
df

Unnamed: 0,color,id
a,red,100
b,blue,101
c,red,102


In [34]:
pd.concat([df, s],axis =1,sort = False)

Unnamed: 0,color,id,shape
a,red,100,
b,blue,101,square
c,red,102,round


# How do I apply a function to a pandas Series or DataFrame?

In [35]:
# Apply map and applymap

In [36]:
train = pd.read_csv('http://bit.ly/kaggletrain')

In [37]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
train['Sex_num'] = train.Sex.map({'female':0,'male':1})
train.loc[0:7,['Sex','Sex_num']]

Unnamed: 0,Sex,Sex_num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1
5,male,1
6,male,1
7,male,1


In [39]:
# I want to calculte the length of these strings in name column
train['Name_len'] = train.Name.apply(len)
train.loc[0:4,['Name','Name_len']]

Unnamed: 0,Name,Name_len
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


In [42]:
import numpy as np
train['Fare_ceil'] = train.Fare.apply(np.ceil)
train.loc[0:4,['Fare','Fare_ceil']]

Unnamed: 0,Fare,Fare_ceil
0,7.25,8.0
1,71.2833,72.0
2,7.925,8.0
3,53.1,54.0
4,8.05,9.0


In [43]:
# lets solve the harder problem
# let's extract the last name of the column Name
train.Name.str.split(',').head()

0                           [Braund,  Mr. Owen Harris]
1    [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                            [Heikkinen,  Miss. Laina]
3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                          [Allen,  Mr. William Henry]
Name: Name, dtype: object

In [44]:
def get_element(my_List,position):
    return my_List[position]

In [46]:
train.Name.str.split(',').apply(get_element,position=0).head()

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

In [47]:
train.Name.str.split(',').apply(lambda x:x[0]).head()

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

In [48]:
# apply as a DataFrame method

In [49]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [55]:
drinks.loc[:,'beer_servings':'wine_servings'].apply(np.argmax,axis=1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


0        beer_servings
1      spirit_servings
2        beer_servings
3        wine_servings
4        beer_servings
5      spirit_servings
6        wine_servings
7      spirit_servings
8        beer_servings
9        beer_servings
10     spirit_servings
11     spirit_servings
12     spirit_servings
13       beer_servings
14     spirit_servings
15     spirit_servings
16       beer_servings
17       beer_servings
18       beer_servings
19       beer_servings
20       beer_servings
21     spirit_servings
22       beer_servings
23       beer_servings
24       beer_servings
25     spirit_servings
26       beer_servings
27       beer_servings
28       beer_servings
29       beer_servings
            ...       
163    spirit_servings
164      beer_servings
165      wine_servings
166      wine_servings
167    spirit_servings
168    spirit_servings
169    spirit_servings
170      beer_servings
171      wine_servings
172      beer_servings
173      beer_servings
174      beer_servings
175      be

# applymap

In [59]:
drinks.loc[:,'beer_servings':'wine_servings'] = drinks.loc[:,'beer_servings':'wine_servings'].applymap(float)
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0.0,0.0,Asia
1,Albania,89.0,132.0,54.0,4.9,Europe
2,Algeria,25.0,0.0,14.0,0.7,Africa
3,Andorra,245.0,138.0,312.0,12.4,Europe
4,Angola,217.0,57.0,45.0,5.9,Africa
