In [1]:
import pandas as pd

In [2]:
# Load the tab-separated Gapminder file into a DataFrame.
df = pd.read_csv('gapminder.tsv', sep='\t')

### .head()
- Returns the first five rows of the dataset by default

# Preview the first rows of the DataFrame (five by default).
df.head()

In [4]:
# Column labels of the DataFrame, returned as an Index.
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [5]:
# Row labels of the DataFrame; here a default RangeIndex running 0..1703.
df.index

RangeIndex(start=0, stop=1704, step=1)

In [6]:
# Underlying data as a NumPy array; the mixed column types make it dtype=object.
df.values

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.331999999999997, 9240934,
        820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.1007099999999],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499602999999],
       ['Zimbabwe', 'Africa', 2002, 39.989000000000004, 11926563,
        672.0386227000001],
       ['Zimbabwe', 'Africa', 2007, 43.486999999999995, 12311143,
        469.70929810000007]], dtype=object)

In [7]:
# Confirm that read_csv produced a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

### Calling the shape attribute

In [10]:
# (rows, columns) tuple. `shape` is an attribute, so it takes no parentheses.
df.shape

(1704, 6)

### Calling shape as a function rather than an attribute (raises an error)

In [11]:
# `shape` is a tuple attribute, not a method, so calling it raises TypeError.
# Catch the expected error and print it: the mistake is still demonstrated,
# but the notebook keeps running under Restart Kernel -> Run All.
try:
    df.shape()
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: 'tuple' object is not callable

In [12]:
# Print a summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


### Selecting a single column of the dataset

In [13]:
# Square brackets with a single column name select that one column.
df['country']

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [17]:
# Keep the selected column for reuse. Despite the `_df` suffix, selecting a
# single column yields a Series, not a DataFrame (see the type() check below).
country_df = df['country']

In [19]:
# Preview the first five values of the country column.
country_df.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [20]:
# A single selected column is a pandas Series.
type(country_df)

pandas.core.series.Series

### Selecting multiple columns at the same time from the dataset
- Put the column names in a list

In [21]:
# Passing a list of column names (note the double brackets) selects several columns at once.
subset = df[['country', 'continent', 'year']]

In [22]:
# Preview the first five rows of the three-column subset.
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


### .loc[ ] selects rows by matching the given index label
- If several rows share the index label 2, `df.loc[2]` returns all of them rather than a single row

In [28]:
# Select the row whose index label is 2.
df.loc[2]

country      Afghanistan
continent           Asia
year                1962
lifeExp           31.997
pop             10267083
gdpPercap        853.101
Name: 2, dtype: object

- In order to get multiple rows, pass the row labels in a list

In [31]:
# A list of index labels selects several rows and returns a DataFrame.
df.loc[[2,3,4]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [39]:
# All rows (the bare colon), restricted to the year and continent columns.
df.loc[:, ['year', 'continent']]

Unnamed: 0,year,continent
0,1952,Asia
1,1957,Asia
2,1962,Asia
3,1967,Asia
4,1972,Asia
...,...,...
1699,1987,Africa
1700,1992,Africa
1701,1997,Africa
1702,2002,Africa


In [41]:
# Boolean mask on the left of the comma keeps only the 1967 rows;
# the list on the right keeps only the year and pop columns.
df.loc[df['year'] == 1967, ['year', 'pop']]

Unnamed: 0,year,pop
3,1967,11537966
15,1967,1984060
27,1967,12760499
39,1967,5247469
51,1967,22934225
...,...,...
1647,1967,39463910
1659,1967,1142636
1671,1967,6740785
1683,1967,3900000


In [43]:
# Combine two conditions with `&`; each comparison needs its own parentheses
# because `&` binds more tightly than `==` and `>`.
# Keeps 1967 rows with a population above 100,000.
df.loc[(df['year'] == 1967) & (df['pop'] > 100_000), ['year', 'pop']]

Unnamed: 0,year,pop
3,1967,11537966
15,1967,1984060
27,1967,12760499
39,1967,5247469
51,1967,22934225
...,...,...
1647,1967,39463910
1659,1967,1142636
1671,1967,6740785
1683,1967,3900000


### .iloc[ ]
- Selects rows and columns by their integer position in the dataset

In [33]:
# Select the row at integer position 3 (the fourth row, since positions start at 0).
df.iloc[3]

country      Afghanistan
continent           Asia
year                1967
lifeExp            34.02
pop             11537966
gdpPercap        836.197
Name: 3, dtype: object

- The row selection goes to the left of the comma.
- The column selection goes to the right of the comma.
- `[:, :]` selects all rows and all columns.

In [35]:
# All rows of the column at position 0 (the country column).
df.iloc[:,0]

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object