# Pandas Series
one-dimensional labelled array capable of holding data of any type (integer, string, float, python objects, etc.). The axis labels are collectively called indexes. Pandas Series is nothing but a column in an excel sheet. Labels need not be unique but must be a hashable type. The object supports both integer and label-based indexing and provides a host of methods for performing operations involving the index.

## Creating a Series

In [7]:
import pandas as pd
import numpy as np


# Creating empty series
ser = pd.Series()
print(ser)

# simple array
data = np.array(['rita', 'girwar', 'priyanshi', 'pragya', 'vijaya'])

ser = pd.Series(data)
print(ser)


Series([], dtype: float64)
0         rita
1       girwar
2    priyanshi
3       pragya
4       vijaya
dtype: object


# Pandas DataFrame
two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. Pandas DataFrame consists of three principal components, the data, rows, and columns

## Creating a DataFrame 

In [5]:
import pandas as pd

# Calling DataFrame constructor
df = pd.DataFrame()
print(df)

# list of strings
lst = ['rita', 'girwar', 'priyanshi', 'pragya', 'vijaya']

# Calling DataFrame constructor on list
df = pd.DataFrame(lst)
df


Empty DataFrame
Columns: []
Index: []


Unnamed: 0,0
0,rita
1,girwar
2,priyanshi
3,pragya
4,vijaya


### Creating DataFrame from dict of ndarray/lists

In [4]:
import pandas as pd

# intialise data of lists.
data = {'Name':['rita', 'girwar', 'priyanshi', 'pragya', 'vijaya'],
        'Age':[24, 24, 23, 24,25]}
 
# Create DataFrame
df = pd.DataFrame(data)
 
# Print the output.
df

Unnamed: 0,Name,Age
0,rita,24
1,girwar,24
2,priyanshi,23
3,pragya,24
4,vijaya,25


### Column Selection

In [9]:
import pandas as pd
 
# Define a dictionary containing employee data
data = {'Name':['rita', 'girwar', 'priyanshi', 'pragya', 'vijaya'],
        'Age':[24, 24, 23, 24,25],
        'Address':['Bhopal', 'Jodhpur', 'Bareili', 'Vidisha','Kolkata']}
 
# Convert the dictionary into DataFrame 
df = pd.DataFrame(data)
 
# select two columns
df[['Name', 'Address']]

Unnamed: 0,Name,Address
0,rita,Bhopal
1,girwar,Jodhpur
2,priyanshi,Bareili
3,pragya,Vidisha
4,vijaya,Kolkata


### Row Selection

In [41]:
# importing pandas package
import pandas as pd
 
# making data frame from csv file
df = pd.read_csv("Automobile_data.csv", index_col='company')
df

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23,12940.0


In [45]:
# retrieving row by loc method
print('Rows having company name : volvo ')
first = df.loc['volvo']
first

Rows having company name : volvo 


Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
volvo,sedan,104.3,188.8,ohc,four,114,23,12940.0
volvo,wagon,104.3,188.8,ohc,four,114,23,13415.0


In [46]:
print('Rows having company name : audi ')
second = df.loc['audi']
second

Rows having company name : audi 


Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18,17450.0
audi,sedan,99.8,177.3,ohc,five,110,19,15250.0
audi,wagon,105.8,192.7,ohc,five,110,19,18920.0


### Indexing/Subset Selection and Selecting Data

Using df[]

In [66]:
import pandas as pd
 
# making data frame from csv file
df = pd.read_csv("Automobile_data.csv")

# retrieving columns by indexing operator
comp_col = df['company']
comp_col

0     alfa-romero
1     alfa-romero
2     alfa-romero
3            audi
4            audi
         ...     
56     volkswagen
57     volkswagen
58     volkswagen
59          volvo
60          volvo
Name: company, Length: 61, dtype: object

Using df.loc[]

In [56]:
import pandas as pd
 
# making data frame from csv file
df = pd.read_csv("Automobile_data.csv",index_col='company')

# retrieving row by loc method
print('Rows having company name : volvo ')
first = df.loc['volvo']
first

Rows having company name : volvo 


Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
volvo,sedan,104.3,188.8,ohc,four,114,23,12940.0
volvo,wagon,104.3,188.8,ohc,four,114,23,13415.0


Using df.iloc[<position>]

In [65]:
import pandas as pd
 
# making data frame from csv file
df = pd.read_csv("Automobile_data.csv",index_col='company')

# retrieving rows by iloc method 
first = df.iloc[0,7] #value at 1 row and 9 column
first

13495.0

### Accessing dataframe with a boolean index

In [74]:
# importing pandas as pd
import pandas as pd

# dictionary of lists
data = {'Name':['rita', 'girwar', 'priyanshi', 'pragya', 'vijaya'],
        'Age':[24, 24, 23, 24,25],
        'Address':['Bhopal', 'Jodhpur', 'Bareili', 'Vidisha','Kolkata']}

df = pd.DataFrame(data, index = [True, False, True, False,True])

print(df)

            Name  Age  Address
True        rita   24   Bhopal
False     girwar   24  Jodhpur
True   priyanshi   23  Bareili
False     pragya   24  Vidisha
True      vijaya   25  Kolkata


#### Accessing a Dataframe with a boolean index using .loc[]

In [76]:
df_true = df.loc[True]
df_true

Unnamed: 0,Name,Age,Address
True,rita,24,Bhopal
True,priyanshi,23,Bareili
True,vijaya,25,Kolkata


#### Accessing a Dataframe with a boolean index using .iloc[]

In [79]:
df_false = df.iloc[1]
df_false

Name        girwar
Age             24
Address    Jodhpur
dtype: object

#### Accessing a Dataframe with a boolean index using .ix[] (deprecated)

In [81]:
df_= df.ix[True]
df_

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Name,Age,Address
True,rita,24,Bhopal
True,priyanshi,23,Bareili
True,vijaya,25,Kolkata


In [82]:
df__ = df.ix[1]
df__

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Name        girwar
Age             24
Address    Jodhpur
dtype: object

#### Applying a boolean mask to a dataframe

In [93]:
df = pd.DataFrame(data, index = [0, 1, 2, 3, 4])
df___ = df[[True, True, True, False, False]]
df___

Unnamed: 0,Name,Age,Address
0,rita,24,Bhopal
1,girwar,24,Jodhpur
2,priyanshi,23,Bareili


#### Masking data based on column value

In [88]:
df_age = df['Age'] == 24
df_age

0     True
1     True
2    False
3     True
4    False
Name: Age, dtype: bool

In [94]:
df_age = df['Age'] > 24
df_age

0    False
1    False
2    False
3    False
4     True
Name: Age, dtype: bool

In [98]:
df1 = df.index > 2
df[df1]

Unnamed: 0,Name,Age,Address
3,pragya,24,Vidisha
4,vijaya,25,Kolkata


## Working with Missing Data

### Checking for missing values using isnull() and notnull()

In [104]:
# importing pandas package
import pandas as pd
 
# making data frame from csv file
df = pd.read_csv("Automobile_data.csv", index_col='company')
df.isnull()

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,False,False,False,False,False,False,False,False
alfa-romero,False,False,False,False,False,False,True,False
alfa-romero,False,False,False,False,False,False,False,False
audi,False,False,False,False,False,False,False,False
audi,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
volkswagen,False,False,False,False,False,False,False,False
volkswagen,False,False,False,False,False,False,False,False
volkswagen,False,False,False,False,False,False,False,False
volvo,False,False,False,False,False,False,False,False


In [123]:
df_null= pd.isnull(df['average-mileage'])
df_null

company
alfa-romero    False
alfa-romero     True
alfa-romero    False
audi           False
audi           False
               ...  
volkswagen     False
volkswagen     False
volkswagen     False
volvo          False
volvo          False
Name: average-mileage, Length: 61, dtype: bool

In [103]:
df.notnull()

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,True,True,True,True,True,True,True,True
alfa-romero,True,True,True,True,True,True,False,True
alfa-romero,True,True,True,True,True,True,True,True
audi,True,True,True,True,True,True,True,True
audi,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...
volkswagen,True,True,True,True,True,True,True,True
volkswagen,True,True,True,True,True,True,True,True
volkswagen,True,True,True,True,True,True,True,True
volvo,True,True,True,True,True,True,True,True


In [124]:
df_not_null= pd.notnull(df['average-mileage'])
df_not_null

company
alfa-romero     True
alfa-romero    False
alfa-romero     True
audi            True
audi            True
               ...  
volkswagen      True
volkswagen      True
volkswagen      True
volvo           True
volvo           True
Name: average-mileage, Length: 61, dtype: bool

### Filling missing values using fillna(), replace() and interpolate()

In [108]:
df1 = df.fillna(0) #fill with particular value
df1

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,0.0,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [125]:
df_fill_previous_value=df.fillna(method ='pad') # fill null values with previous value
df_fill_previous_value

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [126]:
df_fill_previous_value=df.fillna(method ='bfill') # fill null values with next value
df_fill_previous_value

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,19.0,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [135]:
df['average-mileage'].fillna('No Average Mileage', inplace = True )
df

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,No Average Mileage,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23,12940.0


In [139]:
df = df.replace('No Average Mileage',np.nan) # replace No Average Mileage with null field
df

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [140]:
df.head()

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0


In [141]:
df3 = df.interpolate(method ='linear', limit_direction ='forward')
df3

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,20.0,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [138]:
df.head()

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,No Average Mileage,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18,17450.0


In [117]:
df4 = df.interpolate(method ='linear', limit_direction ='backward', limit = 1)
df4

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,20.0,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


#### Dropping missing values using dropna()

In [119]:
df.head()

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0


In [120]:
df5 = df.dropna()
df5

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
audi,sedan,99.8,177.3,ohc,five,110,19.0,15250.0
audi,wagon,105.8,192.7,ohc,five,110,19.0,18920.0
bmw,sedan,101.2,176.8,ohc,four,101,23.0,16430.0
bmw,sedan,101.2,176.8,ohc,four,101,23.0,16925.0
bmw,sedan,101.2,176.8,ohc,six,121,21.0,20970.0
bmw,sedan,103.5,189.0,ohc,six,182,16.0,30760.0


In [142]:
df6 = df.dropna(how = 'all')# drop if all values is null
df6

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [144]:
df7 = df.dropna(axis = 1) # drop column with atleast 1 null values
df7 # price, average-mileage column has been dropped

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111
alfa-romero,convertible,88.6,168.8,dohc,four,111
alfa-romero,hatchback,94.5,171.2,ohcv,six,154
audi,sedan,99.8,176.6,ohc,four,102
audi,sedan,99.4,176.6,ohc,five,115
...,...,...,...,...,...,...
volkswagen,sedan,97.3,171.7,ohc,four,85
volkswagen,sedan,97.3,171.7,ohc,four,52
volkswagen,sedan,97.3,171.7,ohc,four,100
volvo,sedan,104.3,188.8,ohc,four,114


In [147]:
df8 = df.dropna(axis = 0,how ='any') # drop row with atleast 1 null values
df8 # price, average-mileage column has been dropped

Unnamed: 0_level_0,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
audi,sedan,99.8,177.3,ohc,five,110,19.0,15250.0
audi,wagon,105.8,192.7,ohc,five,110,19.0,18920.0
bmw,sedan,101.2,176.8,ohc,four,101,23.0,16430.0
bmw,sedan,101.2,176.8,ohc,four,101,23.0,16925.0
bmw,sedan,101.2,176.8,ohc,six,121,21.0,20970.0
bmw,sedan,103.5,189.0,ohc,six,182,16.0,30760.0


In [149]:
org_len = len(df)
new_len = len(df8)
null_rows_removed = org_len - new_len
null_rows_removed

4

## Iterating over rows using iteritems(), iterrows(), itertuples()

In [154]:
#using iterrows()
for i, j in df.iterrows():
    print(i, j)
    print()

alfa-romero body-style          convertible
wheel-base                 88.6
length                    168.8
engine-type                dohc
num-of-cylinders           four
horsepower                  111
average-mileage              21
price                     13495
Name: alfa-romero, dtype: object

alfa-romero body-style          convertible
wheel-base                 88.6
length                    168.8
engine-type                dohc
num-of-cylinders           four
horsepower                  111
average-mileage             NaN
price                     16500
Name: alfa-romero, dtype: object

alfa-romero body-style          hatchback
wheel-base               94.5
length                  171.2
engine-type              ohcv
num-of-cylinders          six
horsepower                154
average-mileage            19
price                   16500
Name: alfa-romero, dtype: object

audi body-style          sedan
wheel-base           99.8
length              176.6
engine-type           ohc
n

In [171]:
df = pd.read_csv('Automobile_data.csv')

#Iterate over columns
columns = list(df)
 
for i in columns:
        print (df[i][4]) #fifth row

audi
sedan
99.4
176.6
ohc
five
115
18.0
17450.0


In [175]:
#using iteritems()

for key, value in df.iteritems():
    print(key, value)

company 0     alfa-romero
1     alfa-romero
2     alfa-romero
3            audi
4            audi
         ...     
56     volkswagen
57     volkswagen
58     volkswagen
59          volvo
60          volvo
Name: company, Length: 61, dtype: object
body-style 0     convertible
1     convertible
2       hatchback
3           sedan
4           sedan
         ...     
56          sedan
57          sedan
58          sedan
59          sedan
60          wagon
Name: body-style, Length: 61, dtype: object
wheel-base 0      88.6
1      88.6
2      94.5
3      99.8
4      99.4
      ...  
56     97.3
57     97.3
58     97.3
59    104.3
60    104.3
Name: wheel-base, Length: 61, dtype: float64
length 0     168.8
1     168.8
2     171.2
3     176.6
4     176.6
      ...  
56    171.7
57    171.7
58    171.7
59    188.8
60    188.8
Name: length, Length: 61, dtype: float64
engine-type 0     dohc
1     dohc
2     ohcv
3      ohc
4      ohc
      ... 
56     ohc
57     ohc
58     ohc
59     ohc
60     ohc

In [180]:
#using itertuples()

for i in df.itertuples():
    print(i)

Pandas(Index=0, company='alfa-romero', _2='convertible', _3=88.6, length=168.8, _5='dohc', _6='four', horsepower=111, _8=21.0, price=13495.0)
Pandas(Index=1, company='alfa-romero', _2='convertible', _3=88.6, length=168.8, _5='dohc', _6='four', horsepower=111, _8=nan, price=16500.0)
Pandas(Index=2, company='alfa-romero', _2='hatchback', _3=94.5, length=171.2, _5='ohcv', _6='six', horsepower=154, _8=19.0, price=16500.0)
Pandas(Index=3, company='audi', _2='sedan', _3=99.8, length=176.6, _5='ohc', _6='four', horsepower=102, _8=24.0, price=13950.0)
Pandas(Index=4, company='audi', _2='sedan', _3=99.4, length=176.6, _5='ohc', _6='five', horsepower=115, _8=18.0, price=17450.0)
Pandas(Index=5, company='audi', _2='sedan', _3=99.8, length=177.3, _5='ohc', _6='five', horsepower=110, _8=19.0, price=15250.0)
Pandas(Index=6, company='audi', _2='wagon', _3=105.8, length=192.7, _5='ohc', _6='five', horsepower=110, _8=19.0, price=18920.0)
Pandas(Index=7, company='bmw', _2='sedan', _3=101.2, length=176.8

## DataFrame Methods

In [231]:
# dataframe.index 

df = pd.read_csv('Automobile_data.csv')
df.index

RangeIndex(start=0, stop=61, step=1)

In [232]:
# DataFrameName.insert(loc, column, value, allow_duplicates = False)

df.insert(8,'new column','a')
df.columns


Index(['company', 'body-style', 'wheel-base', 'length', 'engine-type',
       'num-of-cylinders', 'horsepower', 'average-mileage', 'new column',
       'price'],
      dtype='object')

In [233]:
df.head(10)

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,new column,price
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,a,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,a,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,a,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,a,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,a,17450.0
5,audi,sedan,99.8,177.3,ohc,five,110,19.0,a,15250.0
6,audi,wagon,105.8,192.7,ohc,five,110,19.0,a,18920.0
7,bmw,sedan,101.2,176.8,ohc,four,101,23.0,a,16430.0
8,bmw,sedan,101.2,176.8,ohc,four,101,23.0,a,16925.0
9,bmw,sedan,101.2,176.8,ohc,six,121,21.0,a,20970.0


In [239]:
#DataFrame.add(other, axis=’columns’, level=None, fill_value=None)

# Importing Pandas as pd
import pandas as pd

# Importing numpy as np
import numpy as np

# Creating a dataframe
# Setting the seed value to re-generate the result.
np.random.seed(25)

dataframe = pd.DataFrame(np.random.rand(5, 3), columns =['A', 'B', 'C'])

# np.random.rand(5, 3) has generated a
# random 2-Dimensional array of shape 5 * 3
# which is then converted to a dataframe

dataframe

Unnamed: 0,A,B,C
0,0.870124,0.582277,0.278839
1,0.185911,0.4111,0.117376
2,0.684969,0.437611,0.556229
3,0.36708,0.402366,0.113041
4,0.447031,0.585445,0.161985


In [241]:
dataframe.iloc[-1] = np.nan #Null values at the end
dataframe

Unnamed: 0,A,B,C
0,0.870124,0.582277,0.278839
1,0.185911,0.4111,0.117376
2,0.684969,0.437611,0.556229
3,0.36708,0.402366,0.113041
4,,,


In [246]:
dt = dataframe.add(1, fill_value = 10)

In [247]:
# DataFrame.sub(other, axis=’columns’, level=None, fill_value=None)

sub1 = dt.sub(1)
sub1


Unnamed: 0,A,B,C
0,0.870124,0.582277,0.278839
1,0.185911,0.4111,0.117376
2,0.684969,0.437611,0.556229
3,0.36708,0.402366,0.113041
4,10.0,10.0,10.0


In [249]:
# DataFrame.mul(other, axis=’columns’, level=None, fill_value=None)

mul1 = sub1.mul(2)
mul1

Unnamed: 0,A,B,C
0,1.740248,1.164554,0.557678
1,0.371822,0.8222,0.234751
2,1.369937,0.875222,1.112459
3,0.734161,0.804731,0.226081
4,20.0,20.0,20.0


In [250]:
# DataFrame.div(other, axis=’columns’, level=None, fill_value=None)

div1 = mul1.div(2)
div1

Unnamed: 0,A,B,C
0,0.870124,0.582277,0.278839
1,0.185911,0.4111,0.117376
2,0.684969,0.437611,0.556229
3,0.36708,0.402366,0.113041
4,10.0,10.0,10.0


In [255]:
# unique() Method extracts the unique values in the dataframe

df = pd.read_csv('Automobile_data.csv')
unique_comp = df['company'].unique()
unique_comp

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mitsubishi',
       'nissan', 'porsche', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [257]:
# nunique() Method returns count of the unique values in the dataframe

df = pd.read_csv('Automobile_data.csv')
no_unique_comp = df['company'].nunique()
no_unique_comp

16

In [258]:
# value_counts() Method counts the number of times each unique value occurs within the Series

df = pd.read_csv('Automobile_data.csv')
no_of_times = df['company'].value_counts()
no_of_times

toyota           7
bmw              6
nissan           5
mazda            5
mitsubishi       4
mercedes-benz    4
audi             4
volkswagen       4
jaguar           3
isuzu            3
alfa-romero      3
honda            3
porsche          3
chevrolet        3
dodge            2
volvo            2
Name: company, dtype: int64

In [263]:
# columns the column labels of the DataFrame
col = df.columns
col

Index(['company', 'body-style', 'wheel-base', 'length', 'engine-type',
       'num-of-cylinders', 'horsepower', 'average-mileage', 'price'],
      dtype='object')

In [265]:
# axes a list representing the axes of the DataFrame

axes = df.axes
axes

[RangeIndex(start=0, stop=61, step=1),
 Index(['company', 'body-style', 'wheel-base', 'length', 'engine-type',
        'num-of-cylinders', 'horsepower', 'average-mileage', 'price'],
       dtype='object')]

In [268]:
# between(start, end)

btw = df['price'].between(10000,15000) # price between 10000 - 15000
btw

0      True
1     False
2     False
3      True
4     False
      ...  
56    False
57    False
58    False
59     True
60     True
Name: price, Length: 61, dtype: bool

In [275]:
# DataFrame.isin(values)

req_comp = df['company'].isin(['alfa-romero','bmw'])
req_comp.head()

0     True
1     True
2     True
3    False
4    False
Name: company, dtype: bool

In [276]:
# DataFrame.dtypes - data type of each column.

df.dtypes

company              object
body-style           object
wheel-base          float64
length              float64
engine-type          object
num-of-cylinders     object
horsepower            int64
average-mileage     float64
price               float64
dtype: object

In [278]:
# values() Method returns a Numpy representation of the DataFrame i.e. only the values in the DataFrame will be returned, the axes labels will be removed

value_df = df.values
value_df

array([['alfa-romero', 'convertible', 88.6, 168.8, 'dohc', 'four', 111,
        21.0, 13495.0],
       ['alfa-romero', 'convertible', 88.6, 168.8, 'dohc', 'four', 111,
        nan, 16500.0],
       ['alfa-romero', 'hatchback', 94.5, 171.2, 'ohcv', 'six', 154,
        19.0, 16500.0],
       ['audi', 'sedan', 99.8, 176.6, 'ohc', 'four', 102, 24.0, 13950.0],
       ['audi', 'sedan', 99.4, 176.6, 'ohc', 'five', 115, 18.0, 17450.0],
       ['audi', 'sedan', 99.8, 177.3, 'ohc', 'five', 110, 19.0, 15250.0],
       ['audi', 'wagon', 105.8, 192.7, 'ohc', 'five', 110, 19.0, 18920.0],
       ['bmw', 'sedan', 101.2, 176.8, 'ohc', 'four', 101, 23.0, 16430.0],
       ['bmw', 'sedan', 101.2, 176.8, 'ohc', 'four', 101, 23.0, 16925.0],
       ['bmw', 'sedan', 101.2, 176.8, 'ohc', 'six', 121, 21.0, 20970.0],
       ['bmw', 'sedan', 103.5, 189.0, 'ohc', 'six', 182, 16.0, 30760.0],
       ['bmw', 'sedan', 103.5, 193.8, 'ohc', 'six', 182, 16.0, 41315.0],
       ['bmw', 'sedan', 110.0, 197.0, 'ohc', 'six', 

In [287]:
# sort_values() Method sorts a data frame in Ascending or Descending order of passed Column
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind=’quicksort’, na_position=’last’)

sorted_values = df.sort_values(['company','price'], ascending=[False,True])
sorted_values.head()

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0
60,volvo,wagon,104.3,188.8,ohc,four,114,23.0,13415.0
55,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7775.0
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0


In [289]:
# DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind=’quicksort’, na_position=’last’, sort_remaining=True, by=None)

df1 = pd.read_csv('Automobile_data.csv', index_col = 'body-style')

sorted_indx = df1.sort_index(axis=0)
sorted_indx.head()

Unnamed: 0_level_0,company,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
body-style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
convertible,alfa-romero,88.6,168.8,dohc,four,111,21.0,13495.0
convertible,alfa-romero,88.6,168.8,dohc,four,111,,16500.0
convertible,porsche,89.5,168.9,ohcf,six,207,17.0,37028.0
hardtop,porsche,89.5,168.9,ohcf,six,207,17.0,34028.0
hardtop,mercedes-benz,112.0,199.2,ohcv,eight,184,14.0,45400.0


In [292]:
# rename() Method is called on a DataFrame to change the names of the index labels or column names
# DataFrame.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None)

df = pd.read_csv('Automobile_data.csv')
new_df = df.rename(columns={'body-style':'body_style', 'engine-type': 'engine_type'})
new_df

Unnamed: 0,company,body_style,wheel-base,length,engine_type,num-of-cylinders,horsepower,average-mileage,price
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
58,volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [294]:
df

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0
58,volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0


In [296]:
# DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors=’raise’)

#dropping columns by index label

drop_df = df.drop(['wheel-base'], axis = 1)
drop_df

Unnamed: 0,company,body-style,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
0,alfa-romero,convertible,168.8,dohc,four,111,21.0,13495.0
1,alfa-romero,convertible,168.8,dohc,four,111,,16500.0
2,alfa-romero,hatchback,171.2,ohcv,six,154,19.0,16500.0
3,audi,sedan,176.6,ohc,four,102,24.0,13950.0
4,audi,sedan,176.6,ohc,five,115,18.0,17450.0
...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,171.7,ohc,four,85,27.0,7975.0
57,volkswagen,sedan,171.7,ohc,four,52,37.0,7995.0
58,volkswagen,sedan,171.7,ohc,four,100,26.0,9995.0
59,volvo,sedan,188.8,ohc,four,114,23.0,12940.0


In [297]:
#dropping rows by index label

drop_df = df.drop([0,1,8], axis = 0)
drop_df.head(10)

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0
5,audi,sedan,99.8,177.3,ohc,five,110,19.0,15250.0
6,audi,wagon,105.8,192.7,ohc,five,110,19.0,18920.0
7,bmw,sedan,101.2,176.8,ohc,four,101,23.0,16430.0
9,bmw,sedan,101.2,176.8,ohc,six,121,21.0,20970.0
10,bmw,sedan,103.5,189.0,ohc,six,182,16.0,30760.0
11,bmw,sedan,103.5,193.8,ohc,six,182,16.0,41315.0
12,bmw,sedan,110.0,197.0,ohc,six,182,15.0,36880.0


In [298]:
# DataFrame.pop(item) is used to delete rows or columns from a DataFrame

popped_col = df.pop("length")
popped_col

0     168.8
1     168.8
2     171.2
3     176.6
4     176.6
      ...  
56    171.7
57    171.7
58    171.7
59    188.8
60    188.8
Name: length, Length: 61, dtype: float64

In [300]:
# sample Method pulls out a random sample of rows or columns from a DataFrame
# DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)

rows = df.sample(frac =.10) #10% random sample data is generated out of the Data frame
rows

Unnamed: 0,company,body-style,wheel-base,engine-type,num-of-cylinders,horsepower,average-mileage,price
53,toyota,wagon,95.7,ohc,four,62,27.0,8778.0
60,volvo,wagon,104.3,ohc,four,114,23.0,13415.0
43,nissan,wagon,94.5,ohc,four,69,31.0,7349.0
48,toyota,hatchback,95.7,ohc,four,62,35.0,5348.0
26,jaguar,sedan,102.0,ohcv,twelve,262,13.0,36000.0
57,volkswagen,sedan,97.3,ohc,four,52,37.0,7995.0


In [308]:
# nsmallest() pulls out the rows with the smallest values in a column

least_5 = df.nsmallest(5, "price")
least_5

Unnamed: 0,company,body-style,wheel-base,engine-type,num-of-cylinders,horsepower,average-mileage,price
13,chevrolet,hatchback,88.4,l,three,48,47.0,5151.0
27,mazda,hatchback,93.1,ohc,four,68,30.0,5195.0
48,toyota,hatchback,95.7,ohc,four,62,35.0,5348.0
36,mitsubishi,hatchback,93.7,ohc,four,68,37.0,5389.0
28,mazda,hatchback,93.1,ohc,four,68,31.0,6095.0


In [310]:
# nlargest() pulls out the rows with the largest values in a column

largest_5 = df.nlargest(5, "price")
largest_5

Unnamed: 0,company,body-style,wheel-base,engine-type,num-of-cylinders,horsepower,average-mileage,price
35,mercedes-benz,hardtop,112.0,ohcv,eight,184,14.0,45400.0
11,bmw,sedan,103.5,ohc,six,182,16.0,41315.0
34,mercedes-benz,sedan,120.9,ohcv,eight,184,14.0,40960.0
46,porsche,convertible,89.5,ohcf,six,207,17.0,37028.0
12,bmw,sedan,110.0,ohc,six,182,15.0,36880.0


In [319]:
# shape returns a tuple representing the dimensionality of the DataFrame
# size Returns size of dataframe/series which is equivalent to total number of elements. That is rows x columns.
# ndim ‘int’ representing the number of axes / array dimensions. Returns 1 if Series, otherwise returns 2 if DataFrame
df = pd.read_csv('Automobile_data.csv')
size = df.size
shape = df.shape
dimension = df.ndim

print(f'Size: {size}, Shape: {shape}, Dimension :{dimension}')

Size: 549, Shape: (61, 9), Dimension :2


In [322]:
# DataFrame.rank(axis=0, method=’average’, numeric_only=None, na_option=’keep’, ascending=True, pct=False)

df['Rank'] = df['company'].rank()
df

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0,2.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0,2.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0,2.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0,5.5
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0,5.5
...,...,...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0,57.5
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0,57.5
58,volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0,57.5
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0,60.5


In [323]:
df['Rank'] = df['company'].rank(method = 'average')
df

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0,2.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0,2.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0,2.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0,5.5
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0,5.5
...,...,...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0,57.5
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0,57.5
58,volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0,57.5
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0,60.5


In [324]:
df['Rank'] = df['company'].rank(method = 'min')
df

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21.0,13495.0,1.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,,16500.0,1.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19.0,16500.0,1.0
3,audi,sedan,99.8,176.6,ohc,four,102,24.0,13950.0,4.0
4,audi,sedan,99.4,176.6,ohc,five,115,18.0,17450.0,4.0
...,...,...,...,...,...,...,...,...,...,...
56,volkswagen,sedan,97.3,171.7,ohc,four,85,27.0,7975.0,56.0
57,volkswagen,sedan,97.3,171.7,ohc,four,52,37.0,7995.0,56.0
58,volkswagen,sedan,97.3,171.7,ohc,four,100,26.0,9995.0,56.0
59,volvo,sedan,104.3,188.8,ohc,four,114,23.0,12940.0,60.0


In [359]:
# DataFrame.query(expr, inplace=False, **kwargs)

df_price = df.query("company = 'bmw'" and 'price>40000')
df_price

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank
11,bmw,sedan,103.5,193.8,ohc,six,182,16.0,41315.0,8.0
34,mercedes-benz,sedan,120.9,208.1,ohcv,eight,184,14.0,40960.0,33.0
35,mercedes-benz,hardtop,112.0,199.2,ohcv,eight,184,14.0,45400.0,33.0


In [365]:
# copy() method is used to create a copy of a Pandas object

df1 = df.copy()
df1['new'] = 'girsa'
df1

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank,new
11,bmw,sedan,103.5,193.8,ohc,six,182,16.0,41315.0,8.0,girsa
34,mercedes-benz,sedan,120.9,208.1,ohcv,eight,184,14.0,40960.0,33.0,girsa
35,mercedes-benz,hardtop,112.0,199.2,ohcv,eight,184,14.0,45400.0,33.0,girsa


In [368]:
# DataFrame.duplicated(subset=None, keep='first')

result = df['horsepower'].duplicated()
result

11    False
34    False
35     True
Name: horsepower, dtype: bool

In [369]:
result = df['horsepower'].duplicated(keep = False)
result

11    False
34     True
35     True
Name: horsepower, dtype: bool

In [370]:
df = df[~result]
df

Unnamed: 0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price,Rank
11,bmw,sedan,103.5,193.8,ohc,six,182,16.0,41315.0,8.0


In [371]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 11 to 11
Data columns (total 10 columns):
company             1 non-null object
body-style          1 non-null object
wheel-base          1 non-null float64
length              1 non-null float64
engine-type         1 non-null object
num-of-cylinders    1 non-null object
horsepower          1 non-null int64
average-mileage     1 non-null float64
price               1 non-null float64
Rank                1 non-null float64
dtypes: float64(5), int64(1), object(4)
memory usage: 88.0+ bytes


In [372]:
df = pd.read_csv('Automobile_data.csv')