# Pandas

#### DataFrame

In [47]:
# Creation of DataFrame from dict:

import pandas as pd
import numpy as np

# Column-oriented input: every key becomes a column label and its list
# supplies that column's values, one entry per row.
student_dict = {
    'Name': ['Sachin', 'Ram'],
    'Age': [20, 21],
    'Marks': [85.10, 77.80],
}

data = pd.DataFrame(data=student_dict)

print(data)

    Name   Age  Marks
0  Sachin  20   85.1 
1     Ram  21   77.8 


In [2]:
# Creation of DataFrame from a list of rows (one inner list per record)

columns = ['Name', 'Age', 'Occupation']

data = [
    ['Sachin', 25, 'Engineer'],
    ['Anil', 26, 'Data Scientist'],
    ['Hariom', 21, 'Doctor'],
]

# The inner lists carry no labels, so the column names are passed separately.
df = pd.DataFrame(data=data, columns=columns)

print(df)

     Name  Age      Occupation
0  Sachin   25        Engineer
1    Anil   26  Data Scientist
2  Hariom   21          Doctor


In [34]:
# DataFrame modification

# Insert columns

# df.insert(loc=col_position, column=new_column, value=default_value)

# insert() mutates df in place and raises ValueError when the column label is
# already present, so running this cell a second time used to fail. The guard
# keeps the cell safe to re-run without changing the result of the first run.
if "Gender" not in df.columns:
    df.insert(loc=2, column="Gender", value="M")
print(df)

    Name   Age Gender    Occupation  
0  Sachin  25     M          Engineer
1    Anil  26     M    Data Scientist
2  Hariom  21     M            Doctor


In [36]:
student_dict = {
    "name": ["Joe", "Nat", "Harry", "Sam"],
    "age": [20, 21, 19, 21],
    "marks": [85.10, 77.80, 91.54, 77.80],
}

# Build the frame and show it before de-duplication
student_df = pd.DataFrame(data=student_dict)
print(student_df)
print("-----------------------------")

# Rows count as duplicates when both 'age' and 'marks' match;
# the first occurrence of each duplicate group is the one that survives.
dedupe_keys = ['age', 'marks']
student_df = student_df.drop_duplicates(subset=dedupe_keys)

print(student_df)

   name   age  marks
0    Joe  20   85.10
1    Nat  21   77.80
2  Harry  19   91.54
3    Sam  21   77.80
-----------------------------
   name   age  marks
0    Joe  20   85.10
1    Nat  21   77.80
2  Harry  19   91.54


In [44]:
student_dict = {
    "name": ["Joe", "Nat", "Harry", "Nat"],
    "age": [20, 21, 19, 21],
    "marks": [85.10, 77.80, 91.54, 77.80],
}
row_labels = ['a', 'b', 'c', 'd']

student_df = pd.DataFrame(student_dict, index=row_labels)
print(student_df)

# keep=False discards every member of a duplicate group (no survivor is kept).
# ignore_index=False (the default) leaves the surviving row labels untouched;
# ignore_index=True relabels the result 0, 1, ..., n - 1.

print("-------------------")
# drop duplicate rows, original row labels preserved
student_df = student_df.drop_duplicates(keep=False, ignore_index=False)

print(student_df)
print("-------------------")

# no duplicates remain at this point, so this pass only resets the row labels
student_df = student_df.drop_duplicates(keep=False, ignore_index=True)

print(student_df)

   name   age  marks
a    Joe  20   85.10
b    Nat  21   77.80
c  Harry  19   91.54
d    Nat  21   77.80
-------------------
   name   age  marks
a    Joe  20   85.10
c  Harry  19   91.54
-------------------
   name   age  marks
0    Joe  20   85.10
1  Harry  19   91.54


In [48]:
student_dict = {
    "name": ["Joe", "Sam", "Harry"],
    "age": [20, 21, 19],
    "marks": [85.10, np.nan, 91.54],
}

student_df = pd.DataFrame(student_dict)
print(student_df)

# axis=1 is the positional spelling of axis='columns':
# every column that contains at least one NaN is removed.
student_df = student_df.dropna(axis=1)

print(student_df)

   name   age  marks
0    Joe  20   85.10
1    Sam  21     NaN
2  Harry  19   91.54
   name   age
0    Joe  20 
1    Sam  21 
2  Harry  19 


In [50]:
# how='any' (the default) removes a column as soon as one of its values is NA;
# how='all' removes a column only when every one of its values is NA.

student_dict = {
    "name": ["Joe", "Sam", np.nan, "Harry"],
    "age": [np.nan] * 4,
    "marks": [85.10, np.nan, np.nan, 91.54],
}

# Build the frame from the dict
student_df = pd.DataFrame(student_dict)
print(student_df)
print("------------------")
# only 'age' is entirely NA, so it is the only column removed
student_df = student_df.dropna(axis=1, how='all')
print(student_df)

   name   age  marks
0    Joe NaN   85.10
1    Sam NaN     NaN
2    NaN NaN     NaN
3  Harry NaN   91.54
------------------
   name   marks
0    Joe  85.10
1    Sam    NaN
2    NaN    NaN
3  Harry  91.54


In [52]:
# Drop columns according to how many non-NA values they hold

# thresh = minimum number of NON-NA values a column must have to be kept
# (it is not a count of NA values)

student_dict = {"name": ["Joe", "Sam", np.nan, "Harry"], "age": [np.nan, np.nan, np.nan, np.nan],
                "marks": [85.10, np.nan, np.nan, 91.54]}

student_df = pd.DataFrame(student_dict)
print(student_df)

print("---------------------")

# keep columns with 3 or more non-NA values:
# 'name' has 3 and stays, 'marks' has 2 and 'age' has 0, so both are dropped
student_df = student_df.dropna(axis='columns', thresh=3)
print(student_df)

   name   age  marks
0    Joe NaN   85.10
1    Sam NaN     NaN
2    NaN NaN     NaN
3  Harry NaN   91.54
---------------------
   name 
0    Joe
1    Sam
2    NaN
3  Harry


In [54]:
# Drop columns that are NA in selected rows, then drop rows that still contain NA
student_dict = {"name": ["Joe", "Sam", "Harry"], "age": [np.nan, np.nan, np.nan], "marks": [85.10, np.nan, 91.54]}

student_df = pd.DataFrame(student_dict)
print(student_df)

print("-----------------------")

# With axis='columns', subset lists ROW labels: a column is dropped when it
# holds NA in row 0 or row 2. 'marks' is NA only in row 1, so it survives.
student_df = student_df.dropna(axis='columns', subset=[0, 2])

print(student_df)

print("-----------------------")

# Reassign instead of using inplace=True: same result, but the cell follows the
# same pattern as every other dropna call in this notebook and stays chainable.
student_df = student_df.dropna()

print(student_df)

   name   age  marks
0    Joe NaN   85.10
1    Sam NaN     NaN
2  Harry NaN   91.54
-----------------------
   name   marks
0    Joe  85.10
1    Sam    NaN
2  Harry  91.54
-----------------------
   name   marks
0    Joe  85.10
2  Harry  91.54


In [55]:
# Remove several columns with a single drop() call
student_dict = {
    "name": ["Joe", "Nat"],
    "age": [20, 21],
    "marks": [85.10, 77.80],
}

student_df = pd.DataFrame(student_dict)
print(student_df.columns.values)

# both labels are removed in one call
columns_to_drop = ['age', 'marks']
student_df = student_df.drop(columns=columns_to_drop)

print(student_df.columns.values)

['name' 'age' 'marks']
['name']


In [72]:
# DataFrame join

# left-hand frame: one row per student
student_dict = {
    'Name': ['Joe', 'Nat'],
    'Age': [20, 21],
}
student_df = pd.DataFrame(student_dict)
print(student_df)
print("-----------------------")

# right-hand frame: marks only, sharing the same default 0..n-1 row index
marks_dict = {'Marks': [85.10, 77.80]}
marks_df = pd.DataFrame(marks_dict)
print(marks_df)
print("-----------------------")

# join() lines the two frames up on their row index and appends the
# right-hand columns to the left-hand frame

joined_df = student_df.join(other=marks_df)
print(joined_df)
print("------------------------")

# apply groupby on dataframe

student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Class': ['A', 'B', 'A'], 'Marks': [85.10, 77.80, 91.54]}
student_df = pd.DataFrame(student_dict)
print(student_df)

print("-----------------------")
# 'Name' holds strings and cannot be averaged. Older pandas dropped it silently
# with a deprecation warning; pandas 2.x raises TypeError instead. Passing
# numeric_only=True restricts the mean to numeric columns explicitly, so the
# result (mean Marks per Class) is the same on every version.
student_df = student_df.groupby('Class').mean(numeric_only=True)
print(student_df)

  Name  Age
0  Joe  20 
1  Nat  21 
-----------------------
   Marks
0  85.1 
1  77.8 
-----------------------
  Name  Age  Marks
0  Joe  20   85.1 
1  Nat  21   77.8 
------------------------
   Name  Class  Marks
0    Joe    A   85.10
1    Nat    B   77.80
2  Harry    A   91.54
-----------------------
       Marks
Class       
A      88.32
B      77.80


  student_df = student_df.groupby('Class').mean()


In [76]:
# DataFrame sorting

student_dict = {
    'Name': ['Joe', 'Nat', 'Harry'],
    'Age': [20, 21, 19],
    'Marks': [85.10, 77.80, 91.54],
}
student_df = pd.DataFrame(student_dict)
print(student_df)
print("---------------------")

# sort_values returns a new frame ordered by "Marks" (ascending);
# student_df itself keeps its original row order

sorted_by_marks = student_df.sort_values(by='Marks')
print(sorted_by_marks)
print("--------------------------")

# to_dict() with no arguments gives {column -> {row label -> value}}

as_nested_dict = student_df.to_dict()
print(as_nested_dict)

   Name   Age  Marks
0    Joe  20   85.10
1    Nat  21   77.80
2  Harry  19   91.54
---------------------
   Name   Age  Marks
1    Nat  21   77.80
0    Joe  20   85.10
2  Harry  19   91.54
--------------------------
{'Name': {0: 'Joe', 1: 'Nat', 2: 'Harry'}, 'Age': {0: 20, 1: 21, 2: 19}, 'Marks': {0: 85.1, 1: 77.8, 2: 91.54}}


###### Data manipulation on "Automobile_data.csv"

In [13]:
# Display settings applied to every DataFrame printed after this cell runs
display_options = {
    'display.max_rows': None,               # show all rows
    'display.max_columns': None,            # show all columns
    "display.width": 1000,                  # set display width
    'display.colheader_justify': 'center',  # center-align column headers
}

for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)



In [14]:
# Load the car dataset; index_col=0 makes the CSV's first column the row index
auto = pd.read_csv('Automobile_data.csv',index_col=0)

In [65]:
auto.head() # preview the first 5 rows

Unnamed: 0_level_0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18,17450.0


In [11]:
auto.head() # first 5 rows (the count used when none is passed)

Unnamed: 0_level_0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18,17450.0


In [12]:
auto.tail() # last 5 rows

Unnamed: 0_level_0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
81,volkswagen,sedan,97.3,171.7,ohc,four,85,27,7975.0
82,volkswagen,sedan,97.3,171.7,ohc,four,52,37,7995.0
86,volkswagen,sedan,97.3,171.7,ohc,four,100,26,9995.0
87,volvo,sedan,104.3,188.8,ohc,four,114,23,12940.0
88,volvo,wagon,104.3,188.8,ohc,four,114,23,13415.0


In [16]:
auto.info() # prints metadata: index range, column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 88
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           61 non-null     object 
 1   body-style        61 non-null     object 
 2   wheel-base        61 non-null     float64
 3   length            61 non-null     float64
 4   engine-type       61 non-null     object 
 5   num-of-cylinders  61 non-null     object 
 6   horsepower        61 non-null     int64  
 7   average-mileage   61 non-null     int64  
 8   price             58 non-null     float64
dtypes: float64(3), int64(2), object(4)
memory usage: 4.8+ KB


In [17]:
auto.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,wheel-base,length,horsepower,average-mileage,price
count,61.0,61.0,61.0,61.0,58.0
mean,98.481967,173.098361,107.852459,25.803279,15387.0
std,6.679234,14.021846,53.524398,8.129821,11320.259841
min,88.4,141.1,48.0,13.0,5151.0
25%,94.5,159.1,68.0,19.0,6808.5
50%,96.3,171.2,100.0,25.0,11095.0
75%,101.2,177.3,123.0,31.0,18120.5
max,120.9,208.1,288.0,47.0,45400.0


###### data attributes 

In [19]:
#1
auto.index  # the row index labels (here a non-contiguous integer index named 'index')

Int64Index([0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 61, 62, 63, 66, 67, 68, 69, 70, 71, 79, 80, 81, 82, 86, 87, 88], dtype='int64', name='index')

In [20]:
#2
auto.columns # the column labels, returned as an Index object

Index(['company', 'body-style', 'wheel-base', 'length', 'engine-type', 'num-of-cylinders', 'horsepower', 'average-mileage', 'price'], dtype='object')

In [21]:
#3
auto.dtypes # the data type of each column, keyed by column name

company              object
body-style           object
wheel-base          float64
length              float64
engine-type          object
num-of-cylinders     object
horsepower            int64
average-mileage       int64
price               float64
dtype: object

In [66]:
#4
#auto.values # it gives all the rows in DataFrame

In [23]:
#5
auto.empty # True when the DataFrame holds no elements; False for this dataset

False

In [24]:
#6
auto.size # total number of cells in the DataFrame (rows x columns)

549

In [25]:
#7
auto.shape # it gives the (number of rows, number of columns) of the DataFrame as a tuple


(61, 9)

###### DataFrame selection

In [28]:
auto.at[0,'company'] # select a single value by row label and column label

'alfa-romero'

In [29]:
auto.iat[0,0] # select a single value by integer row position and column position

'alfa-romero'

In [67]:
# auto.get('company') # select an individual columns

In [32]:
auto.loc[0:2,['company']] # label-based: rows labelled 0 through 2 (end label included) and the 'company' column, as a DataFrame

Unnamed: 0_level_0,company
index,Unnamed: 1_level_1
0,alfa-romero
1,alfa-romero
2,alfa-romero


In [33]:
auto.iloc[0:2,0:2] # position-based: first two rows and first two columns (end position excluded)

Unnamed: 0_level_0,company,body-style
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,alfa-romero,convertible
1,alfa-romero,convertible


In [58]:
# Apply conditions


# To update values based on a condition, use where(): it keeps each value where
# the condition is True and replaces it with `other` where the condition is False.

# where(cond, other=new_value)

# Named price_mask rather than `filter` so the Python builtin filter() is not shadowed
price_mask = auto['price'] > 16500

# auto['price'] selects a single column, so auto_df is a Series here (not a DataFrame)
auto_df = auto['price'].where(price_mask, other=0)
print(auto_df.head())  # every price that is not above 16500 (exactly 16500 and missing prices included) shows as 0

index
0        0.0
1        0.0
2        0.0
3        0.0
4    17450.0
Name: price, dtype: float64


In [63]:
# rename() returns a new DataFrame with 'price' relabelled 'Price'; auto keeps its original column name
auto_df = auto.rename(columns = {'price':'Price'})

In [64]:
auto_df.head() # confirm the last column is now labelled 'Price'

Unnamed: 0_level_0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,Price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495.0
1,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500.0
2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500.0
3,audi,sedan,99.8,176.6,ohc,four,102,24,13950.0
4,audi,sedan,99.4,176.6,ohc,five,115,18,17450.0


###### Questions on Automobile_data

In [80]:
auto.nunique() # number of distinct values in each column

company             16
body-style           5
wheel-base          27
length              36
engine-type          7
num-of-cylinders     7
horsepower          30
average-mileage     21
price               57
dtype: int64

In [82]:
#1 count total cars per company
auto['company'].value_counts() # number of rows per company, most frequent first

toyota           7
bmw              6
mazda            5
nissan           5
audi             4
mercedes-benz    4
mitsubishi       4
volkswagen       4
alfa-romero      3
chevrolet        3
honda            3
isuzu            3
jaguar           3
porsche          3
dodge            2
volvo            2
Name: company, dtype: int64

In [83]:
auto['company'].unique() # the distinct company names, as a NumPy array

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mitsubishi',
       'nissan', 'porsche', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [86]:
#2 find the most expensive car company name

# boolean mask marking the row(s) priced at the dataset maximum;
# rows and columns are then selected in a single .loc call
is_max_price = auto['price'] == auto['price'].max()
print(auto.loc[is_max_price, ['company', 'price']])

         company       price 
index                        
47     mercedes-benz  45400.0


In [90]:
#3. print all toyota car details

# Method 1: boolean mask on the company column
is_toyota = auto['company'] == "toyota"
print(auto[is_toyota])
# Method 2: pull the 'toyota' group out of a groupby on company
cars_by_company = auto.groupby('company')
print(cars_by_company.get_group('toyota'))

      company body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage   price 
index                                                                                                          
66     toyota  hatchback      95.7     158.7      ohc          four             62           35          5348.0
67     toyota  hatchback      95.7     158.7      ohc          four             62           31          6338.0
68     toyota  hatchback      95.7     158.7      ohc          four             62           31          6488.0
69     toyota      wagon      95.7     169.7      ohc          four             62           31          6918.0
70     toyota      wagon      95.7     169.7      ohc          four             62           27          7898.0
71     toyota      wagon      95.7     169.7      ohc          four             62           27          8778.0
79     toyota      wagon     104.5     187.8     dohc           six            156           19         

In [95]:
#4. find each company's highest-priced car
# group the price column by company, then take the maximum within each group
price_by_company = auto.groupby('company')['price']
print(price_by_company.max())

company
alfa-romero      16500.0
audi             18920.0
bmw              41315.0
chevrolet         6575.0
dodge             6377.0
honda            12945.0
isuzu             6785.0
jaguar           36000.0
mazda            18344.0
mercedes-benz    45400.0
mitsubishi        8189.0
nissan           13499.0
porsche          37028.0
toyota           15750.0
volkswagen        9995.0
volvo            13415.0
Name: price, dtype: float64


In [98]:
#5 find the average mileage of each car making company
# (the heading previously said "price", but the column averaged here is 'average-mileage')

print(auto.groupby('company')['average-mileage'].mean())

company
alfa-romero      20.333333
audi             20.000000
bmw              19.000000
chevrolet        41.000000
dodge            31.000000
honda            26.333333
isuzu            33.333333
jaguar           14.333333
mazda            28.000000
mercedes-benz    18.000000
mitsubishi       29.500000
nissan           31.400000
porsche          17.000000
toyota           28.714286
volkswagen       31.750000
volvo            23.000000
Name: average-mileage, dtype: float64


In [99]:
#6. sort all cars by the price column (lowest price first)
auto.sort_values(by='price')

Unnamed: 0_level_0,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16,chevrolet,hatchback,88.4,141.1,l,three,48,47,5151.0
36,mazda,hatchback,93.1,159.1,ohc,four,68,30,5195.0
66,toyota,hatchback,95.7,158.7,ohc,four,62,35,5348.0
49,mitsubishi,hatchback,93.7,157.3,ohc,four,68,37,5389.0
37,mazda,hatchback,93.1,159.1,ohc,four,68,31,6095.0
50,mitsubishi,hatchback,93.7,157.3,ohc,four,68,31,6189.0
20,dodge,hatchback,93.7,157.3,ohc,four,68,31,6229.0
17,chevrolet,hatchback,94.5,155.9,ohc,four,70,38,6295.0
67,toyota,hatchback,95.7,158.7,ohc,four,62,31,6338.0
19,dodge,hatchback,93.7,157.3,ohc,four,68,31,6377.0
