## Pandas Basics:

In [9]:
import pandas as pd

In [10]:
# Build a labelled Series: four integers keyed by the letters a-d.
s = pd.Series(data=[10, 20, 30, 40], index=list("abcd"))
print("Series:\n", s)

Series:
 a    10
b    20
c    30
d    40
dtype: int64


## Example 1:

In [11]:
# Parallel lists: the price at position i belongs to the car name at position i.
prices = [
    649000,
    390102,
    726354,
    1763839,
    1093374,
]
carnames = [
    'swift',
    'santro',
    'audi',
    'elantra',
    'bolero',
]

In [12]:
# Pair each price with its car name; the names become the index labels.
car_series = pd.Series(prices, index=carnames)
car_series

swift       649000
santro      390102
audi        726354
elantra    1763839
bolero     1093374
dtype: int64

## Example 2: (Accessing data from a Series using logical conditions)

In [14]:
# Car name -> price mapping; the dict keys become the Series index labels.
entries = dict(
    swift=649000,
    santro=390102,
    audi=726354,
    elantra=1763839,
    bolero=1093374,
)
# Name the Series 'price' so the label shows up in its repr.
car_series1 = pd.Series(entries, name='price')
car_series1

swift       649000
santro      390102
audi        726354
elantra    1763839
bolero     1093374
Name: price, dtype: int64

In [15]:
# Element-wise comparison: True for every car priced above one million.
car_series1.gt(1_000_000)

swift      False
santro     False
audi       False
elantra     True
bolero      True
Name: price, dtype: bool

In [16]:
# Keep only the entries whose boolean mask is True (price above one million).
car_series1.loc[car_series1.gt(1_000_000)]

elantra    1763839
bolero     1093374
Name: price, dtype: int64

## 🧠 Quiz1:

In [17]:
# Customer names and their bill amounts, kept in matching order.
cust_names = ['Hemag', 'Farheen', 'Himadri', 'Monisha']
cust_bill = [256.78, 434.53, 109.25, 529.42]
# Bills are the values; customer names are the index labels.
cust_info = pd.Series(data=cust_bill, index=cust_names)

In [19]:
# Customers whose bill exceeds 300.
cust_info.loc[cust_info.gt(300)]

Farheen    434.53
Monisha    529.42
dtype: float64

## Example 3:(Accessing data from series using `.loc[]` method)

In [2]:
import pandas as pd

# Recreate the car-price Series so this example runs on its own.
entries = dict(
    swift=649000,
    santro=390102,
    audi=726354,
    elantra=1763839,
    bolero=1093374,
)
car_series1 = pd.Series(entries, name='price')
car_series1

swift       649000
santro      390102
audi        726354
elantra    1763839
bolero     1093374
Name: price, dtype: int64

In [3]:
# A single label returns the stored value itself (a scalar), not a Series.
car_series1.loc['swift']

np.int64(649000)

In [5]:
# A list of labels returns a Series, even when the list holds just one label.
car_series1.loc[['swift']]

swift    649000
Name: price, dtype: int64

In [7]:
# Label slices include BOTH endpoints: 'audi' is part of the result.
car_series1.loc['swift':'audi']

swift     649000
santro    390102
audi      726354
Name: price, dtype: int64

## Example 4:(Accessing data from series using `.iloc[]` method)

In [8]:
import pandas as pd

# Same car-price Series again, rebuilt for the positional-indexing examples.
entries = dict(
    swift=649000,
    santro=390102,
    audi=726354,
    elantra=1763839,
    bolero=1093374,
)
car_series1 = pd.Series(entries, name='price')
car_series1

swift       649000
santro      390102
audi        726354
elantra    1763839
bolero     1093374
Name: price, dtype: int64

In [9]:
# A single integer position returns the value at that position (a scalar).
car_series1.iloc[0]

np.int64(649000)

In [14]:
# A list of positions returns a Series, keeping the index label alongside the value.
car_series1.iloc[[0]]

swift    649000
Name: price, dtype: int64

In [13]:
# Positional slices exclude the stop position: this returns positions 0, 1 and 2.
car_series1.iloc[0:3]

swift     649000
santro    390102
audi      726354
Name: price, dtype: int64

## 🧠 Quiz2:

In [15]:
import pandas as pd

In [17]:
# Customer names and their bill amounts, kept in matching order.
cust_names = ['Mahesh', 'Farheen', 'Himdari', 'Dalle']
cust_bill = [256.78, 434.53, 109.25, 529.4]
# Bills are the values; customer names are the index labels.
cust_info = pd.Series(data=cust_bill, index=cust_names)

In [18]:
# Display the full customer Series before indexing into it.
cust_info

Mahesh     256.78
Farheen    434.53
Himdari    109.25
Dalle      529.40
dtype: float64

In [27]:
# start:stop:step slice by position: start at 0, step by 3, stop before 4 -> positions 0 and 3.
cust_info.iloc[0:4:3]

Mahesh    256.78
Dalle     529.40
dtype: float64

In [32]:
# Same two rows as the positional slice, this time selected by their labels.
cust_info.loc[['Mahesh','Dalle']]

Mahesh    256.78
Dalle     529.40
dtype: float64

## Handling DataFrames:

1) DataFrame

In [39]:
# Build a small product table from a dict of equal-length columns:
# each key becomes a column name, each list the values of that column.
# Uses the `pd` alias so this cell matches every other cell in the notebook.
import pandas as pd
df = pd.DataFrame({
    'Product' : ['A','B','C','D'],
    'Price' : [350,450,550,650],
    'Quantity' : [4,5,6,7]
})
print("\nDataFrame:\n",df)


DataFrame:
   Product  Price  Quantity
0       A    350         4
1       B    450         5
2       C    550         6
3       D    650         7


2) CSV file

In [36]:
import pandas as pd

In [37]:
# Retail sample data: one row per product with its price and stock quantity.
df_csv = pd.DataFrame(
    dict(
        Product=['Soap', 'Shampoo', 'Socks', 'Commashock'],
        Price=[235, 445, 120, 1005],
        Quantity=[100, 45, 135, 5],
    )
)
df_csv

Unnamed: 0,Product,Price,Quantity
0,Soap,235,100
1,Shampoo,445,45
2,Socks,120,135
3,Commashock,1005,5


In [42]:
# Save the DataFrame to a CSV file for later use.
# index=False keeps the row index out of the file, so only the three data columns are written.
df_csv.to_csv("retail_sales.csv", index = False)
print('file created successfully!')

file created successfully!


In [44]:
# Read the CSV file written above back into a new DataFrame using pandas.
sales_data = pd.read_csv("retail_sales.csv")
print("\n Csv data from 'retail_sales.csv':\n\n",sales_data)


 Csv data from 'retail_sales.csv':

       Product  Price  Quantity
0        Soap    235       100
1     Shampoo    445        45
2       Socks    120       135
3  Commashock   1005         5


In [53]:
# Quiz: select several columns at once by passing a list of column names.
sales_data.loc[:, ['Price', 'Quantity']]

Unnamed: 0,Price,Quantity
0,235,100
1,445,45
2,120,135
3,1005,5


## Example 5:(Loading Data & Finding missing values)

In [23]:
import pandas as pd

In [24]:
# Load the dataset used by the missing-value examples and preview its first five rows.
# NOTE(review): the outputs list an 'Unnamed: 0' column -- presumably a row index that
# was written into the CSV; confirm whether the file should be read with index_col=0.
df = pd.read_csv("data_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,1,41.0,190.0,7.4,67,5,1,2010,67,S
1,2,36.0,118.0,8.0,72,5,2,2010,72,C
2,3,12.0,149.0,12.6,74,5,3,2010,74,PS
3,4,18.0,313.0,11.5,62,5,4,2010,62,S
4,5,,,14.3,56,5,5,2010,56,S


In [25]:
# Element-wise missing-value mask: True where a cell is missing, False where data is present.
# This inspects `df` (loaded from data_clean.csv), not the retail `sales_data` table.
df.isnull()

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
153,False,False,False,False,False,False,False,False,False,False
154,False,False,False,False,False,False,False,False,False,False
155,False,True,False,False,False,False,False,False,False,False
156,False,False,False,False,False,False,False,False,False,False


In [26]:
# Count the missing values in each column; a count of 0 means nothing is missing there.
df.isna().sum()

Unnamed: 0     0
Ozone         38
Solar.R        7
Wind           0
Temp C         0
Month          0
Day            0
Year           0
Temp           0
Weather        3
dtype: int64

## Example 6:(Imputation with Mean/Median/Mode)

In [27]:
# Inspect the Ozone column before imputation (NaN marks the missing readings).
df['Ozone']

0      41.0
1      36.0
2      12.0
3      18.0
4       NaN
       ... 
153    41.0
154    30.0
155     NaN
156    14.0
157    18.0
Name: Ozone, Length: 158, dtype: float64

In [28]:
# Use the median as the fill value for skewed data.
# Series.median() skips NaN entries by default, so the missing readings do not affect it.
ozone_median = df['Ozone'].median()
print(ozone_median)

30.5


In [31]:
# Replace the missing Ozone readings with the median computed above.
# Assign the result back to the column rather than calling fillna(inplace=True) on
# df['Ozone']: the in-place call acts on an intermediate object, is deprecated in
# pandas 2.x, and does not update df when Copy-on-Write is enabled.
df['Ozone'] = df['Ozone'].fillna(ozone_median)

In [32]:
# Re-inspect the Ozone column to confirm the NaN entries were replaced by the median.
df['Ozone']

0      41.0
1      36.0
2      12.0
3      18.0
4      30.5
       ... 
153    41.0
154    30.0
155    30.5
156    14.0
157    18.0
Name: Ozone, Length: 158, dtype: float64

In [33]:
# Inspect the Solar.R column before imputation (NaN marks the missing readings).
df['Solar.R']

0      190.0
1      118.0
2      149.0
3      313.0
4        NaN
       ...  
153    190.0
154    193.0
155    145.0
156    191.0
157    131.0
Name: Solar.R, Length: 158, dtype: float64

In [34]:
# Use the mean as the fill value for (approximately) normally distributed data.
# Series.mean() skips NaN entries by default.
solar_mean = df['Solar.R'].mean()
# The value is a mean, not a median; the old name is kept as an alias because
# the fill cell that follows still reads `solar_median`.
solar_median = solar_mean
print(solar_mean)

185.40397350993376


In [35]:
# Replace the missing Solar.R readings with the column mean
# (`solar_median` holds the mean, despite its name).
# Assign back to the column instead of using fillna(inplace=True) on df['Solar.R']:
# the chained in-place call is deprecated in pandas 2.x and does not update df
# when Copy-on-Write is enabled.
df['Solar.R'] = df['Solar.R'].fillna(solar_median)

In [37]:
# Re-inspect the Solar.R column to confirm the NaN entries were replaced by the mean.
df['Solar.R']

0      190.000000
1      118.000000
2      149.000000
3      313.000000
4      185.403974
          ...    
153    190.000000
154    193.000000
155    145.000000
156    191.000000
157    131.000000
Name: Solar.R, Length: 158, dtype: float64

In [38]:
# Inspect the Weather column (object dtype, short text codes) before imputation.
df['Weather']

0       S
1       C
2      PS
3       S
4       S
       ..
153     C
154    PS
155     S
156     S
157     C
Name: Weather, Length: 158, dtype: object

In [46]:
# Categorical/object column: impute with the most frequent value (the mode).
# mode() returns a Series because ties are possible; take its first entry.
weather_mode = df['Weather'].mode().iloc[0]
print(weather_mode)

S


In [41]:
# Replace the missing Weather entries with the most frequent category.
# Assign back to the column instead of using fillna(inplace=True) on df['Weather']:
# the chained in-place call is deprecated in pandas 2.x and does not update df
# when Copy-on-Write is enabled.
df['Weather'] = df['Weather'].fillna(weather_mode)

In [42]:
# Re-inspect the Weather column after filling the missing entries with the mode.
df['Weather']

0       S
1       C
2      PS
3       S
4       S
       ..
153     C
154    PS
155     S
156     S
157     C
Name: Weather, Length: 158, dtype: object

In [45]:
# Every count is 0 once the imputations above have run: the data is clean now.
df.isna().sum()

Unnamed: 0    0
Ozone         0
Solar.R       0
Wind          0
Temp C        0
Month         0
Day           0
Year          0
Temp          0
Weather       0
dtype: int64

## 🧠 Quiz3:

In [47]:
# Quiz task: fill missing Temp values with the median. Start by inspecting the column.
# Note: the earlier df.isnull().sum() output reports 0 missing values for Temp.
df['Temp']

0      67
1      72
2      74
3      62
4      56
       ..
153    67
154    70
155    77
156    75
157    76
Name: Temp, Length: 158, dtype: int64

In [50]:
# Median of the Temp column, used as the fill value in the next cell.
temp_median = df['Temp'].median()

In [51]:
# Fill any missing Temp values with the median, then preview the first five rows.
# Assign back to the column instead of using fillna(inplace=True) on df['Temp']:
# the chained in-place call is deprecated in pandas 2.x and does not update df
# when Copy-on-Write is enabled.
df['Temp'] = df['Temp'].fillna(temp_median)
print(df[['Temp']].head(5))

   Temp
0    67
1    72
2    74
3    62
4    56


## Example 7: (Group by 'Weather' and find the mean temperature)

In [52]:
# Split the rows by weather code, then average Temp within each group.
weather_group = df.groupby(by='Weather')
print(weather_group['Temp'].agg('mean'))

Weather
C     77.734694
PS    76.872340
S     78.370968
Name: Temp, dtype: float64


In [53]:
# Q: Group by 'Month' and find the maximum 'Wind' speed in each month.
# NOTE(review): the printed result has a separate 'May' group next to 5-9, so the
# Month column appears to mix numeric and text labels -- confirm and normalise upstream.
month_group = df.groupby(by='Month')
print(month_group['Wind'].agg('max'))

Month
5      20.1
6      20.7
7      14.9
8      15.5
9      16.6
May    12.0
Name: Wind, dtype: float64


## Example 8:(Sort by Temperature[Descending])

In [54]:
# Descending order: highest temperatures first. Returns a new frame; df itself is untouched.
sorted_df = df.sort_values('Temp', ascending=False)
sorted_df

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
119,120,76.0,203.000000,9.7,97,8,28,2010,97,S
121,122,84.0,237.000000,6.3,96,8,30,2010,96,S
120,121,118.0,225.000000,2.3,94,8,29,2010,94,S
122,123,85.0,188.000000,6.3,94,8,31,2010,94,C
125,126,73.0,183.000000,2.8,93,9,3,2010,93,C
...,...,...,...,...,...,...,...,...,...,...
25,26,30.5,266.000000,14.9,58,5,26,2010,58,C
17,18,6.0,78.000000,18.4,57,5,18,2010,57,C
26,27,30.5,185.403974,8.0,57,5,27,2010,57,PS
24,25,30.5,66.000000,16.6,57,5,25,2010,57,PS


In [55]:
# Ascending order (the sort_values default): lowest temperatures first.
sorted_df = df.sort_values('Temp')
sorted_df

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp C,Month,Day,Year,Temp,Weather
4,5,30.5,185.403974,14.3,56,5,5,2010,56,S
26,27,30.5,185.403974,8.0,57,5,27,2010,57,PS
24,25,30.5,66.000000,16.6,57,5,25,2010,57,PS
17,18,6.0,78.000000,18.4,57,5,18,2010,57,C
25,26,30.5,266.000000,14.9,58,5,26,2010,58,C
...,...,...,...,...,...,...,...,...,...,...
126,127,91.0,189.000000,4.6,93,9,4,2010,93,PS
120,121,118.0,225.000000,2.3,94,8,29,2010,94,S
122,123,85.0,188.000000,6.3,94,8,31,2010,94,C
121,122,84.0,237.000000,6.3,96,8,30,2010,96,S
