## Handling Missing Data - replace method

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("weather_data_replace.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [6]:
df.describe().T[["mean"]]

Unnamed: 0,mean
temperature,-28548.714286
windspeed,-28567.285714


In [7]:
df.isna().sum()

Unnamed: 0,0
day,0
temperature,0
windspeed,0
event,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          7 non-null      object
 1   temperature  7 non-null      int64 
 2   windspeed    7 non-null      int64 
 3   event        7 non-null      object
dtypes: int64(2), object(2)
memory usage: 352.0+ bytes


In [9]:
for i in df.columns:
    try:
        print(i," : " ,df[i].mean())
    except:
        print("Column " + i + " is not numeric")

Column day is not numeric
temperature  :  -28548.714285714286
windspeed  :  -28567.285714285714
Column event is not numeric


In [10]:
mintemp = -100
maxtemp = 100
df [ (df.temperature < mintemp) | (df.temperature > maxtemp)]

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,-99999,7,Sunny
3,1/4/2017,-99999,7,0


### Replacing single value

In [11]:
new_df = df.replace(-99999, value=5)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,5,7,Sunny
2,1/3/2017,28,5,Snow
3,1/4/2017,5,7,0
4,1/5/2017,32,5,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


### Replacing list with single value

In [12]:
new_df = df.replace(to_replace=[-99999,-88888, 10, '0'], value=5)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,5,7,Sunny
2,1/3/2017,28,5,Snow
3,1/4/2017,5,7,5
4,1/5/2017,32,5,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,5


### Replacing per column

In [13]:
new_df = df.replace({
        'temperature': -99999,
        'windspeed': -99999,
        'day': '1/1/2017',
        'event': '0'
    }, np.nan)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


### Replacing by using mapping

In [14]:
new_df = df.replace({
        -99999: np.nan,
        '0': 'Sunny',
    })
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,Sunny
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,Sunny


## Regex

In [15]:
# when windspeed is 6 mph, 7 mph etc. & temperature is 32 F, 28 F etc.
new_df = df.replace({'temperature': '[A-Za-z]', 'windspeed': '[a-z]'},'', regex=True)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


### Replacing list with another list

In [16]:
df = pd.DataFrame({
    'student': ['rob', 'maya', 'parthiv', 'tom', 'julian', 'erica'],
     'score': ['exceptional','average', 'good', 'poor', 'average', 'exceptional']
})
df

Unnamed: 0,student,score
0,rob,exceptional
1,maya,average
2,parthiv,good
3,tom,poor
4,julian,average
5,erica,exceptional


In [17]:
df.score.value_counts()

Unnamed: 0_level_0,count
score,Unnamed: 1_level_1
exceptional,2
average,2
good,1
poor,1


In [18]:
df.replace(['poor', 'good', 'average', 'exceptional'], [4,8,16,30])

  df.replace(['poor', 'good', 'average', 'exceptional'], [4,8,16,30])


Unnamed: 0,student,score
0,rob,30
1,maya,16
2,parthiv,8
3,tom,4
4,julian,16
5,erica,30


In [19]:
df.replace(['poor', 'average', 'good', 'exceptional'], [2,4,10,30] , inplace=True)
df

  df.replace(['poor', 'average', 'good', 'exceptional'], [2,4,10,30] , inplace=True)


Unnamed: 0,student,score
0,rob,30
1,maya,4
2,parthiv,10
3,tom,2
4,julian,4
5,erica,30


In [20]:
df.score.dtype

dtype('int64')

In [21]:
len(df)

6

In [22]:
def replcevalues():
    result = []
    for i in df.score:

        if int(i) < 5 :
            result.append("Failed")
        elif int(i) > 5 and int(i) < 20:
            result.append("Weak")
        else:
            result.append("Okay")

    print(result)
    return result

In [23]:
df["numnericalscores"]= replcevalues()
df

['Okay', 'Failed', 'Weak', 'Failed', 'Failed', 'Okay']


Unnamed: 0,student,score,numnericalscores
0,rob,30,Okay
1,maya,4,Failed
2,parthiv,10,Weak
3,tom,2,Failed
4,julian,4,Failed
5,erica,30,Okay


## handling_missing_data_fillna_dropna_interpolate

In [26]:
import pandas as pd
df = pd.read_csv("/content/weather_data (1).csv", parse_dates=['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [27]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [28]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


### fillna
##### Fill all NaN with one specific value

In [29]:
new_df = df.fillna(888)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,888.0,9.0,Sunny
2,2017-01-05,28.0,888.0,Snow
3,2017-01-06,888.0,7.0,888
4,2017-01-07,32.0,888.0,Rain
5,2017-01-08,888.0,888.0,Sunny
6,2017-01-09,888.0,888.0,888
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### Fill na using column names and dict

In [30]:
new_df = df.fillna({
        'temperature': 0,
        'windspeed':df["windspeed"].mean(),
        'event': 'No Event'
    })
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,0.0,9.0,Sunny
2,2017-01-05,28.0,8.4,Snow
3,2017-01-06,0.0,7.0,No Event
4,2017-01-07,32.0,8.4,Rain
5,2017-01-08,0.0,8.4,Sunny
6,2017-01-09,0.0,8.4,No Event
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### Use method to determine how to fill na values

In [31]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [32]:
new_df = df.fillna(method="ffill")
new_df

  new_df = df.fillna(method="ffill")


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,7.0,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [33]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [34]:
new_df = df.fillna(method="bfill")
new_df

  new_df = df.fillna(method="bfill")


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,28.0,9.0,Sunny
2,2017-01-05,28.0,7.0,Snow
3,2017-01-06,32.0,7.0,Rain
4,2017-01-07,32.0,8.0,Rain
5,2017-01-08,34.0,8.0,Sunny
6,2017-01-09,34.0,8.0,Cloudy
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### Use of Axis

In [35]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [36]:
new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
new_df

  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01 00:00:00,32.0,6.0,Rain
1,2017-01-04 00:00:00,9.0,9.0,Sunny
2,2017-01-05 00:00:00,28.0,Snow,Snow
3,2017-01-06 00:00:00,7.0,7.0,
4,2017-01-07 00:00:00,32.0,Rain,Rain
5,2017-01-08 00:00:00,Sunny,Sunny,Sunny
6,2017-01-09 00:00:00,NaT,NaT,NaT
7,2017-01-10 00:00:00,34.0,8.0,Cloudy
8,2017-01-11 00:00:00,40.0,12.0,Sunny


#### limit parameter

In [37]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [39]:
new_df = df.fillna(method="ffill",limit=2)
new_df

  new_df = df.fillna(method="ffill",limit=2)


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


### interpolate


In [40]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [41]:
df.set_index('day',inplace=True)

In [42]:
new_df = df.interpolate()
new_df

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [43]:
new_df = df.interpolate(method="time")
new_df

  new_df = df.interpolate(method="time")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [44]:
new_df.temperature = new_df.temperature.astype(int)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6.0,Rain
2017-01-04,29,9.0,Sunny
2017-01-05,28,8.0,Snow
2017-01-06,30,7.0,
2017-01-07,32,7.25,Rain
2017-01-08,32,7.5,Sunny
2017-01-09,33,7.75,
2017-01-10,34,8.0,Cloudy
2017-01-11,40,12.0,Sunny


### dropna

In [45]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [46]:
new_df = df.dropna()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [47]:
new_df = df.dropna(how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [48]:
new_df = df.dropna(thresh=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### Inserting Missing Dates

In [49]:
dt = pd.date_range("01-01-2017","01-11-2017")
print(dt)
idx = pd.DatetimeIndex(dt)
df.reindex(idx)

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


In [50]:
newdf = df.drop(["event","windspeed"], axis = 1) #drop columns
newdf

Unnamed: 0_level_0,temperature
day,Unnamed: 1_level_1
2017-01-01,32.0
2017-01-04,
2017-01-05,28.0
2017-01-06,
2017-01-07,32.0
2017-01-08,
2017-01-09,
2017-01-10,34.0
2017-01-11,40.0
