# Data Preprocessing for Climate Data 


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
%matplotlib inline

# Reading Dataset
### Dates which are present in the vegetable dataset those only consider for processing

In [3]:
climate = pd.read_csv('Climate_Nagpur.csv')
vegetables = pd.read_csv('Nagpur_Oranges.csv')
climate.head()
#len(vegetables)

Unnamed: 0,Year,Month,Date,Temp_Max,Temp_Avg,Temp_Min,Dew_Max,Dew_Avg,Dew_Min,Humid_Max,Humid_Avg,Humid_Min,Wind_Max,Wind_Avg,Wind_Min,Pressure_Max,Pressure_Avg,Pressure_Min,Precipitation_Total
0,2019,Jan,1,82,65.6,50,45,38.1,32,67,40.1,16,3,1.3,0,29.1,29.0,29.0,0
1,2019,Jan,2,86,68.2,50,52,43.5,37,72,45.3,18,5,0.8,0,29.1,29.0,28.9,0
2,2019,Jan,3,84,67.8,54,52,46.7,43,77,49.9,25,5,0.8,0,29.1,29.0,29.0,0
3,2019,Jan,4,82,67.9,54,50,47.0,43,77,50.3,28,3,1.4,0,29.1,28.4,0.0,0
4,2019,Jan,5,82,68.4,54,54,51.5,46,82,57.0,35,5,1.5,0,29.1,27.1,0.0,0


# String is converted into Required Format 

In [4]:
climate['Date Format'] = climate['Year'].map(str) + "-" + climate['Month'].astype(str) + "-" + climate['Date'].astype(str)
climate['Date Format'] = pd.to_datetime(climate['Date Format'])
climate.head()

Unnamed: 0,Year,Month,Date,Temp_Max,Temp_Avg,Temp_Min,Dew_Max,Dew_Avg,Dew_Min,Humid_Max,Humid_Avg,Humid_Min,Wind_Max,Wind_Avg,Wind_Min,Pressure_Max,Pressure_Avg,Pressure_Min,Precipitation_Total,Date Format
0,2019,Jan,1,82,65.6,50,45,38.1,32,67,40.1,16,3,1.3,0,29.1,29.0,29.0,0,2019-01-01
1,2019,Jan,2,86,68.2,50,52,43.5,37,72,45.3,18,5,0.8,0,29.1,29.0,28.9,0,2019-01-02
2,2019,Jan,3,84,67.8,54,52,46.7,43,77,49.9,25,5,0.8,0,29.1,29.0,29.0,0,2019-01-03
3,2019,Jan,4,82,67.9,54,50,47.0,43,77,50.3,28,3,1.4,0,29.1,28.4,0.0,0,2019-01-04
4,2019,Jan,5,82,68.4,54,54,51.5,46,82,57.0,35,5,1.5,0,29.1,27.1,0.0,0,2019-01-05


In [5]:
vegetables.head()
vegetables.shape
climate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790 entries, 0 to 789
Data columns (total 20 columns):
Year                   790 non-null int64
Month                  790 non-null object
Date                   790 non-null int64
Temp_Max               790 non-null int64
Temp_Avg               790 non-null float64
Temp_Min               790 non-null int64
Dew_Max                790 non-null int64
Dew_Avg                790 non-null float64
Dew_Min                790 non-null int64
Humid_Max              790 non-null int64
Humid_Avg              790 non-null float64
Humid_Min              790 non-null int64
Wind_Max               790 non-null int64
Wind_Avg               790 non-null float64
Wind_Min               790 non-null int64
Pressure_Max           790 non-null float64
Pressure_Avg           790 non-null float64
Pressure_Min           790 non-null float64
Precipitation_Total    790 non-null int64
Date Format            790 non-null datetime64[ns]
dtypes: datetime64[ns](1), floa

In [6]:
vegetables = vegetables[vegetables['District Name'] == "Nagpur"]
vegetables = vegetables[vegetables['Market Name'] == "Nagpur"]
vegetables = vegetables[vegetables['Commodity'] == "Orange"]
vegetables.head()

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
15,16,Nagpur,Nagpur,Orange,Other,FAQ,900,3100,2550,30-Dec-19
16,17,Nagpur,Nagpur,Orange,Other,FAQ,800,3100,2525,28-Dec-19
17,18,Nagpur,Nagpur,Orange,Other,FAQ,800,3100,2525,26-Dec-19
18,19,Nagpur,Nagpur,Orange,Other,FAQ,900,3200,2625,24-Dec-19
19,20,Nagpur,Nagpur,Orange,Other,FAQ,900,3000,2475,23-Dec-19


# Function to generate dates 

In [7]:
pd.date_range(end='1/1/2018', periods= 2)

DatetimeIndex(['2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq='D')

In [8]:
vegetables['Price Date'] = pd.to_datetime(vegetables['Price Date'])
vegetables = vegetables.iloc[::-1]
#vegetables.head(10)

# Logic for averaging all parameters

In [9]:
final = []
days_required = 540.0
for row1 in vegetables['Price Date']:
    Temp_Avg, Dew_Avg,Humid_Avg,Wind_Avg,Pressure_Avg, Precipitation_Total = 0.0,0.0,0.0,0.0,0.0,0.0
    Temp_Max, Temp_Min, Dew_Max, Dew_Min, Humid_Max, Humid_Min,Wind_Max, Wind_Min, Pressure_Max, Pressure_Min = 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
    for row2 in pd.date_range(end=row1, periods= days_required):
        try:
            temp = climate[climate['Date Format'] == row2]
            
            Temp_Max += float(temp['Temp_Max'])
            Temp_Avg += float(temp['Temp_Avg'])
            Temp_Min += float(temp['Temp_Min'])
        
            Dew_Max += float(temp['Dew_Max'])
            Dew_Avg += float(temp['Dew_Avg'])
            Dew_Min += float(temp['Dew_Min'])
        
            Humid_Max += float(temp['Humid_Max'])
            Humid_Avg += float(temp['Humid_Avg'])
            Humid_Min += float(temp['Humid_Min'])
        
            Wind_Max += float(temp['Wind_Max'])
            Wind_Avg += float(temp['Wind_Avg'])
            Wind_Min += float(temp['Wind_Min'])
        
            Pressure_Avg += float(temp['Pressure_Avg'])
        
            Precipitation_Total += float(temp['Precipitation_Total'])
        except:
            continue
        
    final.append([row1,Temp_Max/days_required,Temp_Avg / days_required, Temp_Min / days_required,Dew_Min/ days_required,Dew_Avg/ days_required, Dew_Min/ days_required, Humid_Max/ days_required,Humid_Avg/ days_required, Humid_Min/ days_required, Wind_Max/ days_required, Wind_Avg/ days_required, Wind_Min/ days_required, Pressure_Max/ days_required, Pressure_Avg/ days_required, Pressure_Min/ days_required, Precipitation_Total/ days_required])

In [10]:
final

[[Timestamp('2019-01-01 00:00:00'),
  62.4,
  54.969259259259275,
  44.879629629629626,
  36.48888888888889,
  41.33962962962962,
  36.48888888888889,
  52.25925925925926,
  38.30814814814811,
  24.81851851851852,
  6.688888888888889,
  2.395925925925926,
  0.18888888888888888,
  0.0,
  19.386481481481447,
  0.0,
  0.0],
 [Timestamp('2019-01-02 00:00:00'),
  62.55925925925926,
  55.09555555555557,
  44.97222222222222,
  36.55740740740741,
  41.420185185185176,
  36.55740740740741,
  52.39259259259259,
  38.392037037037,
  24.85185185185185,
  6.698148148148148,
  2.397407407407407,
  0.18888888888888888,
  0.0,
  19.44018518518515,
  0.0,
  0.0],
 [Timestamp('2019-01-03 00:00:00'),
  62.714814814814815,
  55.22111111111113,
  45.07222222222222,
  36.63703703703704,
  41.50666666666666,
  36.63703703703704,
  52.535185185185185,
  38.48444444444441,
  24.89814814814815,
  6.707407407407407,
  2.3988888888888886,
  0.18888888888888888,
  0.0,
  19.493888888888854,
  0.0,
  0.0],
 [Timest

In [11]:
len(final)

141

In [None]:
#final

In [12]:
df = pd.DataFrame(data = final, columns = ['Date','Temp_Max', 'Temp_Avg', 'Temp_Min', 'Dew_Max', 'Dew_Avg', 'Dew_Min', 'Humid_Max', 'Humid_Avg','Humid_Min','Wind_Max','Wind_Avg','Wind_Min','Pressure_Max', 'Pressure_Avg','Pressure_Min', 'Precipitation_Total'])

# CSV File is generated here 

In [13]:
df.to_csv('climate_Banglore_540.csv') 

In [None]:
#for row2 in pd.date_range(end=row1, periods= days_required):
        #temp = climate[climate['Date Format'] == row2]
        #print(row2)
        '''Temp_Avg += int(temp['Temp_Avg'])
        Dew_Avg += int(temp['Dew_Avg'])
        Humid_Avg += int(temp['Humid_Avg'])
        Wind_Avg += int(temp['Wind_Avg']) 
        Pressure_Avg += int(temp['Pressure_Avg'])
        Precipitation_Total += int(temp['Precipitation_Total'])
    final.append([row1,Temp_Avg / month_required, Dew_Avg/month_required, Humid_Avg/month_required, Wind_Avg/month_required, Pressure_Avg/month_required,Precipitation_Total/month_required])'''
'''for row1 in vegetables['Price Date']:
    Temp_Avg, Dew_Avg,Humid_Avg,Wind_Avg,Pressure_Avg, Precipitation_Total = 0,0,0,0,0,0
    temp = climate[climate['Date Format'] == row1]
    print(temp)'''
'''
Temp_Max += temp['Temp_Max'].astype(float)
        Temp_Avg += temp['Temp_Avg'].astype(float)
        Temp_Min += temp['Temp_Min'].astype(float)
        
        Dew_Max += temp['Dew_Max'].astype(float)
        Dew_Avg += temp['Dew_Avg'].astype(float)
        Dew_Min += temp['Dew_Min'].astype(float)
        
        Humid_Max += temp['Humid_Max'].astype(float)
        Humid_Avg += temp['Humid_Avg'].astype(float)
        Humid_Min += temp['Humid_Min'].astype(float)
        
        Wind_Max += temp['Wind_Max'].astype(float)
        Wind_Avg += temp['Wind_Avg'].astype(float)
        Wind_Min += temp['Wind_Min'].astype(float)
        
        Pressure_Avg += temp['Pressure_Avg'].astype(float)
        
        Precipitation_Total += temp['Precipitation_Total'].astype(float)
    final.append([row1,Temp_Max/days_required,Temp_Avg / days_required, Temp_Min / days_required,Dew_Min/ days_required,Dew_Avg/ days_required, Dew_Min/ days_required, Humid_Max/ days_required,Humid_Avg/ days_required, Humid_Min/ days_required, Wind_Max/ days_required, Wind_Avg/ days_required, Wind_Min/ days_required, Pressure_Max/ days_required, Pressure_Avg/ days_required, Pressure_Min/ days_required, Precipitation_Total/ days_required])
    
Temp_Max += int(temp['Temp_Max'])
        Temp_Avg += int(temp['Temp_Avg'])
        Temp_Min += int(temp['Temp_Min'])
        
        Dew_Max += int(temp['Dew_Max'])
        Dew_Avg += int(temp['Dew_Avg'])
        Dew_Min += int(temp['Dew_Min'])
        
        Humid_Max += int(temp['Humid_Max'])
        Humid_Avg += int(temp['Humid_Avg'])
        Humid_Min += int(temp['Humid_Min'])
        
        Wind_Max += int(temp['Wind_Max'])
        Wind_Avg += int(temp['Wind_Avg'])
        Wind_Min += int(temp['Wind_Min'])
        
        Pressure_Avg += int(temp['Pressure_Avg'])
        
        Precipitation_Total += int(temp['Precipitation_Total'])
'''