In [None]:
#import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import sklearn
import warnings
warnings.filterwarnings("ignore")

In [None]:
#importing the dataset
df=pd.read_csv("temperature.csv")
df

In [None]:
print("There are",df.shape[0],"rows and",df.shape[1],"columns in the dataset.")

In [None]:
#Checking null values
df.isnull().sum()

In [None]:
df.isnull()

In [None]:
df.isnull().sum().sum()

So there are total 1248 null values in the dataset.

In [None]:
#visualizing the null values
plt.figure(figsize=(20,25))
sns.heatmap(df.isnull())
plt.show()

In [None]:
print("As it is a very large dataset having",df.shape[0],"rows, dropping",df.isnull().sum().sum(),"will not effect the data.")

In [None]:
print("Percentage of loss if we drop the null values =",(df.isnull().sum().sum())/df.shape[0]*100)

So we can consider dropping the NaN values.

In [None]:
df_new=df.dropna(inplace=True)

In [None]:
df.isnull().sum()

Now we can see there are no null values in the dataset.

In [None]:
df.shape

In [None]:
df.head()

In [None]:
#Checking for duplicate values
df.duplicated().sum()

# Description

In [None]:
df.info()

As we can see only object type data is Date columns, so if we change it as index column we can predict better.

In [None]:
df.index=df['Date']
df.drop('Date',axis=1,inplace=True)          #Dropped so that there will be no data duplicacy
df.head()

In [None]:
df.describe()

From the above chart we can say there are no null values in the dataset.

There is skewness present in the dataset.

There are outliers present in the dataset.

# Visualization

In [None]:
plt.figure(figsize=(20,25))
p=1
for i in df.columns:
    if p<=25:
        plt.subplot(5,5,p)
        sns.distplot(df[i],color='red')
        plt.xlabel(i,fontsize=15)
    p+=1
plt.tight_layout()

In [None]:
sns.scatterplot(data=df,x='Present_Tmax',y='Next_Tmax',palette='Setr_2')

In [None]:
sns.lmplot(data=df,y='LDAPS_PPT3',x='Next_Tmax',palette='pink')

In [None]:
#Outliers Detection
plt.figure(figsize=(20,25))
p=1
for i in df.columns:
    if p<=25:
        plt.subplot(5,5,p)
        sns.boxplot(df[i],color='green')
        plt.xlabel(i,fontsize=15)
    p+=1
plt.tight_layout()

we can see there are lots of outliers present in the dataset. As our target columns are 'Next_Tmax' and 'Next_Tmin',apart from these two columns we will remove outliers from other columns.

In [None]:
#Removing outliers
from scipy.stats import zscore
outliers=df.iloc[:,0:-2]
z=np.abs(zscore(outliers))
z

In [None]:
threshold=3
print(np.where(z>3))

In [None]:
df_new=df[(z<3).all(axis=1)]
df_new

Lets check data loss after removing outliers.

In [None]:
print("Percentage of data loss =",(df.shape[0]-df_new.shape[0])/df.shape[0]*100)

Data loss is 11%, which is acceptable as it is a big dataset. So lets continue with the new dataset.

In [None]:
df=df_new
df.head()

# Skewness

In [None]:
plt.figure(figsize=(20,25))
p=1
for i in df.columns:
    if p<=25:
        plt.subplot(5,5,p)
        sns.distplot(df[i],color='brown')
        plt.xlabel(i,fontsize=15)
    p+=1
plt.tight_layout()

From the above boxplot chart we can see that there were huge outliers present in the columns :LDAPS_PPT1, LDAPS_PPT2, LDAPS_PPT3, LDAPS_PPT4, and now also we can see that these four much are having extreme skewness, so we can conclude that there is no proper analysis done on these columns, so lets drop these 4 columns.

In [None]:
df.drop(['LDAPS_PPT1','LDAPS_PPT2','LDAPS_PPT3','LDAPS_PPT4'],axis=1,inplace=True)
df    

In [None]:
#Lets visualize again
plt.figure(figsize=(20,25))
p=1
for i in df.columns:
    if p<=20:
        plt.subplot(5,4,p)
        sns.distplot(df[i],color='m')
        plt.xlabel(i,fontsize=15)
    p+=1
plt.tight_layout()

In [None]:
#Removing the skewness
from scipy.stats import boxcox
skewed=df[['LDAPS_RHmax','LDAPS_WS','LDAPS_LH','LDAPS_CC1','LDAPS_CC2','LDAPS_CC3','LDAPS_CC4','lon','DEM','Slope',
          'Solar radiation']]
for i in skewed:
    df[i]=boxcox(df[i],0)
df.skew()

We can see some columns are having NaN values. Lets drop them and proceed further.

In [None]:
df.drop(['LDAPS_CC1','LDAPS_CC2','LDAPS_CC3','LDAPS_CC4'],axis=1,inplace=True)
df

In [None]:
#checking for any NaN value present in dataset
df.isnull().sum()

As we can see there is 1 null value present in the dataset. Lets remove it and proceed further

In [None]:
df.dropna(inplace=True)

In [None]:
df.corr()

In [None]:
#Visualizing Correlation
plt.figure (figsize=(20,25))
sns.heatmap(df.corr(),annot=True,cmap='Dark2',linewidth=0.3)
plt.show()

In [None]:
#Separating feature and target variable
x=df.iloc[:,0:-2]
x.head()

In [None]:
y1=df[['Next_Tmin']]
y1.head()

In [None]:
y2=df[['Next_Tmax']]
y2.head()

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
#Feature Scaling
st=StandardScaler()
xf=st.fit_transform(x)
xf

In [None]:
x=pd.DataFrame(data=xf,columns=x.columns)
x.head()

# Checking Multi-Collineraity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
vif=pd.DataFrame()
vif['VIF_values']=[variance_inflation_factor(x.values,i)for i in range(len(x.columns))]
vif['Features']=x.columns
vif

# Modelling

In [None]:
#Importing necessary library
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Modelling for Y1(Next_Tmin)

In [None]:
#creating train_test_split
x_train,x_test,y1_train,y1_test=train_test_split(x,y1,test_size=0.33,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y1_train.shape)
print(y1_test.shape)

In [None]:
lr=LinearRegression()
lr.fit(x_train,y1_train)
lr_pred=lr.predict(x_test)
print("R2 score =",r2_score(y1_test,lr_pred))
print("Mean Absolute Error =",mean_absolute_error(y1_test,lr_pred))
print("Mean Squared Error =",mean_squared_error(y1_test,lr_pred))

In [None]:
rfr=RandomForestRegressor()
rfr.fit(x_train,y1_train)
rfr_pred=rfr.predict(x_test)
print("R2 score =",r2_score(y1_test,rfr_pred))
print("Mean Absolute Error =",mean_absolute_error(y1_test,rfr_pred))
print("Mean Squared Error =",mean_squared_error(y1_test,rfr_pred))

In [None]:
dtr=DecisionTreeRegressor()
dtr.fit(x_train,y1_train)
dtr_pred=dtr.predict(x_test)
print("R2 score =",r2_score(y1_test,dtr_pred))
print("Mean Absolute Error =",mean_absolute_error(y1_test,dtr_pred))
print("Mean Squared Error =",mean_squared_error(y1_test,dtr_pred))

In [None]:
knr.fit(x_train,y1_train)
knr_pred=knr.predict(x_test)
print("R2 score =",r2_score(y1_test,knr_pred))
print("Mean Absolute Error =",mean_absolute_error(y1_test,knr_pred))
print("Mean Squared Error =",mean_squared_error(y1_test,knr_pred))

In [None]:
sv=SVR()
sv.fit(x_train,y1_train)
sv_pred=sv.predict(x_test)
print("R2 score =",r2_score(y1_test,sv_pred))
print("Mean Absolute Error =",mean_absolute_error(y1_test,sv_pred))
print("Mean Squared Error =",mean_squared_error(y1_test,sv_pred))

# Cross_Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
rfr_score=cross_val_score(rfr,x,y1)
print(rfr_score)
print(rfr_score.mean())
print("The difference between cv score and r2 score =",(r2_score(y1_test,rfr_pred))-(rfr_score.mean()))

In [None]:
dtr_score=cross_val_score(dtr,x,y1)
print(dtr_score)
print(dtr_score.mean())
print("The difference between cv score and r2 score =",(r2_score(y1_test,dtr_pred))-(dfr_score.mean()))

In [None]:
knr_score=cross_val_score(knr,x,y1)
print(knr_score)
print(knr_score.mean())
print("The difference between cv score and r2 score =",(r2_score(y1_test,knr_pred))-(knr_score.mean()))

# Hyper_Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
dtr=DecisionTreeRegressor()
parameters={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],'splitter':['best','random']}
grid=GridSearchCV(estimator=dtr,param_grid=parameters)
grid.fit(x,y1)
print("SCORE =",grid.best_score_)
print(grid.best_params_)

In [None]:
#Applying the hyper parameter tuning on the model
dtr=DecisionTreeRegressor(criterion= 'squared_error', splitter= 'best')
dtr.fit(x_train,y1_train)
pred=dtr.predict(x_test)
print("R2 SCORE =",r2_score(y1_test,pred))

In [None]:
rfr=RandomForestRegressor()
parameters={'n_estimators':[120,130,240],'criterion':['absolute_error','friedman_mse','poisson']}
grid=GridSearchCV(estimator=rfr,param_grid=parameters)
grid.fit(x,y1)
print("SCORE =",grid.best_score_)
print(grid.best_params_)

In [None]:
#Applying the hyper parameter tuning on the model
rfr=RandomForestRegressor(n_estimators=120,criterion='friedman_mse')
rfr.fit(x_train,y1_train)
pred=rfr.predict(x_test)
print("R2 score =",r2_score(y1_test,pred))

So we can see there is a increase in accuracy score after applying hyper tuning parameters.

In [None]:
final_model=rfr
final_model

# Saving the best model and predicting the saved model

In [None]:
import joblib
joblib.dump(final_model,"Minimum temp.pkl")

In [None]:
model=joblib.load("Minimum temp.pkl")
prediction=model.predict(x_test)
prediction

In [None]:
actual=np.array(y1_test)
actual.reshape(-1,1)
actual

# Modelling Y2(Next_Tmax)

In [None]:
#creating train_test_split
x_train,x_test,y2_train,y2_test=train_test_split(x,y2,test_size=0.33,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y2_train.shape)
print(y2_test.shape)

In [None]:
lr=LinearRegression()
lr.fit(x_train,y2_train)
lr_pred=lr.predict(x_test)
print("R2 score =",r2_score(y2_test,lr_pred))
print("Mean Absolute Error =",mean_absolute_error(y2_test,lr_pred))
print("Mean Squared Error =",mean_squared_error(y2_test,lr_pred))

In [None]:
rfr=RandomForestRegressor()
rfr.fit(x_train,y2_train)
rfr_pred=rfr.predict(x_test)
print("R2 score =",r2_score(y2_test,rfr_pred))
print("Mean Absolute Error =",mean_absolute_error(y2_test,rfr_pred))
print("Mean Squared Error =",mean_squared_error(y2_test,rfr_pred))

In [None]:
dtr=DecisionTreeRegressor()
dtr.fit(x_train,y2_train)
dtr_pred=dtr.predict(x_test)
print("R2 score =",r2_score(y2_test,dtr_pred))
print("Mean Absolute Error =",mean_absolute_error(y2_test,dtr_pred))
print("Mean Squared Error =",mean_squared_error(y2_test,dtr_pred))

# Cross_Validation

In [None]:
rfr_score=cross_val_score(rfr,x,y2)
print(rfr_score)
print(rfr_score.mean())
print("The difference between cv score and r2 score is =",(r2_score(y2_test,rfr_pred))-rfr_score.mean())

In [None]:
dtr_score=cross_val_score(dtr,x,y2)
print(dtr_score)
print(dtr_score.mean())
print("The difference between cv score and r2 score is =",(r2_score(y2_test,dtr_pred))-dtr_score.mean())

# Hyper_Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model=rfr
{'n_estimators':[120,130,240],'criterion':['absolute_error','friedman_mse','poisson']}
grid=GridSearchCV(estimator=rfr,param_grid=parameters)
grid.fit(x,y1)
print("SCORE =",grid.best_score_)
print(grid.best_params_)

In [96]:
#Applying the hyper parameter tuning on the model
rfr=RandomForestRegressor(n_estimators=120,criterion='friedman_mse')
rfr.fit(x_train,y1_train)
pred=rfr.predict(x_test)
print("R2 score =",r2_score(y1_test,pred))

R2 score = 0.8647305187090617


In [97]:
final_model=rfr
final_model

RandomForestRegressor(criterion='friedman_mse', n_estimators=120)

# Saving the best model and predicting saved model

In [99]:
import joblib
joblib.dump(final_model,"Maximum Temp.pkl")

['Maximum Temp.pkl']

In [101]:
model=joblib.load("Maximum Temp.pkl")
prediction=model.predict(x_test)
prediction

array([25.86      , 21.32583333, 22.44166667, ..., 24.0525    ,
       24.97666667, 23.77416667])

In [108]:
actual=np.array(y2_test)
actual.reshape(1,-1)

array([[35.1, 29.7, 34.9, ..., 32.7, 35.1, 27.4]])

We can see both are models are performing well.