In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available under ../input.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In this notebook I perform a bivariate regression analysis on weather data. The dataset contains basic weather readings such as temperature, wind speed, pressure, and weather condition.  

The goal is to build a regression model to predict Temperature data from Humidity.

In [None]:
# Load the raw weather observations from the input directory.
data = pd.read_csv(filepath_or_buffer='../input/weatherHistory.csv')

### 1) Data Exploration
Let’s first explore the data and gain as much insight about the data before doing any analysis.
This dataset has 12 columns and 96,453 rows of data. As can be seen from the output of the following two cells, the dataset contains both numeric and categorical data.

In [None]:
# Preview the first five rows to see the column names and value formats.
data.head(n=5)

In [None]:
# (number of rows, number of columns) of the frame loaded from the CSV.
data.shape

Some columns, such as Loud Cover, contain no data and should be removed from the dataset; however, since I am only going to use the humidity and temperature data, I am going to ignore them for now.

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) as produced by
# pandas' default describe() call on the loaded frame.
data.describe()

The next step is to check for null cells; if any row contains a null value, we drop that row.

In [None]:
# Per-column flag: True when the column holds at least one missing value.
data.isnull().any(axis=0)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

### 2) Regression Modeling
In this section we are going to build a regression model that predicts Temperature from humidity data.
Before performing any modeling, it is good practice to check for collinearity and correlation between the features. This can be done with a heatmap of correlations, as demonstrated below.


In [None]:
# Work on a copy so the cleaned `data` frame stays untouched, and drop the
# two columns that are not used in the correlation analysis.
modeling_data = data.copy().drop(columns=['Daily Summary', 'Loud Cover'])

# Encode the two categorical text columns as integer codes so they can take
# part in the correlation matrix. One encoder per column keeps the
# code -> label mappings separate.
le = LabelEncoder()
le.fit(modeling_data['Summary'])
modeling_data['Summary'] = le.transform(modeling_data['Summary'])

le2 = LabelEncoder()
le2.fit(modeling_data['Precip Type'])
modeling_data['Precip Type'] = le2.transform(modeling_data['Precip Type'])

In [None]:
# Pairwise correlations between the numeric features. The frame still holds
# the text column 'Formatted Date' at this point (it is dropped later), and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping
# them, so restrict the input to numeric columns explicitly.
corr = modeling_data.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(0, 150, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Feature correlations')
plt.show()

Several observations can be inferred from correlation heatmap analysis:
* Temperature and Apparent Temperature carry very similar information, since the correlation between them is very high
* Pressure data has a very poor correlation with temperature data, and this may be due to the existence of rows with zero pressure readings
* Temperature and humidity are negatively correlated


In [None]:
# Remove the columns that are not needed for the regression, then keep only
# the rows whose humidity reading is strictly positive.
columns_to_drop = ['Apparent Temperature (C)', 'Formatted Date', 'Summary']
modeling_data = modeling_data.drop(columns=columns_to_drop)
positive_humidity = modeling_data['Humidity'] > 0
modeling_data = modeling_data.loc[positive_humidity]

Now we can attempt to build the regression model. Below, I split the data into training and test sets and fit a linear model from the sklearn package.

In [None]:
# Single predictor (humidity) and target (temperature); 33% of the rows are
# held out for testing, with a fixed seed so the split is reproducible.
humidity = modeling_data['Humidity']
temperature = modeling_data['Temperature (C)']
X_train, X_test, y_train, y_test = train_test_split(
    humidity,
    temperature,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Ordinary least-squares fit of temperature on humidity.
# The `normalize` argument was removed in scikit-learn 1.2 and passing it now
# raises a TypeError. It was set to False (no normalisation), so dropping it
# leaves the fitted model unchanged.
reg = linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None)

# scikit-learn expects a 2-D feature matrix, hence the reshape of X.
# The target is reshaped to a column vector as well, so `reg.predict` returns
# an (n, 1) array; the residual computation later in the notebook relies on
# that shape.
reg.fit(X_train.values.reshape(-1, 1), y_train.values.reshape(-1, 1))

# Slope of the fitted line (change in temperature per unit of humidity).
reg.coef_

In [None]:
# Score of the fitted model on the training split, as returned by reg.score.
in_sample_score = reg.score(X_train.values.reshape(-1, 1), y_train.values.reshape(-1, 1))
print(f'In sample regression score: {in_sample_score!s}')

In [None]:
# Score of the fitted model on the held-out test split, as returned by reg.score.
out_of_sample_score = reg.score(X_test.values.reshape(-1, 1), y_test.values.reshape(-1, 1))
print(f'Out of sample regression score: {out_of_sample_score!s}')

In [None]:
font = {'size': 20}
plt.rc('font', **font)
plt.figure(figsize=(13, 10))

# Scatter of all modelling rows (training and test rows together).
plt.plot(modeling_data['Humidity'], modeling_data['Temperature (C)'], 'o', label='Data')

# x-range for the regression line: from 5% below the smallest humidity to
# 10% above the largest, matching the padding used for the residual plot.
# The previous upper bound multiplied by 0.11 (a typo for 1.1) and relied on
# ceil() rounding it back up, which only works for data in the 0-1 range.
humidity_min = modeling_data['Humidity'].min()
humidity_max = modeling_data['Humidity'].max()
I = np.linspace(humidity_min * 0.95, humidity_max * 1.1, 50)
plt.plot(I, reg.predict(I.reshape(-1, 1)), color='r', linewidth=3, label='Regression Line')

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('Humidity')
plt.ylabel('Temperature (C)')

# Predictions for every row; kept in `Preds` because the residual analysis
# in the next cell reuses them. Note that this R2 is computed over the full
# modelling frame, i.e. training and test rows combined.
Preds = reg.predict(modeling_data['Humidity'].values.reshape(-1, 1))
R2 = r2_score(modeling_data['Temperature (C)'], Preds)
plt.title('R2: ' + str(np.round(R2, decimals=3)))
plt.show()

Looking at the regression result above, one should not expect the predictions to be highly accurate: the relationship between humidity and temperature is itself fuzzy, since temperature depends on more parameters than just humidity. In the following section of the code we can see the residuals of the regression.

In [None]:
# Residual = observed temperature - predicted temperature.
# `Preds` (computed in the previous cell) is a column vector, so the observed
# values are reshaped to a column as well to keep the subtraction row-wise.
Residuals = modeling_data['Temperature (C)'].values.reshape(-1, 1) - Preds

font = {'size': 20}
plt.rc('font', **font)
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: distribution of the residuals, normalised to a density.
num_bins = 50
n, bins, patches = ax[0].hist(Residuals, num_bins, density=1)
ax[0].set_title('Regression Residuals')
ax[0].set(xlabel='Residual Value', ylabel='Density')

# Right panel: residuals against the predictor. The x values are humidity
# readings, so the axis is labelled accordingly (it was previously labelled
# 'Fitted Value' although fitted values were never plotted).
ax[1].plot(modeling_data['Humidity'], Residuals, 'o')
ax[1].set(xlabel='Humidity', ylabel='Residual Value')

# Zero line for reference, drawn on top of the points.
ax[1].hlines(0, np.min(modeling_data['Humidity']) * 0.95, np.max(modeling_data['Humidity']) * 1.1,
             colors='r', linestyles='solid', zorder=10, linewidth=4)
plt.show()

In [None]:
!pip install plotly_express 

In [None]:
import plotly_express as px

In [None]:
# Load the iris sample dataset bundled with plotly_express.
# NOTE(review): this is not used by the weather regression above; it looks
# like a leftover plotting demo. Confirm whether it belongs in this notebook.
iris = px.data.iris()

In [None]:
# Interactive scatter plot of sepal width against sepal length for the iris sample.
px.scatter(
    data_frame=iris,
    x="sepal_width",
    y="sepal_length",
)