# Linear Regression

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the weather-history dataset from its CSV file into a DataFrame.
data = pd.read_csv("data/weatherHistory.csv")

# Show the first 10 rows, followed by the dtype of every column.
print(data.head(10), data.dtypes, sep="\n")

Clean the data of any zero and NaN values

In [None]:
# Treat zero readings of humidity and pressure as missing (NaN), then drop
# every row that contains a missing value in any column.
data = (
    data
    .replace({'Humidity': 0, 'Pressure (millibars)': 0}, np.nan)
    .dropna()
)

In [None]:
# Pull individual measurement columns out of the cleaned frame.
temperature = data['Temperature (C)']
humidity = data['Humidity']
pressure = data['Pressure (millibars)']
wind_speed = data['Wind Speed (km/h)']
visibility = data['Visibility (km)']

# Scatter plot on a 10x10-inch figure: temperature on x, humidity on y.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=temperature, y=humidity, ax=ax)
plt.show()

Visualise Pressure vs Humidity

In [None]:
# Scatter plot on a 10x10-inch figure: humidity on x, pressure on y.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=humidity, y=pressure, ax=ax)
plt.show()

Visualise Pressure vs Temperature

In [None]:
# Scatter plot on a 10x10-inch figure: temperature on x, pressure on y.
# The x-axis must be temperature here; plotting humidity would simply repeat
# the pressure-vs-humidity chart.
plt.figure(figsize=(10,10))
ax = sns.scatterplot(x=temperature, y=pressure)
plt.show()

Visualise correlation between variables.

In [None]:
# Pairwise correlations between the numeric columns; numeric_only=True skips
# the non-numeric ones.
corr_all = data.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix, values shown to one decimal.
# seaborn is already imported as `sns` in the notebook's first code cell, so it
# is not imported again here.
snsplot = sns.heatmap(corr_all, annot=True, linewidths=0.5, fmt='.1f')

plt.show()

Split the data by month

In [None]:
# NOTE(review): `datetime` is not used by this cell; the import is kept in case
# other cells rely on it.
import datetime

# Parse the 'Formatted Date' strings with an explicit format (including the
# UTC offset, %z) and convert them to UTC.
data['Formatted Date'] = pd.to_datetime(
    data['Formatted Date'],
    utc=True,
    format='%Y-%m-%d %H:%M:%S.%f %z',
)

# Confirm the date column now has a datetime dtype. DataFrame.info() prints its
# report itself and returns None, so it is called directly; wrapping it in
# print() would add a stray "None" line to the output.
data.info()

In [None]:
# Spot-check the parsed dates by POSITION. Rows were removed by the earlier
# dropna(), so the index labels are not guaranteed to be contiguous and a label
# lookup such as data['Formatted Date'][10] can raise KeyError. `.iloc` is
# purely positional and does not depend on which labels survived.
formatted_date = data['Formatted Date']
print(formatted_date.iloc[10].month)

first_date = formatted_date.iloc[0]
print(first_date.day)
print(first_date.month)
print(first_date.year)

# Month number of every row, taken from the datetime column.
data['month'] = formatted_date.dt.month

# One DataFrame per calendar month (1 = January ... 12 = December).
data_jan = data[data['month'] == 1]
data_feb = data[data['month'] == 2]
data_mar = data[data['month'] == 3]
data_apr = data[data['month'] == 4]
data_may = data[data['month'] == 5]
data_jun = data[data['month'] == 6]
data_jul = data[data['month'] == 7]
data_aug = data[data['month'] == 8]
data_sep = data[data['month'] == 9]
data_oct = data[data['month'] == 10]
data_nov = data[data['month'] == 11]
data_dec = data[data['month'] == 12]

Use January for data analysis



In [None]:
# Correlation heatmap for the January subset only.
# 'month' has a single value within a one-month subset, so its correlation with
# every other column is undefined (NaN) and would show up as an empty row and
# column; drop it before correlating. errors='ignore' keeps the cell working if
# the column is absent.
corr_jan = data_jan.drop(columns='month', errors='ignore').corr(numeric_only=True)
snsplot = sns.heatmap(corr_jan, annot=True, linewidths=0.5, fmt='.1f')

plt.show()

Draw a scatter plot of the relationship between temperature and humidity

In [None]:
# Columns of interest from the September subset.
# NOTE(review): the preceding section heading says January is used for the
# analysis, but this cell reads from data_sep — confirm which month is intended.
temperature_sep = data_sep['Temperature (C)']
humidity_sep = data_sep['Humidity']
pressure_sep = data_sep['Pressure (millibars)']

# Scatter plot on a 10x10-inch figure: temperature on x, humidity on y.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=temperature_sep, y=humidity_sep, ax=ax)
plt.show()

Regression analysis using the scikit-learn library

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Set the features and target for our regression model.

In [None]:
# Predictor columns (currently a single feature) and the target column.
# NOTE(review): this selects from the full `data` frame rather than one of the
# per-month subsets — confirm that is intended.
features_available = ['Temperature (C)']
X = data[features_available]  # list selection keeps X as a DataFrame
y = data['Humidity']

Divide the data into training and test sets



In [None]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, the fitted coefficients and every metric computed from them are the
# same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Print out the shapes of the training and test sets

In [None]:
# Shapes of the four pieces of the split, one per line:
# X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

Create a linear regression object

In [None]:
# Instantiate scikit-learn's LinearRegression with its default parameters
# (not yet fitted).
linreg = LinearRegression()

Training of the model using the training dataset

In [None]:
# Fit the model on the training split only; the test split is kept aside for
# the prediction and evaluation cells.
linreg.fit(X_train,y_train) # train the linear regression model

Print out the intercept and gradient of the line

In [None]:
# Fitted parameters of the line: the intercept first, then the coefficient
# array (one slope per feature column).
print(linreg.intercept_, linreg.coef_, sep="\n")

Make some predictions

In [None]:
# Predict the target for the held-out test features using the fitted model.
y_pred = linreg.predict(X_test)

Comparing the actual and predicted value of humidity



In [None]:
# Side-by-side table of observed and predicted values for the test rows.
# The row index comes from y_test; y_pred is aligned to it by position.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

Draw a graph to show our dataset with the best fit line

In [None]:
# Test points in gray, with the model's predictions for the same x values
# drawn as a red line.
plt.scatter(x=X_test, y=y_test, color='gray')
plt.plot(X_test, y_pred, color='red', lw=2)
plt.show()

Evaluating our model

In [None]:
print("Linear Regression Score")

# NOTE(review): `predict` repeats the prediction already stored in y_pred and is
# not used by the metrics below; kept in case other cells reference it.
predict = linreg.predict(X_test)

# Report each error/goodness-of-fit metric for the test split, comparing the
# observed targets (y_test) with the predictions (y_pred).
for label, metric in (
    ("Mean Absolute Error: ", mean_absolute_error),
    ("Mean Squared Error: ", mean_squared_error),
    ("R2: ", r2_score),
):
    print(label, metric(y_test, y_pred))