# Weather Dataset - Temperature Prediction

In [None]:
# Importing the necessary modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from statsmodels.graphics.gofplots import qqplot
import statistics 
import scipy
import math
import seaborn as sns
%matplotlib inline

In [None]:
# Importing dataset into data frame variable
df = pd.read_csv("WeatherHistoryDataset.csv")

#printing the first 5 rows of the dataset
df.head()

In [None]:
# Finding out the general information about the dataset
df.describe()

In [None]:
# Since the NaN values are represented as " " it has to be converted to NaN value so that we can clean the data efficiently
df = df.replace(" ", np.nan)

In [None]:
# Counting the number of NaN values through each column
df.isnull().sum()

# Exploratory Data Analysis:

### _Description_: 

- **Formatted Date**: Date and hour of the observation, in yyyy-mm-dd hr format (hour on a 24-hour clock).
- **Summary**: Summary of weather.
- **Precip Type**: Type of precipitation.
- **Temperature**: Temperature in degrees Centigrade.
- **Apparent Temperature ©**: Apparent temperature in degrees Centigrade.
- **Humidity**: Humidity at recorded time.
- **Wind Speed**: Wind speed in km/h.
- **Wind Bearing**: Wind Bearing in degrees.
- **Visibility**: Visibility in km.
- **Loud Cover**: No useful information that can be made out.
- **Pressure**: Pressure in millibars.
- **Daily Summary**: Present day's summary. 

In [None]:
# Keep only the rows in which both "Precip Type" and "Summary" are present

df = df.dropna(subset=["Precip Type", "Summary"])

In [None]:
# The 'Loud Cover' column holds no useful information, so it is discarded

df = df.drop(columns=['Loud Cover'])

In [None]:
# Encode the categorical columns "Precip Type" and "Summary" as integer codes so that
# they can be used for analysis.
#
# Each column gets its own LabelEncoder. Re-fitting one shared encoder on the second
# column overwrites what it learned from the first, so the "Precip Type" codes could
# no longer be mapped back to their labels with inverse_transform.

precip_type_encoder = preprocessing.LabelEncoder()
summary_encoder = preprocessing.LabelEncoder()

df["Precip Type"] = precip_type_encoder.fit_transform(df["Precip Type"])
df["Summary"] = summary_encoder.fit_transform(df["Summary"])

# `le` stays available and, as before, refers to the encoder fitted on "Summary".
le = summary_encoder

In [None]:
# Fill the NaN values of the columns at positions 4..9 with the respective column mean
# NOTE(review): the columns are selected by position, presumably the numeric measurement
# columns - verify against the actual column order.

numeric_block = df.iloc[:, 4:10].values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(numeric_block)
df.iloc[:, 4:10] = imputer.transform(numeric_block)

In [None]:
# Drop the rows whose "Formatted Date" occurs more than once.
# NOTE(review): keep=False discards *every* copy of a repeated date, not only the
# extra ones - confirm this is intended rather than keep="first".

df = df.drop_duplicates(subset="Formatted Date", keep=False)

## Removal of outliers

In [None]:
import seaborn as sns

# Visualizing the data in boxplots to get information on the outliers
sns.boxplot(x=df['Temperature (C)'])    

In [None]:
# Removal of outliers from 'Temperature (C)' with the 1.5 * IQR rule
Q1 = df['Temperature (C)'].quantile(0.25)   # first quartile
Q3 = df['Temperature (C)'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                               # IQR -> Interquartile Range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Temperature (C)'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Temperature (C)'])    

In [None]:
sns.boxplot(x=df['Apparent Temperature ©'])

In [None]:
# Removal of outliers from 'Apparent Temperature ©' with the 1.5 * IQR rule
Q1 = df['Apparent Temperature ©'].quantile(0.25)   # first quartile
Q3 = df['Apparent Temperature ©'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                                      # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Apparent Temperature ©'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Apparent Temperature ©'])

In [None]:
sns.boxplot(x=df['Humidity'])

In [None]:
# Removal of outliers from 'Humidity' with the 1.5 * IQR rule
Q1 = df['Humidity'].quantile(0.25)   # first quartile
Q3 = df['Humidity'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                        # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Humidity'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Humidity'])

In [None]:
sns.boxplot(x=df['Wind Speed (km/h)'])

In [None]:
# Removal of outliers from 'Wind Speed (km/h)' with the 1.5 * IQR rule
Q1 = df['Wind Speed (km/h)'].quantile(0.25)   # first quartile
Q3 = df['Wind Speed (km/h)'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                                 # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Wind Speed (km/h)'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Wind Speed (km/h)'])

In [None]:
sns.boxplot(x=df['Wind Bearing (degrees)'])

In [None]:
# Removal of outliers from 'Wind Bearing (degrees)' with the 1.5 * IQR rule
# NOTE(review): a bearing is an angle, so a linear IQR fence may not be meaningful
# for this column - confirm that the filter is wanted here.
Q1 = df['Wind Bearing (degrees)'].quantile(0.25)   # first quartile
Q3 = df['Wind Bearing (degrees)'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                                      # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Wind Bearing (degrees)'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Wind Bearing (degrees)'])

In [None]:
sns.boxplot(x=df['Visibility (km)'])

In [None]:
# Removal of outliers from 'Visibility (km)' with the 1.5 * IQR rule
Q1 = df['Visibility (km)'].quantile(0.25)   # first quartile
Q3 = df['Visibility (km)'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                               # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Visibility (km)'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]

In [None]:
sns.boxplot(x=df['Visibility (km)'])

In [None]:
df['Pressure (millibars)']=df['Pressure (millibars)'].astype(float)   #converting to float type

In [None]:
sns.boxplot(x=df['Pressure (millibars)'])

In [None]:
# Removal of outliers from 'Pressure (millibars)' with the 1.5 * IQR rule
Q1 = df['Pressure (millibars)'].quantile(0.25)   # first quartile
Q3 = df['Pressure (millibars)'].quantile(0.75)   # third quartile
IQR = Q3 - Q1                                    # interquartile range

# Keep the rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; between() is inclusive on both ends.
# The mask has its own name so that the builtin `filter` is not shadowed.
within_fences = df['Pressure (millibars)'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = df.loc[within_fences]


In [None]:
sns.boxplot(x=df['Pressure (millibars)'])

In [None]:
# outliers removed
df

# Graph Visualisation - BAR GRAPH

### 1. Temperature vs Date

In [None]:
plt.bar(x=df['Formatted Date'], height=df['Temperature (C)'])  

### 2. Humidity vs Date

In [None]:
plt.bar(x=df['Formatted Date'], height=df['Humidity'])

### 3. Visibility vs precip type

In [None]:
plt.bar(x=df['Precip Type'], height=df['Visibility (km)'])

### 4. Wind speed vs Summary

In [None]:
plt.bar(x=df['Summary'], height=df['Wind Speed (km/h)'])

### 5. Visibility vs Summary

In [None]:
plt.bar(x=df['Summary'], height=df['Visibility (km)'])

# Normalization and Standardization

In [None]:
# Calculating mean of all required columns
df[['Temperature (C)', 'Apparent Temperature ©', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)']].mean()

In [None]:
# Calculating variance of all required columns
df[['Temperature (C)','Apparent Temperature ©', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)']].var()

In [None]:
# Data before normalization

In [None]:
plt.hist(df['Temperature (C)'], 20)
plt.xlabel('Temperature (C)')
plt.show()

In [None]:
plt.hist(df['Apparent Temperature ©'], 20)
plt.xlabel('Apparent Temperature ©')
plt.show()

In [None]:
plt.hist(df['Humidity'], 20)
plt.xlabel('Humidity')
plt.show()

In [None]:
plt.hist(df['Pressure (millibars)'], 20)
plt.xlabel('Pressure (millibars)')
plt.show()

## Need for Normalization and its effects on the Dataset

    -> In the context of machine learning and data science, normalization (feature scaling) takes the numeric columns of a dataset and rescales their values to a common scale. 
    -> This is different from *database* normalization, whose main benefits are faster searching and sorting, because indexes are created on smaller, logical tables. For feature scaling, the main analytical benefit is that columns measured in very different units contribute comparably to a model. 
    -> Likewise, it is database normalization that, by having more tables, makes better use of segments to control the tangible placement of data, with fewer nulls and less redundant data. For feature scaling, the relevant effect is that bias and issues with anomalies are greatly reduced by removing the differences in scale. 
    -> In summary, data normalization processes ensure that our data is structured logically and scaled proportionally where required, generally on a scale of 0 to 1. It tends to be used where you have predefined assumptions of your model. 
    -> By ensuring you have normalized data, the likelihood of success in your machine learning and data science projects vastly improves. 
    -> It is vital that organizations invest as much in ensuring the quality of their data as they do in the analytical and scientific models that are created by it. Preparation is everything in a successful data strategy.

In [None]:
# Columns that are rescaled
cols = [
    'Temperature (C)',
    'Apparent Temperature ©',
    'Humidity',
    'Wind Speed (km/h)',
    'Wind Bearing (degrees)',
    'Visibility (km)',
    'Pressure (millibars)',
]

# Min-max scaler with its default settings
min_max_scaler = preprocessing.MinMaxScaler()

# x holds the values of the selected columns as floats
x = df[cols].values.astype(float)

# Fit the scaler on x and scale x in the same step
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array in a DataFrame that carries the selected column names
df_normalized = pd.DataFrame(x_scaled, columns=list(cols))
df_normalized

In [None]:
# Function to check the normalized mean (to zero - 0) and variance (to one - 1)

def norm(sample):
    """Standardize ``sample`` and print the rounded mean and variance of the result.

    The sample is converted to z-scores, ``(sample - mean) / std`` with the
    population standard deviation, and the mean and variance of those z-scores
    are printed rounded to the nearest integer. Because the statistics are taken
    on the standardized copy, they come out as 0 and 1 for any non-constant input.

    Parameters
    ----------
    sample : pandas.Series
        Column to check. Its ``name`` is used as the column label in the printed
        text, so the function no longer depends on a global loop variable ``col``.

    Returns
    -------
    None
        The results are only printed.
    """
    label = getattr(sample, "name", None)

    mean = sample.mean()
    std = np.std(sample)
    calc = (sample - mean) / std

    print(f"df_normalized[{label}].mean() = {round(calc.mean())}")
    # The line is labelled "var", so report the variance rather than the standard deviation.
    print(f"df_normalized[{label}].var() = {round(np.var(calc))}\n")

In [None]:
# Checking for the normalized mean (to zero - 0) and variance (to one - 1)

for col in cols:
    norm(df_normalized[col]);

In [None]:
# Data after normalization

In [None]:
_, bins, _ = plt.hist(df_normalized['Temperature (C)'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Temperature (C)'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Temperature (C)')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Temperature (C)'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Apparent Temperature ©'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Apparent Temperature ©'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Apparent Temperature ©')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Apparent Temperature ©'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Humidity'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Humidity'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Humidity')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Humidity'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Wind Speed (km/h)'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Wind Speed (km/h)'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Wind Speed (km/h)')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Wind Speed (km/h)'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Wind Bearing (degrees)'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Wind Bearing (degrees)'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Wind Bearing (degrees)')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Wind Bearing (degrees)'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Visibility (km)'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Visibility (km)'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Visibility (km)')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Visibility (km)'], line = 's')
plt.show()

In [None]:
_, bins, _ = plt.hist(df_normalized['Pressure (millibars)'], 20, density=1)
mu, sigma = scipy.stats.norm.fit(df_normalized['Pressure (millibars)'])
best_fit_line = scipy.stats.norm.pdf(bins, mu, sigma)
plt.xlabel('Pressure (millibars)')
plt.plot(bins, best_fit_line)

In [None]:
qqplot(df_normalized['Pressure (millibars)'], line = 's')
plt.show()

## Testing of Hypothesis:

### 1. For Column Temperature (C)

In [None]:
# One-sample z-test on the mean of "Temperature (C)"

# Find sample mean and sample standard deviation
Sample_data=df["Temperature (C)"]
number_of_values=len(Sample_data)   # Number of values in the column

sample_mean=statistics.mean(Sample_data)
sample_sd=statistics.stdev(Sample_data)

# Hypothesis
# Null Hypothesis
# H0: The mean Temperature is 12 degree C (mu=12)
H0="The mean Temparature is 12 degree C (mu=12)"

# Alternate Hypothesis
# H1: The mean Temperature is not equal to 12 degree C (mu != 12)
H1="The mean Temparature is not equal to 12 degree C (mu != 12)"

population_mean_from_hypothesis=12

# Significance level of the test. It is compared with the p-value that matches the
# number of tails, so it is the same 0.05 for a one tailed and a two tailed test.
alpha = 0.05

# Determining if the test is one tailed or two tailed
test="two_tailed_test"
if(test=="two_tailed_test"):
    number_of_tails=2
elif(test=="one_tailed_test"):
    number_of_tails=1

# Z score
z_score=(sample_mean-population_mean_from_hypothesis)/(sample_sd/np.sqrt(number_of_values))

# p Value
if number_of_tails == 2:
    # Two tailed: deviations in both directions count against H0. Using the upper tail
    # of the signed z alone gives p close to 1 for a negative z and can never reject H0.
    p_value = 2 * scipy.stats.norm.sf(abs(z_score))
else:
    # One tailed (upper tail)
    p_value = scipy.stats.norm.sf(z_score)

if p_value > alpha:
    print('Null hypothesis accepted')
    print('Accepted hypothesis is H0: ',H0)

else:
    print('Null hypothesis is rejected and alternate hypothesis is accepted')
    print('Accepted hypothesis is H1:',H1)

print('z_score=%.10f' % (z_score))
print('p_value=%.10f' % (p_value))
print('sample_mean=%.10f' % (sample_mean))
print('sample_sd=%.10f' % (sample_sd))
print('The test is '+str(number_of_tails)+' tailed test')


### 2. For Column Temperature (C)

In [None]:
# One-sample, lower-tailed z-test on the mean of "Temperature (C)"

# Find sample mean and sample standard deviation
Sample_data=df["Temperature (C)"]
number_of_values=len(Sample_data)   # Number of values in the column

sample_mean=statistics.mean(Sample_data)
sample_sd=statistics.stdev(Sample_data)

# Hypothesis
# Null Hypothesis
# H0: The mean Temperature is greater than or equal to 14 degree C (mu >= 14)
H0="The mean Temparature is greater than or equal to 14 degree C (mu>=14)"

# Alternate Hypothesis
# H1: The mean Temperature is less than 14 degree C (mu < 14)
H1="The mean Temparature is less than 14 degree C (mu < 14)"

population_mean_from_hypothesis=14

# Significance level of the test. It is compared with the p-value that matches the
# number of tails, so it is the same 0.05 for a one tailed and a two tailed test.
alpha = 0.05

# Determining if the test is one tailed or two tailed
test="one_tailed_test"
if(test=="two_tailed_test"):
    number_of_tails=2
elif(test=="one_tailed_test"):
    number_of_tails=1

# Z score
z_score=(sample_mean-population_mean_from_hypothesis)/(sample_sd/np.sqrt(number_of_values))

# p Value
if number_of_tails == 2:
    # Two tailed: deviations in both directions count against H0
    p_value = 2 * scipy.stats.norm.sf(abs(z_score))
else:
    # H1 is mu < 14, so the evidence against H0 lies in the LOWER tail: P(Z <= z).
    # The upper tail sf(z) would give p close to 1 exactly when the sample mean is
    # far below 14, i.e. when H0 should be rejected.
    p_value = scipy.stats.norm.cdf(z_score)

if p_value > alpha:
    print('Null hypothesis accepted')
    print('Accepted hypothesis is H0: ',H0)

else:
    print('Null hypothesis is rejected and alternate hypothesis is accepted')
    print('Accepted hypothesis is H1:',H1)

print('z_score=%.10f' % (z_score))
print('p_value=%.10f' % (p_value))
print('sample_mean=%.10f' % (sample_mean))
print('sample_sd=%.10f' % (sample_sd))
print('The test is '+str(number_of_tails)+' tailed test')


### 3. For Apparent Temperature ©

In [None]:
# One-sample z-test on the mean of "Apparent Temperature ©"

# Find sample mean and sample standard deviation
Sample_data=df["Apparent Temperature ©"]
number_of_values=len(Sample_data)   # Number of values in the column

sample_mean=statistics.mean(Sample_data)
sample_sd=statistics.stdev(Sample_data)

# Hypothesis
# Null Hypothesis
# H0: The mean Apparent Temperature © is 11.5 degree © (mu=11.5)
H0="The mean Apparent Temperature © is 11.5 degree © (mu=11.5)"

# Alternate Hypothesis
# H1: The mean Apparent Temperature © is not equal to 11.5 degree © (mu != 11.5)
H1="The mean Apparent Temperature © is not equal to 11.5 degree © (mu != 11.5)"

population_mean_from_hypothesis=11.5

# Significance level of the test. It is compared with the p-value that matches the
# number of tails, so it is the same 0.05 for a one tailed and a two tailed test.
alpha = 0.05

# Determining if the test is one tailed or two tailed
test="two_tailed_test"
if(test=="two_tailed_test"):
    number_of_tails=2
elif(test=="one_tailed_test"):
    number_of_tails=1

# Z score
z_score=(sample_mean-population_mean_from_hypothesis)/(sample_sd/np.sqrt(number_of_values))

# p Value
if number_of_tails == 2:
    # Two tailed: deviations in both directions count against H0. Using the upper tail
    # of the signed z alone gives p close to 1 for a negative z and can never reject H0.
    p_value = 2 * scipy.stats.norm.sf(abs(z_score))
else:
    # One tailed (upper tail)
    p_value = scipy.stats.norm.sf(z_score)

if p_value > alpha:
    print('Null hypothesis accepted')
    print('Accepted hypothesis is H0: ',H0)

else:
    print('Null hypothesis is rejected and alternate hypothesis is accepted')
    print('Accepted hypothesis is H1:',H1)

print('z_score=%.10f' % (z_score))
print('p_value=%.10f' % (p_value))
print('sample_mean=%.10f' % (sample_mean))
print('sample_sd=%.10f' % (sample_sd))
print('The test is '+str(number_of_tails)+' tailed test')


### 4. For Humidity

In [None]:
# One-sample z-test on the mean of "Humidity"

# Find sample mean and sample standard deviation
Sample_data=df["Humidity"]
number_of_values=len(Sample_data)   # Number of values in the column

sample_mean=statistics.mean(Sample_data)
sample_sd=statistics.stdev(Sample_data)

# Hypothesis
# Null Hypothesis
# H0: The mean Humidity is equal to 0.8  (mu=0.8)
# (the printed text used to say mu=11.5, a leftover from the previous test)
H0="The mean Humidity is equal to 0.8  (mu=0.8)"

# Alternate Hypothesis
# H1: The mean Humidity is not equal to 0.8  (mu != 0.8)
H1="The mean Humidity is not equal to 0.8  (mu != 0.8)"

population_mean_from_hypothesis=0.8

# Significance level of the test. It is compared with the p-value that matches the
# number of tails, so it is the same 0.05 for a one tailed and a two tailed test.
alpha = 0.05

# Determining if the test is one tailed or two tailed
test="two_tailed_test"
if(test=="two_tailed_test"):
    number_of_tails=2
elif(test=="one_tailed_test"):
    number_of_tails=1

# Z score
z_score=(sample_mean-population_mean_from_hypothesis)/(sample_sd/np.sqrt(number_of_values))

# p Value
if number_of_tails == 2:
    # Two tailed: deviations in both directions count against H0. Using the upper tail
    # of the signed z alone gives p close to 1 for a negative z and can never reject H0.
    p_value = 2 * scipy.stats.norm.sf(abs(z_score))
else:
    # One tailed (upper tail)
    p_value = scipy.stats.norm.sf(z_score)

if p_value > alpha:
    print('Null hypothesis accepted')
    print('Accepted hypothesis is H0: ',H0)

else:
    print('Null hypothesis is rejected and alternate hypothesis is accepted')
    print('Accepted hypothesis is H1:',H1)

print('z_score=%.10f' % (z_score))
print('p_value=%.10f' % (p_value))
print('sample_mean=%.10f' % (sample_mean))
print('sample_sd=%.10f' % (sample_sd))
print('The test is '+str(number_of_tails)+' tailed test')


# Correlation Test - Pearson Correlation

In [None]:
# Creating a function to find out if samples are correlated or not

def corr_test_Pearson(FirstSample, SecondSample, alpha=0.05, threshold=0.6):
    """Run a Pearson correlation test on two samples and print the conclusions.

    Three lines are printed: the correlation coefficient with its p-value, whether
    the samples are treated as dependent or independent, and whether they are
    positively, negatively or not correlated.

    Parameters
    ----------
    FirstSample, SecondSample : array-like
        The two samples, passed unchanged to ``scipy.stats.pearsonr``.
    alpha : float, optional
        Significance level for the p-value (default 0.05, the value that was
        previously hard-coded).
    threshold : float, optional
        Minimum absolute correlation coefficient for the samples to be reported
        as correlated (default 0.6, the value that was previously hard-coded).

    Returns
    -------
    None
        The results are only printed.
    """
    from scipy.stats import pearsonr

    correlation_const, p_value = pearsonr(FirstSample, SecondSample)

    # '%.5f' prints five decimals; the former '%5f' only set a minimum field width.
    print('correlation coefficient=%.5f, p=%.5f' % (correlation_const, p_value))

    # One significance decision is shared by both conclusions, so a p-value exactly
    # equal to alpha can no longer be "dependent" in one and ignored in the other.
    significant = p_value <= alpha

    # Conclusions on the data if they are dependent or independent using p-value
    if significant:
        print('dependent samples')
    else:
        print('independent samples')

    # Conclusions on correlation constant
    if significant and correlation_const > threshold:
        print("The given First and Second sample are positively correlated.")
    elif significant and correlation_const < -threshold:
        print("The given First and Second sample are negatively correlated.")
    else:
        print("The given First and Second sample are not correlated.")

### 1.Correlation test on Temperature (C) and Apparent Temperature ©

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Apparent Temperature ©'])

### 2.Correlation test on Temperature (C) and Humidity

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Humidity'])

### 3.Correlation test on Temperature (C) and Wind Speed (km/h)

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Wind Speed (km/h)'])

### 4.Correlation test on Temperature (C) and Wind Bearing (degrees)

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Wind Bearing (degrees)'])

### 5.Correlation test on Temperature (C) and Visibility (km)	

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Visibility (km)'])

### 6.Correlation test on Temperature (C) and Pressure (millibars)

In [None]:
# Performing the check for correlation 
corr_test_Pearson(df['Temperature (C)'],df['Pressure (millibars)'])

### 7.Correlation test on Humidity and Wind Speed (km/h)

In [None]:
# Performing the check for correlation between Humidity and Wind Speed (km/h).
# The call previously repeated the Temperature (C) vs Wind Speed (km/h) test.
corr_test_Pearson(df['Humidity'],df['Wind Speed (km/h)'])

### Correlation Graph

In [None]:
# Plotting the correlation graphs 

sns.pairplot(df)
plt.show()

## Feature Selection:

In [None]:
# Selecting the features that will help in prediction

features = ['Apparent Temperature ©','Humidity','Wind Speed (km/h)','Wind Bearing (degrees)','Visibility (km)','Pressure (millibars)']
X = df_normalized[features] 
y = df_normalized["Temperature (C)"]

## Linear Regression:

In [None]:
# split data into training set and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)

In [None]:
# Function to fit simple linear regression to training set 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def LinearReg(X_train, X_test, train_target=None, test_target=None):
    """Fit a linear regression on one feature and plot its predictions for the test set.

    The test points are drawn as a blue scatter plot and the model's predictions
    for the same inputs as a red line.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test inputs, passed to ``LinearRegression.fit`` and
        ``LinearRegression.predict``.
    train_target, test_target : array-like, optional
        Target values for the training and test inputs. When omitted, the
        notebook-level ``y_train`` and ``y_test`` are used, which is what the
        function always did before; passing them explicitly removes the hidden
        dependency on those globals.

    Returns
    -------
    None
        The function only draws the figure.
    """
    if train_target is None:
        train_target = y_train
    if test_target is None:
        test_target = y_test

    regressor = LinearRegression()
    regressor.fit(X_train, train_target)

    # Predicting the test set results
    y_pred = regressor.predict(X_test)

    # Plotting the graph showing the best fit predicted values
    plt.scatter(X_test, test_target, color = 'blue')
    plt.plot(X_test, y_pred, color = 'red')
    plt.show()

### 1. Performing Linear Regression on Apparent Temperature © as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Apparent Temperature ©'].values.reshape(-1,1),X_test['Apparent Temperature ©'].values.reshape(-1,1))

### 2. Performing Linear Regression on Humidity as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Humidity'].values.reshape(-1,1),X_test['Humidity'].values.reshape(-1,1))

### 3. Performing Linear Regression on Wind Speed (km/h) as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Wind Speed (km/h)'].values.reshape(-1,1),X_test['Wind Speed (km/h)'].values.reshape(-1,1))

### 4. Performing Linear Regression on Wind Bearing (degrees) as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Wind Bearing (degrees)'].values.reshape(-1,1),X_test['Wind Bearing (degrees)'].values.reshape(-1,1))

### 5. Performing Linear Regression on Visibility (km) as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Visibility (km)'].values.reshape(-1,1),X_test['Visibility (km)'].values.reshape(-1,1))

### 6. Performing Linear Regression on Pressure (millibars) as independent and Temperature as dependent variable

In [None]:
LinearReg(X_train['Pressure (millibars)'].values.reshape(-1,1),X_test['Pressure (millibars)'].values.reshape(-1,1))

## Performing Linear regression with all the features selected

In [None]:
# Fitting simple linear regression to training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the test set results
y_pred = regressor.predict(X_test)

Since X now contains all the selected features, the fit cannot be drawn as a single two-dimensional scatter plot, so we cannot check the best predicted values visually. Instead, we compute the root mean squared error (RMSE) of the predictions to get an idea of how good the prediction is when all the useful features are included.

In [None]:
# Finding the error in the prediction , lower the error better the accuracy

from sklearn.metrics import mean_squared_error
score = math.sqrt(mean_squared_error(y_test, y_pred))

score