# Execute the code below

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# URL of the weather2019 CSV; it is fetched over HTTP when this cell runs.
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather2019.csv"
# Load the CSV into a DataFrame. Every later cell reads from df_weather, and the
# regression cells add their predict_* columns to it.
df_weather = pd.read_csv(link)

# Scoring and metrics
Last time, you did a multivariate linear regression. But how can you be sure this multivariate linear regression is better than a univariate one? You have to measure it!


## First regression
Let's begin with a first linear regression: create a new column `'predict_from_sun'` with the prediction of the MAX temperature from the SUNHOUR variable.

In [2]:
# First regression: predict the MAX temperature from the SUNHOUR variable.
X1 = df_weather[['SUNHOUR']]
y1 = df_weather['MAX_TEMPERATURE_C']
model_from_sun = LinearRegression().fit(X1, y1)

# predict() returns a NumPy array in the row order of X1, so it is assigned
# positionally. Wrapping it in pd.DataFrame would give it a fresh 0..n-1 index
# and make pandas align on index labels, silently producing NaN if df_weather
# ever has a non-default index (e.g. after filtering or set_index).
df_weather['predict_from_sun'] = model_from_sun.predict(X1)

## R2 score
The best possible R2 score is 1, reached when the predictions match reality perfectly. Let's see what our R2 score is:

In [3]:
# R2 of the SUNHOUR model, scored on the same data it was fitted on.
# Change the model name below if yours is different.
model_from_sun.score(X1, y1)

0.47654554059087306

## Let's continue with 2 other regressions
- Second regression: create a new column 'predict_from_min' with the prediction of the MAX temperature from the MIN temperature variable
- Third regression: create a new column 'predict_from_both' with the prediction of the MAX temperature from both variables (MIN temperature and Sunhours)

In [4]:
# Second regression: create a new column 'predict_from_min' with the prediction
# of the MAX temperature from the MIN temperature variable.
X2 = df_weather[['MIN_TEMPERATURE_C']]
y2 = df_weather['MAX_TEMPERATURE_C']
model_from_min = LinearRegression().fit(X2, y2)

# Assign the prediction array directly (positional, row order of X2). A
# pd.DataFrame wrapper would trigger index alignment and could yield NaN when
# df_weather does not have a default 0..n-1 index.
df_weather['predict_from_min'] = model_from_min.predict(X2)


In [5]:
""" Third regression : create a new column 'predict_from_both' whith the prediction of MAX temperature from the both variables (MIN temperature and Sunhours) """

# Two-feature regression: MIN temperature and sunshine hours together.
X3 = df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']]
y3 = df_weather['MAX_TEMPERATURE_C']
model_from_both = LinearRegression().fit(X3, y3)

# Assign the prediction array directly (positional, row order of X3). A
# pd.DataFrame wrapper would trigger index alignment and could yield NaN when
# df_weather does not have a default 0..n-1 index.
df_weather['predict_from_both'] = model_from_both.predict(X3)

## Calculate the R2 score of the 2 new predictions
Be careful : if you still use the same "X" name, you will overwrite it.

Which model has the best score ? Do you think it's logical ?

In [6]:
# R2 of each of the three models, scored on the same data it was fitted on.
# The first label previously read "Max", but that model predicts from SUNHOUR.
print('R2 from Sun:', model_from_sun.score(X1, y1))
print('R2 from Min:', model_from_min.score(X2, y2))
print('R2 from both:', model_from_both.score(X3, y3))

R2 from Max: 0.47654554059087306
R2 from Min: 0.7689396999057355
R2 from both: 0.867478798077497


# Train Test Split
One of the biggest problems in machine learning is **overfitting**.



To be sure that the machine didn't simply memorize the results, we use the Train Test Split methodology. We keep some data separate (often 25% of our initial dataset). Then we train our model on the remaining 75% (the "Train set").
Afterwards, we can calculate a score on the "Test set".

Let's do that !

In [7]:
# Just read and execute the code below

X = df_weather[['SUNHOUR']]
y = df_weather['MAX_TEMPERATURE_C']

# Here, we split our 2 datasets (the variables "X" and the target "y") into 4 datasets:
# X and y for the train set, and X and y for the test set.
# We set the size of the train set to 75%; the rest goes to the test set.
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.75)
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is   :", len(X_train))
print("The length of the test dataset is    :", len(X_test))

# Here we train the model only on the train dataset.
newmodel = LinearRegression().fit(X_train, y_train)

# And now we compare both scores: similar values mean the model behaves the
# same on data it has never seen.
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))


The length of the initial dataset is : 365
The length of the train dataset is   : 273
The length of the test dataset is    : 92

Score for the Train dataset : 0.47243569075679914
Score for the Test dataset : 0.4749360350733982


## Both scores are very close, there is no overfitting, well done !

What happens if we don't shuffle our dataset? Here, the model learns only from the first 9 months.

In [8]:
# Just read and execute the code below

X = df_weather[['MIN_TEMPERATURE_C']]
y = df_weather['MAX_TEMPERATURE_C']

# We set the size of the train set to 75%; the rest goes to the test set.
# shuffle=False disables the random shuffle: the first 75% of the rows become
# the train set and the last 25% the test set, in their original order.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, shuffle=False)


# Here we train the model only on the train dataset.
newmodel = LinearRegression().fit(X_train, y_train)

# And now we compare both scores :
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))


Score for the Train dataset : 0.7875765302008688
Score for the Test dataset : 0.03610833322378593


## There is an overfitting !
Indeed, the model gets a good score on the Train dataset because it learned from the winter / spring / summer data, but it gets a bad score on the autumn data, which it never saw during training.

# Let's play !
Train a new model with all numeric variables (without your target of course) and try to get a better score than before.

Remember to split your dataset randomly before training your model.

Display the Test score.

In [9]:
# List every column with its non-null count and dtype, to spot the numeric candidates.
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DATE                    365 non-null    object 
 1   MAX_TEMPERATURE_C       365 non-null    int64  
 2   MIN_TEMPERATURE_C       365 non-null    int64  
 3   WINDSPEED_MAX_KMH       365 non-null    int64  
 4   TEMPERATURE_MORNING_C   365 non-null    int64  
 5   TEMPERATURE_NOON_C      365 non-null    int64  
 6   TEMPERATURE_EVENING_C   365 non-null    int64  
 7   PRECIP_TOTAL_DAY_MM     365 non-null    float64
 8   HUMIDITY_MAX_PERCENT    365 non-null    int64  
 9   VISIBILITY_AVG_KM       365 non-null    float64
 10  PRESSURE_MAX_MB         365 non-null    int64  
 11  CLOUDCOVER_AVG_PERCENT  365 non-null    float64
 12  HEATINDEX_MAX_C         365 non-null    int64  
 13  DEWPOINT_MAX_C          365 non-null    int64  
 14  WINDTEMP_MAX_C          365 non-null    in

In [10]:
# Show the first row only: a quick look at the values, including the predict_* columns added above.
df_weather.head(1)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,predict_from_sun,predict_from_min,predict_from_both
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,176,0,1,5.1,very bad,1,1,11.396823,10.579999,8.980922


In [11]:
# Numeric feature columns. The target MAX_TEMPERATURE_C and the predict_*
# columns added by the earlier regressions are deliberately left out.
X = df_weather[[
    'MIN_TEMPERATURE_C', 'WINDSPEED_MAX_KMH',
    'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C',
    'PRECIP_TOTAL_DAY_MM', 'HUMIDITY_MAX_PERCENT', 'VISIBILITY_AVG_KM',
    'PRESSURE_MAX_MB', 'CLOUDCOVER_AVG_PERCENT',
    'HEATINDEX_MAX_C', 'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C',
    'WEATHER_CODE_MORNING', 'WEATHER_CODE_NOON', 'WEATHER_CODE_EVENING',
    'TOTAL_SNOW_MM', 'UV_INDEX', 'SUNHOUR',
    'MONTH', 'DAY',
]]
y = df_weather['MAX_TEMPERATURE_C']

# Seeded random split: 75% of the rows for training, the rest for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42
)

# Fit on the train rows only, then report R2 on both subsets.
lastmodel = LinearRegression()
lastmodel.fit(X_train, y_train)

print("Score for the Train dataset :", lastmodel.score(X_train, y_train))
print("Score for the Test dataset :", lastmodel.score(X_test, y_test))

Score for the Train dataset : 0.9933353831340122
Score for the Test dataset : 0.9953728575100915
