# Execute the code below

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather2019.csv"
df_weather = pd.read_csv(link)

In [14]:
display(df_weather.head())

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WINDTEMP_MAX_C,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,3,116,143,176,0,1,5.1,very bad,1,1
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,3,119,116,116,0,1,8.7,very bad,1,2
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,-4,116,116,116,0,1,8.7,very bad,1,3
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,-4,116,116,122,0,1,5.1,very bad,1,4
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,-2,143,116,116,0,1,8.7,very bad,1,5


# Scoring and metrics
Last time, you did a multivariate linear regression. But how can you be sure this multivariate linear regression is better than an univariate ? You have to measure it !


## First regression
Let's begin with a first linear regression : create a new column `'predict_from_sun'` whith the prediction of MAX temperature from the SUNHOUR variable.

In [15]:
# Your code here :
X = df_weather[["SUNHOUR"]]
y = df_weather["MAX_TEMPERATURE_C"]
model_from_sun = LinearRegression().fit(X, y)

print("coefficient :", model_from_sun.coef_)
print("intercept :", model_from_sun.intercept_)

print("By hand : ", model_from_sun.coef_ * 10 + model_from_sun.intercept_)
print("Scikit-Learn : ", model_from_sun.predict([[10]]))


coefficient : [1.28422116]
intercept : 4.8472953557195435
By hand :  [17.68950691]
Scikit-Learn :  [17.68950691]




In [16]:
df_weather["PREDICT_FROM_SUN"] = model_from_sun.predict(df_weather[["SUNHOUR"]])

In [17]:
display(df_weather.head())

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,PREDICT_FROM_SUN
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,116,143,176,0,1,5.1,very bad,1,1,11.396823
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,119,116,116,0,1,8.7,very bad,1,2,16.020019
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,116,116,0,1,8.7,very bad,1,3,16.020019
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,116,116,122,0,1,5.1,very bad,1,4,11.396823
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,143,116,116,0,1,8.7,very bad,1,5,16.020019


## R2 score
The best possible R2 score is '1', when our prediction predicts perfectly the reality. Let's see what is our R2 score :

In [18]:
# Change the name of the model if it's necessary
model_from_sun.score(X, y)

0.47654554059087306

## Let's continue with 2 others regressions
- Second regression : create a new column 'predict_from_min' whith the prediction of MAX temperature from the MIN temperature variable
- Third regression : create a new column 'predict_from_both' whith the prediction of MAX temperature from the both variables (MIN temperature and Sunhours)

In [19]:
# Your code here :

X = df_weather[["MIN_TEMPERATURE_C"]]
y = df_weather["MAX_TEMPERATURE_C"]
model_from_min = LinearRegression().fit(X, y)

print("coefficient :", model_from_min.coef_)
print("intercept :", model_from_min.intercept_)

print("By hand : ", model_from_min.coef_ * 10 + model_from_min.intercept_)
print("Scikit-Learn : ", model_from_min.predict([[10]]))

coefficient : [1.22274205]
intercept : 5.689031214578716
By hand :  [17.91645172]
Scikit-Learn :  [17.91645172]




In [20]:
df_weather["PREDICT_FROM_MIN"] = model_from_min.predict(df_weather[["MIN_TEMPERATURE_C"]])

In [21]:
display(df_weather.head())

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,PREDICT_FROM_SUN,PREDICT_FROM_MIN
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,143,176,0,1,5.1,very bad,1,1,11.396823,10.579999
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,116,116,0,1,8.7,very bad,1,2,16.020019,11.802741
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,116,0,1,8.7,very bad,1,3,16.020019,5.689031
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,116,122,0,1,5.1,very bad,1,4,11.396823,4.466289
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,116,116,0,1,8.7,very bad,1,5,16.020019,4.466289


In [25]:
X = df_weather[["MIN_TEMPERATURE_C", "SUNHOUR"]]
y = df_weather["MAX_TEMPERATURE_C"]
model_from_both = LinearRegression().fit(X, y)

print("coefficient :", model_from_both.coef_)
print("intercept :", model_from_both.intercept_)

print("By hand : ", model_from_both.coef_ * 10 + model_from_both.intercept_)
print("Scikit-Learn : ", model_from_both.predict([[10, 10]]))

coefficient : [0.98867383 0.66222402]
intercept : 1.64888373125968
By hand :  [11.535622    8.27112392]
Scikit-Learn :  [18.15786219]




In [28]:
df_weather["PREDICT_FROM_BOTH"] = model_from_both.predict(df_weather[["MIN_TEMPERATURE_C", "SUNHOUR"]])

In [29]:
display(df_weather.head())

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,PREDICT_FROM_SUN,PREDICT_FROM_MIN,PREDICT_FROM_BOTH
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,176,0,1,5.1,very bad,1,1,11.396823,10.579999,8.980922
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,116,0,1,8.7,very bad,1,2,16.020019,11.802741,12.353602
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,0,1,8.7,very bad,1,3,16.020019,5.689031,7.410233
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,122,0,1,5.1,very bad,1,4,11.396823,4.466289,4.037552
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,116,0,1,8.7,very bad,1,5,16.020019,4.466289,6.421559


## Calculate the R2 score of the 2 new predictions
Be careful : if you still use the same "X" name, you will overwrite it.

Which model has the best score ? Do you think it's logical ?

In [39]:
X = df_weather[["MIN_TEMPERATURE_C"]]
model_from_min.score(X,y)

0.7689396999057355

In [40]:
X = df_weather[["MIN_TEMPERATURE_C", "SUNHOUR"]]
model_from_both.score(X,y)

0.8674787980774968

the model_from_both has the best score
it seems logical because model can work on more data

# Train Test Split
One of biggest problems of Machine learning is : **overfitting**.



To be sure that machine didn't memorize the result, we use the Train Test Split methodology. We keep some data separate (often 25% of our initial dataset). Then we train our model on the 75% (the "Train set").
After, we can calculate a score on the "Test set".

Let's do that !

In [41]:
# Juste read and execute the code below
from sklearn.model_selection import train_test_split

X = df_weather[['SUNHOUR']]
y = df_weather['MAX_TEMPERATURE_C']

# Here, we split our 2 datasets (the variables "X" and the target "y") into 4 datasets X and y for the train set and X and y for the test set.
# We set the size of the train set to 75%. And the rest is for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = 0.75)
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is   :", len(X_train))
print("The length of the test dataset is    :", len(X_test))

# Here we train the model only on the train dataset.
newmodel = LinearRegression().fit(X_train, y_train)

# And now we compare both scores :
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))


The length of the initial dataset is : 365
The length of the train dataset is   : 273
The length of the test dataset is    : 92

Score for the Train dataset : 0.47243569075679914
Score for the Test dataset : 0.4749360350733982


## Both scores are very close, there is no overfitting, well done !

What happens if we don't randomize our dataset. Here, the model learns only on the 9 first months.

In [42]:
# Juste read and execute the code below
from sklearn.model_selection import train_test_split

X = df_weather[['MIN_TEMPERATURE_C']]
y = df_weather['MAX_TEMPERATURE_C']

# We set the size of the train set to 75%. And the rest is for the test set.
# We set the split NOT in random.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, shuffle = False)


# Here we train the model only on the train dataset.
newmodel = LinearRegression().fit(X_train, y_train)

# And now we compare both scores :
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))


Score for the Train dataset : 0.7875765302008688
Score for the Test dataset : 0.03610833322378593


## There is an overfitting !
Indeed, the model get a good score on the Train dataset, because he learned in winter / spring / summer datas. But he gets a bad score in Falls...

# Let's play !
Train a new model with all numeric variables (without your target of course) and try to have a better score than previously.

Remember to split randomly your dataset before training your model.

Display the Test score.

In [44]:
# Your code here :

X = df_weather[["MIN_TEMPERATURE_C", "WINDSPEED_MAX_KMH",
                "TEMPERATURE_MORNING_C", "TEMPERATURE_NOON_C",
                "TEMPERATURE_EVENING_C", "PRECIP_TOTAL_DAY_MM",
                "HUMIDITY_MAX_PERCENT", "VISIBILITY_AVG_KM",
                "TOTAL_SNOW_MM", "SUNHOUR"]]
y = df_weather["MAX_TEMPERATURE_C"]
my_new_model = LinearRegression().fit(X, y)

print("coefficient :", my_new_model.coef_)
print("intercept :", my_new_model.intercept_)

print("By hand : ", my_new_model.coef_ * 10 + my_new_model.intercept_)
print("Scikit-Learn : ", my_new_model.predict([[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]]))

coefficient : [ 0.39963091 -0.0311005  -0.23077146  0.45377035  0.37635714  0.00518889
 -0.03477436  0.02010889  0.          0.06365341]
intercept : 5.447976377388407
By hand :  [9.44428547 5.13697135 3.14026175 9.9856799  9.2115478  5.49986532
 5.1002328  5.64906528 5.44797638 6.0845105 ]
Scikit-Learn :  [15.66860916]




In [45]:
from sklearn.model_selection import train_test_split

X = df_weather[["MIN_TEMPERATURE_C", "WINDSPEED_MAX_KMH",
                "TEMPERATURE_MORNING_C", "TEMPERATURE_NOON_C",
                "TEMPERATURE_EVENING_C", "PRECIP_TOTAL_DAY_MM",
                "HUMIDITY_MAX_PERCENT", "VISIBILITY_AVG_KM",
                "TOTAL_SNOW_MM", "SUNHOUR"]]
y = df_weather['MAX_TEMPERATURE_C']

# We set the size of the train set to 75%. And the rest is for the test set.
# We set the split NOT in random.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, shuffle = False)


# Here we train the model only on the train dataset.
train_new_model = LinearRegression().fit(X_train, y_train)

# And now we compare both scores :
print("\nScore for the Train dataset :", train_new_model.score(X_train, y_train))
print("Score for the Test dataset :", train_new_model.score(X_test, y_test))


Score for the Train dataset : 0.9786680384616635
Score for the Test dataset : 0.855378694612688


Score is way better