In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![deslizamentos_rio.jpg](attachment:deslizamentos_rio.jpg)

# Summary

Hello everyone! Once again we are analysing a weather dataset and building a forecast from it,
one of my favorite kinds of data to work with. In this analysis we will visualize the
features and their relationships with each other, and at the end we will classify the
risk of rain for the next day and look at the conditions that cause it.
These data are not from the city of Rio de Janeiro; however, Rio is hit by heavy rains every year, where people have lost their lives in landslides and their belongings in constant flooding. Through machine learning we could help to predict such catastrophes around the world.
 

1. Import the Main Libraries;
2. Functions to resize plots;
3. Load the dataset;
4. Visualization of missing data;
5. Treatment of missing data using KNNImputer;
6. Delete the "Risk_MM" column;
7. Starting the analysis;
8. Binarizing the categorical features;
9. Encoding it;
10. Split our data;
11. Start with the algorithms;
12. Conclusions about the best Accuracy;
13. Final considerations.


# Let's get started!

* First, we import the libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

* Here I define some helper functions that give the plots that follow a more readable size.

In [None]:
def resizeplot(figsize=(12, 6)):
    """Open a new matplotlib figure so the next plot is drawn at a readable size.

    Parameters
    ----------
    figsize : tuple of (float, float), optional
        Width and height of the figure in inches. Defaults to ``(12, 6)``,
        the size that used to be hard-coded here.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure, which also becomes the current figure.
    """
    return plt.figure(figsize=figsize)

In [None]:
def resizecorr(figsize=(15, 7)):
    """Open a new, wider matplotlib figure intended for correlation plots.

    Parameters
    ----------
    figsize : tuple of (float, float), optional
        Width and height of the figure in inches. Defaults to ``(15, 7)``,
        the size that used to be hard-coded here.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure, which also becomes the current figure.
    """
    return plt.figure(figsize=figsize)

*  Load the data:

In [None]:
# Read the weather CSV from the Kaggle input directory into a DataFrame.
weather_csv_path = '/kaggle/input/weathercsv/weather.csv'
df = pd.read_csv(weather_csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

* Let's see if there are missing data.

In [None]:
# Count the missing values in each column.
df.isna().sum()

* We have some missing data. We will visualize them better in the graph.

In [None]:
# Heatmap of the boolean null mask: one cell per value, rows left unlabeled.
resizeplot()
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

* There are several ways to treat missing data, each with its own strengths and weaknesses. A common strategy is to fill the gaps with the column average, but that is usually not as close to the real value as a KNN-based estimate, which is a good solution for small datasets.

In [None]:
from sklearn.impute import KNNImputer

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Nearest-neighbour imputer for the numeric columns (library defaults spelled out).
imputer_int = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='uniform')

In [None]:
# KNNImputer needs other features to measure the distance between rows. Fitted on
# the 'Sunshine' column alone, a row with a missing value has no defined distance
# to any neighbour and the imputer falls back to the plain column mean.
# Fit on all numeric features instead, leaving out RISK_MM, which the notebook
# drops later on the dataset's own advice.
# NOTE: assumes no numeric column is entirely NaN (KNNImputer would drop it).
knn_features = df.select_dtypes(include='number').columns.drop('RISK_MM', errors='ignore')
knn_imputed = pd.DataFrame(
    imputer_int.fit_transform(df[knn_features]),
    columns=knn_features,
    index=df.index,
)
df['Sunshine'] = knn_imputed['Sunshine']

In [None]:
# Imputer for the categorical columns: replaces NaN with the most frequent value.
imputer_str = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [None]:
# Fill the gaps in WindGustDir using the categorical imputer.
gust_dir_filled = imputer_str.fit_transform(df[['WindGustDir']])
df['WindGustDir'] = gust_dir_filled

In [None]:
# KNNImputer needs other features to measure the distance between rows. Fitted on
# the 'WindGustSpeed' column alone, a row with a missing value has no defined
# distance to any neighbour and the imputer falls back to the plain column mean.
# Fit on all numeric features instead, leaving out RISK_MM, which the notebook
# drops later on the dataset's own advice.
# NOTE: assumes no numeric column is entirely NaN (KNNImputer would drop it).
knn_features = df.select_dtypes(include='number').columns.drop('RISK_MM', errors='ignore')
knn_imputed = pd.DataFrame(
    imputer_int.fit_transform(df[knn_features]),
    columns=knn_features,
    index=df.index,
)
df['WindGustSpeed'] = knn_imputed['WindGustSpeed']

In [None]:
# Fill the gaps in WindDir9am using the categorical imputer.
dir_9am_filled = imputer_str.fit_transform(df[['WindDir9am']])
df['WindDir9am'] = dir_9am_filled

In [None]:
# Fill the gaps in WindDir3pm using the categorical imputer.
dir_3pm_filled = imputer_str.fit_transform(df[['WindDir3pm']])
df['WindDir3pm'] = dir_3pm_filled

In [None]:
# KNNImputer needs other features to measure the distance between rows. Fitted on
# the 'WindSpeed9am' column alone, a row with a missing value has no defined
# distance to any neighbour and the imputer falls back to the plain column mean.
# Fit on all numeric features instead, leaving out RISK_MM, which the notebook
# drops later on the dataset's own advice.
# NOTE: assumes no numeric column is entirely NaN (KNNImputer would drop it).
knn_features = df.select_dtypes(include='number').columns.drop('RISK_MM', errors='ignore')
knn_imputed = pd.DataFrame(
    imputer_int.fit_transform(df[knn_features]),
    columns=knn_features,
    index=df.index,
)
df['WindSpeed9am'] = knn_imputed['WindSpeed9am']

In [None]:
# Redraw the null-mask heatmap to check the result of the imputation.
resizeplot()
remaining_nulls = df.isnull()
sns.heatmap(remaining_nulls, cmap='viridis', cbar=False, yticklabels=False)

# Clean!

* I know, I know: I could create a function and pass the features as parameters, but I prefer to do it by hand. According to the instructions in the dataset, it is advisable to exclude the RISK_MM column so that it does not interfere with future forecasts.

In [None]:
# Remove the RISK_MM column. Rebinding instead of dropping in place, together
# with errors='ignore', keeps this cell safe to re-run: the in-place version
# raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='RISK_MM', errors='ignore')

In [None]:
# Preview the first five rows after removing RISK_MM.
df.head(n=5)

* This is one of the parts that say a lot about our dataset. The correlation graph shows us the relationship between the features and how they can influence others.

In [None]:
# Correlation heatmap of the numeric features.
# At this point the wind directions and rain flags are still text columns.
# Recent pandas versions raise when corr() meets non-numeric data instead of
# silently skipping it, so the numeric columns are selected explicitly.
resizecorr()
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

* We see a good correlation between 'MinTemp' and 'Temp9am' which makes sense where the lowest temperatures are in the morning.

In [None]:
# Scatter of the daily minimum temperature against the 9am temperature.
sns.relplot(data=df, x='MinTemp', y='Temp9am', kind='scatter')

* I confess that I find beauty in a graph where two features relate to each other in an almost perfect way.
The situation is the same as above, but with the features 'MaxTemp' and 'Temp3pm', which also makes sense because the highest temperatures occur after 12:00h.

In [None]:
# Scatter of the daily maximum temperature against the 3pm temperature.
sns.relplot(data=df, x='MaxTemp', y='Temp3pm', kind='scatter')

* The graph below shows the relationship between Sunshine and Temp3pm, which would lead us to believe that it would be a very high correlation apparently, but as we can see, we do not have this correlation. Some factors can contribute to this, such as the amount of clouds in the afternoon where we do not have a high rate of solar illumination.

In [None]:
# Scatter of sunshine against the 3pm temperature.
sns.relplot(data=df, x='Sunshine', y='Temp3pm', kind='scatter')

* Below is a comparison of two features, the wind speed at two times, 9:00 am and 3:00 pm.

In [None]:
# Overlay the 9am and 3pm wind-speed distributions on the same axes.
resizeplot()
speed_ax = sns.distplot(df['WindSpeed9am'])
sns.distplot(df['WindSpeed3pm'], ax=speed_ax)

* Below, we have two features that are very complex to predict or to follow. The wind directions, in both plots, there are varied differences and amplitudes, which also brings us the idea that these two variables are influenced by factors such as the speed variation with the height, presence of obstacles in the surroundings, relief that can cause an effect of acceleration or deceleration in air flow, etc.

In [None]:
# Side-by-side counts of the wind direction at 9am and at 3pm.
# The column is passed through x=/data= keywords: on seaborn >= 0.12 the first
# positional argument is `data`, so a positionally passed Series is no longer
# interpreted as the x variable.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(x='WindDir9am', data=df, ax=ax[0])
sns.countplot(x='WindDir3pm', data=df, ax=ax[1])

* Well, technically the lower the relative humidity, the less water vapor in the air and the more “easily” the water will evaporate. In the extreme case, if the relative humidity is 100%, the same amount of water that evaporates from the water condenses in it, resulting in a net zero evaporation rate.

In [None]:
# Scatter of 3pm humidity against evaporation.
# x and y are keyword-only on seaborn >= 0.12; the positional form
# scatterplot(series_x, series_y) raises TypeError there.
resizeplot()
sns.scatterplot(x='Humidity3pm', y='Evaporation', data=df)

* In this distribution graph we observed very similar data, which leads us to believe that the temperatures acting on the pressure did not vary much between the hours.

In [None]:
# Two panels: pressure distribution at 9am (left) and at 3pm (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
pressure_9am = df['Pressure9am']
pressure_3pm = df['Pressure3pm']
sns.distplot(pressure_9am, ax=ax[0])
sns.distplot(pressure_3pm, ax=ax[1])

* This scatter plot shows that the relationship between pressure in the morning and in the afternoon is very similar, but we can see that in the afternoon (3pm) the points are more densely concentrated in certain ranges. That explains the distplot above.

In [None]:
# Left: 9am pressure vs. minimum temperature. Right (red): 3pm pressure vs. maximum temperature.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
sns.scatterplot(x='Pressure9am', y='MinTemp', data=df, ax=ax[0])
sns.scatterplot(x='Pressure3pm', y='MaxTemp', data=df, color='red', ax=ax[1])

* The graphs below are very similar, which leads us to consider that humidity and evaporation are similar at certain times, which leads to similar cloud formations.

In [None]:
# Two panels: cloud-cover distribution at 9am (left) and at 3pm (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
cloud_9am = df['Cloud9am']
cloud_3pm = df['Cloud3pm']
sns.distplot(cloud_9am, ax=ax[0])
sns.distplot(cloud_3pm, ax=ax[1])

* Here we take a closer look at the relationship between evaporation and cloud formation. We obviously have a higher rate in the morning due to relatively higher humidity but in the afternoon we have more uniform rates according to the time.

In [None]:
# Left: cloud cover vs. humidity at 9am. Right: the same pair at 3pm.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
sns.scatterplot(x='Cloud9am', y='Humidity9am', data=df, ax=ax[0])
sns.scatterplot(x='Cloud3pm', y='Humidity3pm', data=df, ax=ax[1])

* In this "groupby" we have important information about the possible conditions that may or may not cause rain conditions.

In [None]:
# Mean of each listed numeric feature, computed separately per RainToday value.
features_to_average = [
    'Rainfall', 'Sunshine', 'Evaporation', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]
df.groupby('RainToday')[features_to_average].mean()

* We will use LabelBinarizer to handle categorical data.

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Binarizer for the categorical columns (library defaults spelled out).
LB = LabelBinarizer(neg_label=0, pos_label=1)

In [None]:
# One-hot encode the three wind-direction columns.
# LabelBinarizer returns one indicator column per class, i.e. a
# (n_rows, n_classes) matrix. Assigning that matrix to a single DataFrame column
# either raises or keeps only the first indicator, depending on the pandas
# version, so each text column is replaced by its full set of indicator columns.
for wind_col in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    indicators = LB.fit_transform(df[wind_col])
    # With exactly two classes LabelBinarizer emits a single column (for the
    # second class); pick the labels that match the columns actually produced.
    if indicators.shape[1] == len(LB.classes_):
        labels = LB.classes_
    else:
        labels = LB.classes_[-1:]
    indicator_df = pd.DataFrame(
        indicators,
        columns=[wind_col + '_' + str(label) for label in labels],
        index=df.index,
    )
    df = pd.concat([df.drop(columns=wind_col), indicator_df], axis=1)

* Here we encode the labels of 'RainToday' and of our forecast target 'RainTomorrow' as integers.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder reused below to turn the RainToday / RainTomorrow labels into integers.
LE = LabelEncoder()

In [None]:
# Fit the encoder on RainToday, then replace the column with its integer codes.
rain_today_labels = df['RainToday']
df['RainToday'] = LE.fit(rain_today_labels).transform(rain_today_labels)

In [None]:
# Refit the encoder on RainTomorrow, then replace the column with its integer codes.
rain_tomorrow_labels = df['RainTomorrow']
df['RainTomorrow'] = LE.fit(rain_tomorrow_labels).transform(rain_tomorrow_labels)

* Let's see how the data is doing.

In [None]:
# Preview the first five rows after encoding.
df.head(n=5)

* We separate the training and test variables.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target. Target vector: RainTomorrow.
target_col = 'RainTomorrow'
X = df.drop(columns=target_col).to_numpy()
y = df[target_col].to_numpy()

* We will rescale our features to a common [0, 1] range (min-max scaling), as they are on very different scales.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler for the feature matrix (default output range spelled out).
minmax = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Split first, then fit the scaler on the training rows only.
# Fitting MinMaxScaler on the full matrix before the split lets the test rows
# influence the min/max used to scale the training data (data leakage), which
# makes the test scores optimistic. The test rows are transformed with the
# statistics learned from the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
X_train = minmax.fit_transform(X_train)
X_test = minmax.transform(X_test)

* Let's go to the predictor algorithms.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree classifier. A fixed random_state (the same seed used for the
# train/test split) makes the fitted tree, and therefore the reported scores,
# reproducible across runs.
dtree = DecisionTreeClassifier(random_state=101)

In [None]:
# Train the decision tree on the training split.
dtree.fit(X=X_train, y=y_train)

In [None]:
# Decision-tree predictions for the held-out test split.
previsor_dtree = dtree.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Per-class metrics, confusion matrix and overall accuracy for the decision tree.
print(classification_report(y_test, previsor_dtree))
print(confusion_matrix(y_test, previsor_dtree))
print('\n')
# Convert to percent BEFORE rounding: rounding first and then multiplying by 100
# can reintroduce floating-point noise (e.g. 89.30000000000001).
print('Acurácia:', np.round(accuracy_score(y_test, previsor_dtree) * 100, 1), '%')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees. A fixed random_state (the same seed used for the
# train/test split) makes the bootstrap samples and feature choices, and
# therefore the reported scores, reproducible across runs.
rfc = RandomForestClassifier(n_estimators=200, random_state=101)

In [None]:
# Train the random forest on the training split.
rfc.fit(X=X_train, y=y_train)

In [None]:
# Random-forest predictions for the held-out test split.
previsor_rfc = rfc.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Per-class metrics, confusion matrix and overall accuracy for the random forest.
print(classification_report(y_test, previsor_rfc))
print(confusion_matrix(y_test, previsor_rfc))
print('\n')
# Convert to percent BEFORE rounding: rounding first and then multiplying by 100
# can reintroduce floating-point noise (e.g. 89.30000000000001).
print('Acurácia:', np.round(accuracy_score(y_test, previsor_rfc) * 100, 1), '%')

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier (default RBF kernel spelled out).
model = SVC(kernel='rbf')

In [None]:
# Train the support-vector classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# SVC predictions for the held-out test split.
previsor_svc = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class metrics, confusion matrix and overall accuracy for the SVC.
print(classification_report(y_test, previsor_svc))
print(confusion_matrix(y_test, previsor_svc))
print('\n')
# Convert to percent BEFORE rounding: rounding first and then multiplying by 100
# can reintroduce floating-point noise (e.g. 89.30000000000001).
print('Acurácia:', np.round(accuracy_score(y_test, previsor_svc) * 100, 1), '%')

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier (default regularisation strength spelled out).
LR = LogisticRegression(C=1.0)

In [None]:
# Train the logistic regression on the training split.
LR.fit(X=X_train, y=y_train)

In [None]:
# Logistic-regression predictions for the held-out test split.
predict_LR = LR.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Per-class metrics, confusion matrix and overall accuracy for the logistic regression.
print(classification_report(y_test, predict_LR))
print(confusion_matrix(y_test, predict_LR))
print('\n')
# Convert to percent BEFORE rounding: rounding first and then multiplying by 100
# can reintroduce floating-point noise (e.g. 89.30000000000001).
print('Acurácia:', np.round(accuracy_score(y_test, predict_LR) * 100, 1), '%')

In [None]:
# Test-set accuracy of each model, as a percentage with one decimal place.
# Scaling to percent BEFORE rounding avoids the floating-point noise that
# round(x, 3) * 100 can reintroduce (e.g. 89.30000000000001).
Acuracia_LR = np.round(accuracy_score(y_test, predict_LR) * 100, 1)
Acuracia_svc = np.round(accuracy_score(y_test, previsor_svc) * 100, 1)
Acuracia_rfc = np.round(accuracy_score(y_test, previsor_rfc) * 100, 1)
Acuracia_dtree = np.round(accuracy_score(y_test, previsor_dtree) * 100, 1)


In [None]:
# Print the accuracy of the four models, one line per model.
accuracy_lines = (
    ('Acurácia_Decision_tree: ', Acuracia_dtree),
    ('Acurácia_Random_Forest: ', Acuracia_rfc),
    ('Acurácia_LR: ', Acuracia_LR),
    ('Acurácia_SVC: ', Acuracia_svc),
)
for label, value in accuracy_lines:
    print(label, value, '%')

# Logistic Regression gave us the best assessment, with an accuracy of 89.3%: with this information it correctly predicted whether or not it would rain the next day for 89.3% of the test samples.
# Well folks, I hope this analysis was useful, that better and sunny days arrive for all of us. If you liked it please do not forget to give an upvote and if not please comment so that I can learn more under your own analysis. Take care.